natural-pdf 0.1.11__py3-none-any.whl → 0.1.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. natural_pdf/__init__.py +7 -2
  2. natural_pdf/analyzers/shape_detection_mixin.py +1092 -0
  3. natural_pdf/analyzers/text_options.py +9 -1
  4. natural_pdf/analyzers/text_structure.py +371 -58
  5. natural_pdf/classification/manager.py +3 -4
  6. natural_pdf/collections/pdf_collection.py +19 -39
  7. natural_pdf/core/element_manager.py +11 -1
  8. natural_pdf/core/highlighting_service.py +146 -75
  9. natural_pdf/core/page.py +287 -188
  10. natural_pdf/core/pdf.py +57 -42
  11. natural_pdf/elements/base.py +51 -0
  12. natural_pdf/elements/collections.py +362 -67
  13. natural_pdf/elements/line.py +5 -0
  14. natural_pdf/elements/region.py +396 -23
  15. natural_pdf/exporters/data/__init__.py +0 -0
  16. natural_pdf/exporters/data/pdf.ttf +0 -0
  17. natural_pdf/exporters/data/sRGB.icc +0 -0
  18. natural_pdf/exporters/hocr.py +40 -61
  19. natural_pdf/exporters/hocr_font.py +7 -13
  20. natural_pdf/exporters/original_pdf.py +10 -13
  21. natural_pdf/exporters/paddleocr.py +51 -11
  22. natural_pdf/exporters/searchable_pdf.py +0 -10
  23. natural_pdf/flows/__init__.py +12 -0
  24. natural_pdf/flows/collections.py +533 -0
  25. natural_pdf/flows/element.py +382 -0
  26. natural_pdf/flows/flow.py +216 -0
  27. natural_pdf/flows/region.py +458 -0
  28. natural_pdf/search/__init__.py +65 -52
  29. natural_pdf/search/lancedb_search_service.py +325 -0
  30. natural_pdf/search/numpy_search_service.py +255 -0
  31. natural_pdf/search/searchable_mixin.py +25 -71
  32. natural_pdf/selectors/parser.py +163 -8
  33. natural_pdf/widgets/viewer.py +22 -31
  34. {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/METADATA +55 -49
  35. {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/RECORD +38 -30
  36. {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/WHEEL +1 -1
  37. natural_pdf/search/haystack_search_service.py +0 -687
  38. natural_pdf/search/haystack_utils.py +0 -474
  39. natural_pdf/utils/tqdm_utils.py +0 -51
  40. {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/licenses/LICENSE +0 -0
  41. {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/top_level.txt +0 -0
@@ -28,6 +28,11 @@ class LineElement(Element):
28
28
  """
29
29
  super().__init__(obj, page)
30
30
 
31
@property
def source(self) -> Optional[str]:
    """
    Origin tag recorded for this line element.

    Returns:
        The value stored under the "source" key of the underlying object
        (e.g., 'pdf', 'detected'), or None when that key is absent.
    """
    origin = self._obj.get("source")
    return origin
35
+
31
36
  @property
32
37
  def type(self) -> str:
33
38
  """Element type."""
@@ -13,6 +13,7 @@ from natural_pdf.classification.manager import ClassificationManager # Keep for
13
13
  from natural_pdf.classification.mixin import ClassificationMixin
14
14
  from natural_pdf.elements.base import DirectionalMixin
15
15
  from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
16
+ from natural_pdf.elements.text import TextElement # ADDED IMPORT
16
17
  from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements # Import utility
17
18
  from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
18
19
  from natural_pdf.utils.locks import pdf_render_lock # Import the lock
@@ -20,11 +21,12 @@ from natural_pdf.utils.locks import pdf_render_lock # Import the lock
20
21
  # Import new utils
21
22
  from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
22
23
 
23
- # --- NEW: Import tqdm utility --- #
24
- from natural_pdf.utils.tqdm_utils import get_tqdm
25
-
24
+ from tqdm.auto import tqdm
26
25
  # --- End Classification Imports --- #
27
26
 
27
+ # --- Shape Detection Mixin --- #
28
+ from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
29
+ # --- End Shape Detection Mixin --- #
28
30
 
29
31
  if TYPE_CHECKING:
30
32
  # --- NEW: Add Image type hint for classification --- #
@@ -33,6 +35,7 @@ if TYPE_CHECKING:
33
35
  from natural_pdf.core.page import Page
34
36
  from natural_pdf.elements.collections import ElementCollection
35
37
  from natural_pdf.elements.text import TextElement
38
+ from natural_pdf.elements.base import Element # Added for type hint
36
39
 
37
40
  # Import OCRManager conditionally to avoid circular imports
38
41
  try:
@@ -44,7 +47,7 @@ except ImportError:
44
47
  logger = logging.getLogger(__name__)
45
48
 
46
49
 
47
- class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
50
+ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
48
51
  """
49
52
  Represents a rectangular region on a page.
50
53
  """
@@ -720,14 +723,36 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
720
723
  Returns:
721
724
  PIL Image of just this region
722
725
  """
726
+ # Handle the case where user wants the cropped region to have a specific width
727
+ page_kwargs = kwargs.copy()
728
+ effective_resolution = resolution # Start with the provided resolution
729
+
730
+ if crop_only and 'width' in kwargs:
731
+ target_width = kwargs['width']
732
+ # Calculate what resolution is needed to make the region crop have target_width
733
+ region_width_points = self.width # Region width in PDF points
734
+
735
+ if region_width_points > 0:
736
+ # Calculate scale needed: target_width / region_width_points
737
+ required_scale = target_width / region_width_points
738
+ # Convert scale to resolution: scale * 72 DPI
739
+ effective_resolution = required_scale * 72.0
740
+ page_kwargs.pop('width') # Remove width parameter to avoid conflicts
741
+ logger.debug(f"Region {self.bbox}: Calculated required resolution {effective_resolution:.1f} DPI for region crop width {target_width}")
742
+ else:
743
+ logger.warning(f"Region {self.bbox}: Invalid region width {region_width_points}, using original resolution")
744
+
723
745
  # First get the full page image with highlights if requested
724
746
  page_image = self._page.to_image(
725
- scale=scale, resolution=resolution, include_highlights=include_highlights, **kwargs
747
+ scale=scale, resolution=effective_resolution, include_highlights=include_highlights, **page_kwargs
726
748
  )
727
749
 
728
- # Calculate the crop coordinates - apply resolution scaling factor
729
- # PDF coordinates are in points (1/72 inch), but image is scaled by resolution
730
- scale_factor = resolution / 72.0 # Scale based on DPI
750
+ # Calculate the actual scale factor used by the page image
751
+ if page_image.width > 0 and self._page.width > 0:
752
+ scale_factor = page_image.width / self._page.width
753
+ else:
754
+ # Fallback to resolution-based calculation if dimensions are invalid
755
+ scale_factor = resolution / 72.0
731
756
 
732
757
  # Apply scaling to the coordinates
733
758
  x0 = int(self.x0 * scale_factor)
@@ -772,6 +797,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
772
797
  # Add a default color for standalone show
773
798
  color: Optional[Union[Tuple, str]] = "blue",
774
799
  label: Optional[str] = None,
800
+ width: Optional[int] = None, # Add width parameter
775
801
  ) -> "Image.Image":
776
802
  """
777
803
  Show the page with just this region highlighted temporarily.
@@ -782,6 +808,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
782
808
  legend_position: Position of the legend
783
809
  color: Color to highlight this region (default: blue)
784
810
  label: Optional label for this region in the legend
811
+ width: Optional width for the output image in pixels
785
812
 
786
813
  Returns:
787
814
  PIL Image of the page with only this region highlighted
@@ -812,6 +839,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
812
839
  page_index=self._page.index,
813
840
  temporary_highlights=[temp_highlight_data],
814
841
  scale=scale,
842
+ width=width, # Pass the width parameter
815
843
  labels=labels,
816
844
  legend_position=legend_position,
817
845
  )
@@ -871,6 +899,233 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
871
899
  image.save(filename)
872
900
  return self
873
901
 
902
def trim(self, padding: int = 1, threshold: float = 0.95, resolution: float = 150, pre_shrink: float = 0.5) -> "Region":
    """
    Shrink this region so that blank margins around its visible content are removed.

    The region is rendered to a grayscale image and every pixel row and pixel
    column is reduced to its mean brightness (0.0 = black, 1.0 = white). A row
    or column counts as content when that mean is below ``threshold``; the
    outermost content rows/columns define the trimmed box, which is then
    converted back to PDF coordinates.

    Args:
        padding: Extra pixel rows/columns (in the rendered image) kept around
            the detected content on every side (default: 1).
        threshold: Mean-brightness cutoff in the 0.0-1.0 range (default: 0.95).
            Rows/columns whose mean brightness is at or above this value are
            treated as whitespace.
        resolution: DPI used to render the region for analysis (default: 150).
        pre_shrink: Amount the region is shrunk on every side (via ``expand``
            with negative values) before analysis, and expanded back by
            afterwards (default: 0.5). Only applied when greater than zero;
            intended to keep box borders at the region edge from being
            detected as content.

    Returns:
        A new Region tagged with source "trimmed" whose ``parent_region`` is
        this region. This region itself is returned when no image could be
        rendered, the image has a zero dimension, or trimming yields an
        invalid box. A zero-size Region at this region's center is returned
        when no content rows or columns are detected.

    Example:
        # Defaults: 1 pixel of padding, 0.5 pre-shrink
        trimmed = region.trim()

        # Tighter result: no padding, lower cutoff, no pre-shrink
        tight = region.trim(padding=0, threshold=0.9, pre_shrink=0)

        # Looser result: more padding, higher cutoff
        loose = region.trim(padding=3, threshold=0.98)
    """

    def _center_point() -> "Region":
        # Degenerate (zero-area) region located at the middle of the original region.
        mid_x = (self.x0 + self.x1) / 2
        mid_y = (self.top + self.bottom) / 2
        return Region(self.page, (mid_x, mid_y, mid_x, mid_y))

    # Optionally pull every edge inwards first so border slivers are not analysed.
    if pre_shrink > 0:
        work_region = self.expand(left=-pre_shrink, right=-pre_shrink, top=-pre_shrink, bottom=-pre_shrink)
    else:
        work_region = self

    image = work_region.to_image(resolution=resolution, crop_only=True, include_highlights=False)
    if image is None:
        logger.warning(f"Region {self.bbox}: Could not generate image for trimming. Returning original region.")
        return self

    import numpy as np

    # Grayscale pixel matrix, indexed as [row, column].
    gray = np.array(image.convert('L'))
    height, width = gray.shape
    if height == 0 or width == 0:
        logger.warning(f"Region {self.bbox}: Image has zero dimensions. Returning original region.")
        return self

    # Brightness on a 0-1 scale (1.0 = white).
    brightness = gray.astype(np.float32) / 255.0

    # Vertical extent: rows whose mean brightness falls below the cutoff hold content.
    row_hits = np.flatnonzero(brightness.mean(axis=1) < threshold)
    if len(row_hits) == 0:
        logger.warning(f"Region {self.bbox}: No content detected during trimming. Returning center point.")
        return _center_point()
    first_row = max(0, row_hits[0] - padding)
    last_row = min(height - 1, row_hits[-1] + padding)

    # Horizontal extent: same test applied per column.
    col_hits = np.flatnonzero(brightness.mean(axis=0) < threshold)
    if len(col_hits) == 0:
        logger.warning(f"Region {self.bbox}: No column content detected during trimming. Returning center point.")
        return _center_point()
    first_col = max(0, col_hits[0] - padding)
    last_col = min(width - 1, col_hits[-1] + padding)

    # Pixels -> PDF points, assuming the image was rendered at `resolution` DPI.
    pixels_per_point = resolution / 72.0
    origin_x = work_region.x0
    origin_y = work_region.top

    # The far edges use index + 1 so the last content pixel is included.
    # float() turns numpy scalars into plain Python floats.
    raw_x0 = float(origin_x + (first_col / pixels_per_point))
    raw_top = float(origin_y + (first_row / pixels_per_point))
    raw_x1 = float(origin_x + ((last_col + 1) / pixels_per_point))
    raw_bottom = float(origin_y + ((last_row + 1) / pixels_per_point))

    # Never extend past the (possibly pre-shrunk) region that was analysed.
    final_x0 = max(work_region.x0, raw_x0)
    final_top = max(work_region.top, raw_top)
    final_x1 = min(work_region.x1, raw_x1)
    final_bottom = min(work_region.bottom, raw_bottom)

    if final_x1 <= final_x0 or final_bottom <= final_top:
        logger.warning(f"Region {self.bbox}: Trimming resulted in invalid dimensions. Returning original region.")
        return self

    result = Region(self.page, (final_x0, final_top, final_x1, final_bottom))

    # Undo the pre-shrink so the result sits in the original region's frame.
    if pre_shrink > 0:
        result = result.expand(left=pre_shrink, right=pre_shrink, top=pre_shrink, bottom=pre_shrink)

    # Carry descriptive metadata over from the original region.
    for attr_name in ("region_type", "normalized_type", "confidence", "model", "name", "label"):
        setattr(result, attr_name, getattr(self, attr_name))
    result.source = "trimmed"  # Marks the region as derived
    result.parent_region = self

    logger.debug(f"Region {self.bbox}: Trimmed to {result.bbox} (padding={padding}, threshold={threshold}, pre_shrink={pre_shrink})")
    return result
1027
+
1028
def clip(
    self,
    obj: Optional[Any] = None,
    left: Optional[float] = None,
    top: Optional[float] = None,
    right: Optional[float] = None,
    bottom: Optional[float] = None,
) -> "Region":
    """
    Clip this region to specific bounds, either from another object with bbox or explicit coordinates.

    The clipped region will be constrained to not exceed the specified boundaries.
    You can provide either an object with bounding box properties, specific coordinates, or both.
    All constraints are combined by intersection: each edge of the result is the
    most restrictive of this region's edge, the object's edge and the explicit
    coordinate. An explicit coordinate can therefore tighten, but never loosen,
    a bound that comes from ``obj``.

    Args:
        obj: Optional object with bbox properties (Region, Element, TextElement, etc.)
        left: Optional left boundary (x0) to clip to
        top: Optional top boundary to clip to
        right: Optional right boundary (x1) to clip to
        bottom: Optional bottom boundary to clip to

    Returns:
        New Region with bounds clipped to the specified constraints, tagged
        with source "clipped" and with ``parent_region`` set to this region.
        If the constraints leave no area, a zero-size Region located at the
        top-left corner of the computed clip box is returned instead.

    Examples:
        # Clip to another region's bounds
        clipped = region.clip(container_region)

        # Clip to any element's bounds
        clipped = region.clip(text_element)

        # Clip to specific coordinates
        clipped = region.clip(left=100, right=400)

        # Combine object bounds with an additional explicit limit
        clipped = region.clip(obj=container, bottom=page.height/2)
    """
    from natural_pdf.elements.base import extract_bbox

    # Start with current region bounds
    clip_x0 = self.x0
    clip_top = self.top
    clip_x1 = self.x1
    clip_bottom = self.bottom

    # Apply object constraints if provided
    if obj is not None:
        obj_bbox = extract_bbox(obj)
        if obj_bbox is not None:
            obj_x0, obj_top, obj_x1, obj_bottom = obj_bbox
            # Constrain to the intersection with the provided object
            clip_x0 = max(clip_x0, obj_x0)
            clip_top = max(clip_top, obj_top)
            clip_x1 = min(clip_x1, obj_x1)
            clip_bottom = min(clip_bottom, obj_bottom)
        else:
            logger.warning(
                f"Region {self.bbox}: Cannot extract bbox from clipping object {type(obj)}. "
                "Object must have bbox property or x0/top/x1/bottom attributes."
            )

    # Apply explicit coordinate constraints; max/min means they can only
    # shrink the box further, never grow it.
    if left is not None:
        clip_x0 = max(clip_x0, left)
    if top is not None:
        clip_top = max(clip_top, top)
    if right is not None:
        clip_x1 = min(clip_x1, right)
    if bottom is not None:
        clip_bottom = min(clip_bottom, bottom)

    # Ensure valid coordinates
    if clip_x1 <= clip_x0 or clip_bottom <= clip_top:
        logger.warning(
            f"Region {self.bbox}: Clipping resulted in invalid dimensions "
            f"({clip_x0}, {clip_top}, {clip_x1}, {clip_bottom}). Returning minimal region."
        )
        # Return a minimal region at the clip area's top-left
        return Region(self.page, (clip_x0, clip_top, clip_x0, clip_top))

    # Create the clipped region
    clipped_region = Region(self.page, (clip_x0, clip_top, clip_x1, clip_bottom))

    # Copy relevant metadata
    clipped_region.region_type = self.region_type
    clipped_region.normalized_type = self.normalized_type
    clipped_region.confidence = self.confidence
    clipped_region.model = self.model
    clipped_region.name = self.name
    clipped_region.label = self.label
    clipped_region.source = "clipped"  # Indicate this is a derived region
    clipped_region.parent_region = self

    # Use an identity check, matching the `obj is not None` test above: a
    # truthiness test would report None for a supplied-but-falsy object and
    # could invoke its __bool__/__len__.
    logger.debug(
        f"Region {self.bbox}: Clipped to {clipped_region.bbox} "
        f"(constraints: obj={type(obj).__name__ if obj is not None else None}, "
        f"left={left}, top={top}, right={right}, bottom={bottom})"
    )
    return clipped_region
1128
+
874
1129
  def get_elements(
875
1130
  self, selector: Optional[str] = None, apply_exclusions=True, **kwargs
876
1131
  ) -> List["Element"]:
@@ -1258,8 +1513,6 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
1258
1513
  unique_tops = cluster_coords(tops)
1259
1514
  unique_lefts = cluster_coords(lefts)
1260
1515
 
1261
- # --- Setup tqdm --- #
1262
- tqdm = get_tqdm()
1263
1516
  # Determine iterable for tqdm
1264
1517
  cell_iterator = cell_dicts
1265
1518
  if show_progress:
@@ -1333,6 +1586,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
1333
1586
  self,
1334
1587
  *,
1335
1588
  text: str,
1589
+ contains: str = "all",
1336
1590
  apply_exclusions: bool = True,
1337
1591
  regex: bool = False,
1338
1592
  case: bool = True,
@@ -1344,6 +1598,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
1344
1598
  self,
1345
1599
  selector: str,
1346
1600
  *,
1601
+ contains: str = "all",
1347
1602
  apply_exclusions: bool = True,
1348
1603
  regex: bool = False,
1349
1604
  case: bool = True,
@@ -1355,6 +1610,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
1355
1610
  selector: Optional[str] = None, # Now optional
1356
1611
  *,
1357
1612
  text: Optional[str] = None, # New text parameter
1613
+ contains: str = "all", # New parameter for containment behavior
1358
1614
  apply_exclusions: bool = True,
1359
1615
  regex: bool = False,
1360
1616
  case: bool = True,
@@ -1368,6 +1624,9 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
1368
1624
  Args:
1369
1625
  selector: CSS-like selector string.
1370
1626
  text: Text content to search for (equivalent to 'text:contains(...)').
1627
+ contains: How to determine if elements are inside: 'all' (fully inside),
1628
+ 'any' (any overlap), or 'center' (center point inside).
1629
+ (default: "all")
1371
1630
  apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
1372
1631
  regex: Whether to use regex for text search (`selector` or `text`) (default: False).
1373
1632
  case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
@@ -1380,6 +1639,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
1380
1639
  elements = self.find_all(
1381
1640
  selector=selector,
1382
1641
  text=text,
1642
+ contains=contains,
1383
1643
  apply_exclusions=apply_exclusions,
1384
1644
  regex=regex,
1385
1645
  case=case,
@@ -1392,6 +1652,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
1392
1652
  self,
1393
1653
  *,
1394
1654
  text: str,
1655
+ contains: str = "all",
1395
1656
  apply_exclusions: bool = True,
1396
1657
  regex: bool = False,
1397
1658
  case: bool = True,
@@ -1403,6 +1664,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
1403
1664
  self,
1404
1665
  selector: str,
1405
1666
  *,
1667
+ contains: str = "all",
1406
1668
  apply_exclusions: bool = True,
1407
1669
  regex: bool = False,
1408
1670
  case: bool = True,
@@ -1414,6 +1676,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
1414
1676
  selector: Optional[str] = None, # Now optional
1415
1677
  *,
1416
1678
  text: Optional[str] = None, # New text parameter
1679
+ contains: str = "all", # New parameter to control inside/overlap behavior
1417
1680
  apply_exclusions: bool = True,
1418
1681
  regex: bool = False,
1419
1682
  case: bool = True,
@@ -1427,6 +1690,9 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
1427
1690
  Args:
1428
1691
  selector: CSS-like selector string.
1429
1692
  text: Text content to search for (equivalent to 'text:contains(...)').
1693
+ contains: How to determine if elements are inside: 'all' (fully inside),
1694
+ 'any' (any overlap), or 'center' (center point inside).
1695
+ (default: "all")
1430
1696
  apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
1431
1697
  regex: Whether to use regex for text search (`selector` or `text`) (default: False).
1432
1698
  case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
@@ -1442,6 +1708,10 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
1442
1708
  if selector is None and text is None:
1443
1709
  raise ValueError("Provide either 'selector' or 'text'.")
1444
1710
 
1711
+ # Validate contains parameter
1712
+ if contains not in ["all", "any", "center"]:
1713
+ raise ValueError(f"Invalid contains value: {contains}. Must be 'all', 'any', or 'center'")
1714
+
1445
1715
  # Construct selector if 'text' is provided
1446
1716
  effective_selector = ""
1447
1717
  if text is not None:
@@ -1481,22 +1751,34 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
1481
1751
  # Let the page handle its exclusion logic if needed
1482
1752
  potential_elements = self.page.find_all(
1483
1753
  selector=effective_selector,
1484
- apply_exclusions=False, # Apply exclusions LATER based on region bbox
1754
+ apply_exclusions=apply_exclusions,
1485
1755
  regex=regex,
1486
1756
  case=case,
1487
1757
  **kwargs,
1488
1758
  )
1489
1759
 
1490
- # Filter these elements to those strictly within the region's bounds
1760
+ # Filter these elements based on the specified containment method
1491
1761
  region_bbox = self.bbox
1492
- matching_elements = [
1493
- el
1494
- for el in potential_elements
1495
- if el.x0 >= region_bbox[0]
1496
- and el.top >= region_bbox[1]
1497
- and el.x1 <= region_bbox[2]
1498
- and el.bottom <= region_bbox[3]
1499
- ]
1762
+ matching_elements = []
1763
+
1764
+ if contains == "all": # Fully inside (strict)
1765
+ matching_elements = [
1766
+ el for el in potential_elements
1767
+ if el.x0 >= region_bbox[0]
1768
+ and el.top >= region_bbox[1]
1769
+ and el.x1 <= region_bbox[2]
1770
+ and el.bottom <= region_bbox[3]
1771
+ ]
1772
+ elif contains == "any": # Any overlap
1773
+ matching_elements = [
1774
+ el for el in potential_elements
1775
+ if self.intersects(el)
1776
+ ]
1777
+ elif contains == "center": # Center point inside
1778
+ matching_elements = [
1779
+ el for el in potential_elements
1780
+ if self.is_element_center_inside(el)
1781
+ ]
1500
1782
 
1501
1783
  return ElementCollection(matching_elements)
1502
1784
 
@@ -1745,7 +2027,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
1745
2027
 
1746
2028
  def get_sections(
1747
2029
  self, start_elements=None, end_elements=None, boundary_inclusion="both"
1748
- ) -> List["Region"]:
2030
+ ) -> "ElementCollection[Region]":
1749
2031
  """
1750
2032
  Get sections within this region based on start/end elements.
1751
2033
 
@@ -1865,7 +2147,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
1865
2147
  section = self.get_section_between(start_element, end_element, boundary_inclusion)
1866
2148
  sections.append(section)
1867
2149
 
1868
- return sections
2150
+ return ElementCollection(sections)
1869
2151
 
1870
2152
  def create_cells(self):
1871
2153
  """
@@ -1988,7 +2270,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
1988
2270
  from natural_pdf.qa.document_qa import get_qa_engine
1989
2271
  except ImportError:
1990
2272
  logger.error(
1991
- "Question answering requires optional dependencies. Install with `pip install natural-pdf[qa]`"
2273
+ "Question answering requires optional dependencies. Install with `pip install natural-pdf[core-ml]`"
1992
2274
  )
1993
2275
  return {
1994
2276
  "answer": None,
@@ -2381,3 +2663,94 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
2381
2663
  return ElementCollection(cell_regions)
2382
2664
 
2383
2665
  # --- END NEW METHOD ---
2666
+
2667
def to_text_element(
    self,
    text_content: Optional[Union[str, Callable[["Region"], Optional[str]]]] = None,
    source_label: str = "derived_from_region",
    object_type: str = "word",  # Or "char", controls how it's categorized
    default_font_size: float = 10.0,
    default_font_name: str = "RegionContent",
    confidence: Optional[float] = None,  # Allow overriding confidence
    add_to_page: bool = False
) -> "TextElement":
    """
    Build a TextElement that occupies exactly this region's bounding box.

    Args:
        text_content: Where the element's text comes from.
            - A string is used as-is.
            - A callable is invoked with this region; its return value becomes
              the text. If the callable raises, the error is logged and the
              text is None.
            - None (default) leaves the text as None.
        source_label: Value stored as the element's 'source'.
        object_type: Value stored as the element's 'object_type'
            (e.g., "word", "char").
        default_font_size: Value stored as the element's 'size'.
        default_font_name: Value stored as the element's 'fontname'.
        confidence: Confidence stored on the element. When omitted it is 1.0
            if the resolved text contains non-whitespace characters and 0.0
            otherwise.
        add_to_page: When True, the new element is also registered with the
            page's element manager ("word" -> "words", "char" -> "chars",
            anything else under its own name). If the page has no element
            manager a warning is logged instead. (Default: False)

    Returns:
        The newly created TextElement.

    Raises:
        ValueError: If the region does not have a valid 'page' attribute.
    """
    # Resolve the text: literal string, callback result, or None.
    actual_text: Optional[str] = None
    if isinstance(text_content, str):
        actual_text = text_content
    elif callable(text_content):
        try:
            actual_text = text_content(self)
        except Exception as e:
            logger.error(f"Error executing text_content callback for region {self.bbox}: {e}", exc_info=True)
            actual_text = None  # A failing callback yields no text

    # Derive a confidence only when the caller did not supply one.
    if confidence is None:
        has_visible_text = actual_text is not None and actual_text.strip()
        final_confidence = 1.0 if has_visible_text else 0.0
    else:
        final_confidence = confidence

    page = getattr(self, 'page', None)
    if page is None:
        raise ValueError("Region must have a valid 'page' attribute to create a TextElement.")

    black = (0, 0, 0)
    elem_data = {
        "text": actual_text,
        "x0": self.x0,
        "top": self.top,
        "x1": self.x1,
        "bottom": self.bottom,
        "width": self.width,
        "height": self.height,
        "object_type": object_type,
        "page_number": page.page_number,
        "stroking_color": getattr(self, 'stroking_color', black),
        "non_stroking_color": getattr(self, 'non_stroking_color', black),
        "fontname": default_font_name,
        "size": default_font_size,
        "upright": True,
        "direction": 1,
        "adv": self.width,
        "source": source_label,
        "confidence": final_confidence,
        "_char_dicts": []
    }
    text_element = TextElement(elem_data, page)

    if not add_to_page:
        return text_element

    element_mgr = getattr(page, '_element_mgr', None)
    if element_mgr is None:
        page_num_str = str(page.page_number) if hasattr(page, 'page_number') else 'N/A'
        logger.warning(f"Cannot add TextElement to page: Page {page_num_str} for region {self.bbox} is missing '_element_mgr'.")
        return text_element

    # Map the singular object type onto the manager's plural bucket name.
    if object_type == "word":
        add_as_type = "words"
    elif object_type == "char":
        add_as_type = "chars"
    else:
        add_as_type = object_type

    # Errors from add_element are intentionally allowed to propagate.
    element_mgr.add_element(text_element, element_type=add_as_type)
    logger.debug(f"TextElement created from region {self.bbox} and added to page {page.page_number} as {add_as_type}.")
    return text_element
File without changes
Binary file
Binary file