natural-pdf 0.1.12__py3-none-any.whl → 0.1.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -13,6 +13,7 @@ from natural_pdf.classification.manager import ClassificationManager # Keep for
13
13
  from natural_pdf.classification.mixin import ClassificationMixin
14
14
  from natural_pdf.elements.base import DirectionalMixin
15
15
  from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
16
+ from natural_pdf.elements.text import TextElement # ADDED IMPORT
16
17
  from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements # Import utility
17
18
  from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
18
19
  from natural_pdf.utils.locks import pdf_render_lock # Import the lock
@@ -20,11 +21,12 @@ from natural_pdf.utils.locks import pdf_render_lock # Import the lock
20
21
  # Import new utils
21
22
  from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
22
23
 
23
- # --- NEW: Import tqdm utility --- #
24
- from natural_pdf.utils.tqdm_utils import get_tqdm
25
-
24
+ from tqdm.auto import tqdm
26
25
  # --- End Classification Imports --- #
27
26
 
27
+ # --- Shape Detection Mixin --- #
28
+ from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
29
+ # --- End Shape Detection Mixin --- #
28
30
 
29
31
  if TYPE_CHECKING:
30
32
  # --- NEW: Add Image type hint for classification --- #
@@ -33,6 +35,7 @@ if TYPE_CHECKING:
33
35
  from natural_pdf.core.page import Page
34
36
  from natural_pdf.elements.collections import ElementCollection
35
37
  from natural_pdf.elements.text import TextElement
38
+ from natural_pdf.elements.base import Element # Added for type hint
36
39
 
37
40
  # Import OCRManager conditionally to avoid circular imports
38
41
  try:
@@ -44,7 +47,7 @@ except ImportError:
44
47
  logger = logging.getLogger(__name__)
45
48
 
46
49
 
47
- class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
50
+ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
48
51
  """
49
52
  Represents a rectangular region on a page.
50
53
  """
@@ -103,7 +106,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
103
106
  direction: str,
104
107
  size: Optional[float] = None,
105
108
  cross_size: str = "full",
106
- include_element: bool = False,
109
+ include_source: bool = False,
107
110
  until: Optional[str] = None,
108
111
  include_endpoint: bool = True,
109
112
  **kwargs,
@@ -115,7 +118,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
115
118
  direction: 'left', 'right', 'above', or 'below'
116
119
  size: Size in the primary direction (width for horizontal, height for vertical)
117
120
  cross_size: Size in the cross direction ('full' or 'element')
118
- include_element: Whether to include this region's area in the result
121
+ include_source: Whether to include this region's area in the result
119
122
  until: Optional selector string to specify a boundary element
120
123
  include_endpoint: Whether to include the boundary element found by 'until'
121
124
  **kwargs: Additional parameters for the 'until' selector search
@@ -129,7 +132,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
129
132
  is_positive = direction in ("right", "below") # right/below are positive directions
130
133
  pixel_offset = 1 # Offset for excluding elements/endpoints
131
134
 
132
- # 1. Determine initial boundaries based on direction and include_element
135
+ # 1. Determine initial boundaries based on direction and include_source
133
136
  if is_horizontal:
134
137
  # Initial cross-boundaries (vertical)
135
138
  y0 = 0 if cross_size == "full" else self.top
@@ -137,11 +140,11 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
137
140
 
138
141
  # Initial primary boundaries (horizontal)
139
142
  if is_positive: # right
140
- x0_initial = self.x0 if include_element else self.x1 + pixel_offset
143
+ x0_initial = self.x0 if include_source else self.x1 + pixel_offset
141
144
  x1_initial = self.x1 # This edge moves
142
145
  else: # left
143
146
  x0_initial = self.x0 # This edge moves
144
- x1_initial = self.x1 if include_element else self.x0 - pixel_offset
147
+ x1_initial = self.x1 if include_source else self.x0 - pixel_offset
145
148
  else: # Vertical
146
149
  # Initial cross-boundaries (horizontal)
147
150
  x0 = 0 if cross_size == "full" else self.x0
@@ -149,11 +152,11 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
149
152
 
150
153
  # Initial primary boundaries (vertical)
151
154
  if is_positive: # below
152
- y0_initial = self.top if include_element else self.bottom + pixel_offset
155
+ y0_initial = self.top if include_source else self.bottom + pixel_offset
153
156
  y1_initial = self.bottom # This edge moves
154
157
  else: # above
155
158
  y0_initial = self.top # This edge moves
156
- y1_initial = self.bottom if include_element else self.top - pixel_offset
159
+ y1_initial = self.bottom if include_source else self.top - pixel_offset
157
160
 
158
161
  # 2. Calculate the final primary boundary, considering 'size' or page limits
159
162
  if is_horizontal:
@@ -245,7 +248,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
245
248
  # 5. Create and return Region
246
249
  region = Region(self.page, final_bbox)
247
250
  region.source_element = self
248
- region.includes_source = include_element
251
+ region.includes_source = include_source
249
252
  # Optionally store the boundary element if found
250
253
  if target:
251
254
  region.boundary_element = target
@@ -256,7 +259,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
256
259
  self,
257
260
  height: Optional[float] = None,
258
261
  width: str = "full",
259
- include_element: bool = False,
262
+ include_source: bool = False,
260
263
  until: Optional[str] = None,
261
264
  include_endpoint: bool = True,
262
265
  **kwargs,
@@ -267,7 +270,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
267
270
  Args:
268
271
  height: Height of the region above, in points
269
272
  width: Width mode - "full" for full page width or "element" for element width
270
- include_element: Whether to include this region in the result (default: False)
273
+ include_source: Whether to include this region in the result (default: False)
271
274
  until: Optional selector string to specify an upper boundary element
272
275
  include_endpoint: Whether to include the boundary element in the region (default: True)
273
276
  **kwargs: Additional parameters
@@ -279,7 +282,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
279
282
  direction="above",
280
283
  size=height,
281
284
  cross_size=width,
282
- include_element=include_element,
285
+ include_source=include_source,
283
286
  until=until,
284
287
  include_endpoint=include_endpoint,
285
288
  **kwargs,
@@ -289,7 +292,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
289
292
  self,
290
293
  height: Optional[float] = None,
291
294
  width: str = "full",
292
- include_element: bool = False,
295
+ include_source: bool = False,
293
296
  until: Optional[str] = None,
294
297
  include_endpoint: bool = True,
295
298
  **kwargs,
@@ -300,7 +303,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
300
303
  Args:
301
304
  height: Height of the region below, in points
302
305
  width: Width mode - "full" for full page width or "element" for element width
303
- include_element: Whether to include this region in the result (default: False)
306
+ include_source: Whether to include this region in the result (default: False)
304
307
  until: Optional selector string to specify a lower boundary element
305
308
  include_endpoint: Whether to include the boundary element in the region (default: True)
306
309
  **kwargs: Additional parameters
@@ -312,7 +315,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
312
315
  direction="below",
313
316
  size=height,
314
317
  cross_size=width,
315
- include_element=include_element,
318
+ include_source=include_source,
316
319
  until=until,
317
320
  include_endpoint=include_endpoint,
318
321
  **kwargs,
@@ -322,7 +325,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
322
325
  self,
323
326
  width: Optional[float] = None,
324
327
  height: str = "full",
325
- include_element: bool = False,
328
+ include_source: bool = False,
326
329
  until: Optional[str] = None,
327
330
  include_endpoint: bool = True,
328
331
  **kwargs,
@@ -333,7 +336,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
333
336
  Args:
334
337
  width: Width of the region to the left, in points
335
338
  height: Height mode - "full" for full page height or "element" for element height
336
- include_element: Whether to include this region in the result (default: False)
339
+ include_source: Whether to include this region in the result (default: False)
337
340
  until: Optional selector string to specify a left boundary element
338
341
  include_endpoint: Whether to include the boundary element in the region (default: True)
339
342
  **kwargs: Additional parameters
@@ -345,7 +348,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
345
348
  direction="left",
346
349
  size=width,
347
350
  cross_size=height,
348
- include_element=include_element,
351
+ include_source=include_source,
349
352
  until=until,
350
353
  include_endpoint=include_endpoint,
351
354
  **kwargs,
@@ -355,7 +358,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
355
358
  self,
356
359
  width: Optional[float] = None,
357
360
  height: str = "full",
358
- include_element: bool = False,
361
+ include_source: bool = False,
359
362
  until: Optional[str] = None,
360
363
  include_endpoint: bool = True,
361
364
  **kwargs,
@@ -366,7 +369,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
366
369
  Args:
367
370
  width: Width of the region to the right, in points
368
371
  height: Height mode - "full" for full page height or "element" for element height
369
- include_element: Whether to include this region in the result (default: False)
372
+ include_source: Whether to include this region in the result (default: False)
370
373
  until: Optional selector string to specify a right boundary element
371
374
  include_endpoint: Whether to include the boundary element in the region (default: True)
372
375
  **kwargs: Additional parameters
@@ -378,7 +381,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
378
381
  direction="right",
379
382
  size=width,
380
383
  cross_size=height,
381
- include_element=include_element,
384
+ include_source=include_source,
382
385
  until=until,
383
386
  include_endpoint=include_endpoint,
384
387
  **kwargs,
@@ -720,14 +723,36 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
720
723
  Returns:
721
724
  PIL Image of just this region
722
725
  """
726
+ # Handle the case where user wants the cropped region to have a specific width
727
+ page_kwargs = kwargs.copy()
728
+ effective_resolution = resolution # Start with the provided resolution
729
+
730
+ if crop_only and 'width' in kwargs:
731
+ target_width = kwargs['width']
732
+ # Calculate what resolution is needed to make the region crop have target_width
733
+ region_width_points = self.width # Region width in PDF points
734
+
735
+ if region_width_points > 0:
736
+ # Calculate scale needed: target_width / region_width_points
737
+ required_scale = target_width / region_width_points
738
+ # Convert scale to resolution: scale * 72 DPI
739
+ effective_resolution = required_scale * 72.0
740
+ page_kwargs.pop('width') # Remove width parameter to avoid conflicts
741
+ logger.debug(f"Region {self.bbox}: Calculated required resolution {effective_resolution:.1f} DPI for region crop width {target_width}")
742
+ else:
743
+ logger.warning(f"Region {self.bbox}: Invalid region width {region_width_points}, using original resolution")
744
+
723
745
  # First get the full page image with highlights if requested
724
746
  page_image = self._page.to_image(
725
- scale=scale, resolution=resolution, include_highlights=include_highlights, **kwargs
747
+ scale=scale, resolution=effective_resolution, include_highlights=include_highlights, **page_kwargs
726
748
  )
727
749
 
728
- # Calculate the crop coordinates - apply resolution scaling factor
729
- # PDF coordinates are in points (1/72 inch), but image is scaled by resolution
730
- scale_factor = resolution / 72.0 # Scale based on DPI
750
+ # Calculate the actual scale factor used by the page image
751
+ if page_image.width > 0 and self._page.width > 0:
752
+ scale_factor = page_image.width / self._page.width
753
+ else:
754
+ # Fallback to resolution-based calculation if dimensions are invalid
755
+ scale_factor = resolution / 72.0
731
756
 
732
757
  # Apply scaling to the coordinates
733
758
  x0 = int(self.x0 * scale_factor)
@@ -874,6 +899,233 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
874
899
  image.save(filename)
875
900
  return self
876
901
 
902
+ def trim(self, padding: int = 1, threshold: float = 0.95, resolution: float = 150, pre_shrink: float = 0.5) -> "Region":
903
+ """
904
+ Trim visual whitespace from the edges of this region.
905
+
906
+ Similar to Python's string .strip() method, but for visual whitespace in the region image.
907
+ Uses pixel analysis to detect rows/columns that are predominantly whitespace.
908
+
909
+ Args:
910
+ padding: Number of pixels to keep as padding after trimming (default: 1)
911
+ threshold: Threshold for considering a row/column as whitespace (0.0-1.0, default: 0.95)
912
+ Higher values mean more strict whitespace detection.
913
+ E.g., 0.95 means a row/column whose mean pixel brightness (0.0 = black, 1.0 = white) is at least 0.95 is considered whitespace.
914
+ resolution: Resolution for image rendering in DPI (default: 150)
915
+ pre_shrink: Amount to shrink region before trimming, then expand back after (default: 0.5)
916
+ This helps avoid detecting box borders/slivers as content.
917
+
918
+ Returns:
919
+ New Region with visual whitespace trimmed from all edges
920
+
921
+ Example:
922
+ # Basic trimming with 1 pixel padding and 0.5px pre-shrink
923
+ trimmed = region.trim()
924
+
925
+ # More aggressive trimming with no padding and no pre-shrink
926
+ tight = region.trim(padding=0, threshold=0.9, pre_shrink=0)
927
+
928
+ # Conservative trimming with more padding
929
+ loose = region.trim(padding=3, threshold=0.98)
930
+ """
931
+ # Pre-shrink the region to avoid box slivers
932
+ work_region = self.expand(left=-pre_shrink, right=-pre_shrink, top=-pre_shrink, bottom=-pre_shrink) if pre_shrink > 0 else self
933
+
934
+ # Get the region image
935
+ image = work_region.to_image(resolution=resolution, crop_only=True, include_highlights=False)
936
+
937
+ if image is None:
938
+ logger.warning(f"Region {self.bbox}: Could not generate image for trimming. Returning original region.")
939
+ return self
940
+
941
+ # Convert to grayscale for easier analysis
942
+ import numpy as np
943
+
944
+ # Convert PIL image to numpy array
945
+ img_array = np.array(image.convert('L')) # Convert to grayscale
946
+ height, width = img_array.shape
947
+
948
+ if height == 0 or width == 0:
949
+ logger.warning(f"Region {self.bbox}: Image has zero dimensions. Returning original region.")
950
+ return self
951
+
952
+ # Normalize pixel values to 0-1 range (255 = white = 1.0, 0 = black = 0.0)
953
+ normalized = img_array.astype(np.float32) / 255.0
954
+
955
+ # Find content boundaries by analyzing row and column averages
956
+
957
+ # Analyze rows (horizontal strips) to find top and bottom boundaries
958
+ row_averages = np.mean(normalized, axis=1) # Average each row
959
+ content_rows = row_averages < threshold # True where there's content (not whitespace)
960
+
961
+ # Find first and last rows with content
962
+ content_row_indices = np.where(content_rows)[0]
963
+ if len(content_row_indices) == 0:
964
+ # No content found, return a minimal region at the center
965
+ logger.warning(f"Region {self.bbox}: No content detected during trimming. Returning center point.")
966
+ center_x = (self.x0 + self.x1) / 2
967
+ center_y = (self.top + self.bottom) / 2
968
+ return Region(self.page, (center_x, center_y, center_x, center_y))
969
+
970
+ top_content_row = max(0, content_row_indices[0] - padding)
971
+ bottom_content_row = min(height - 1, content_row_indices[-1] + padding)
972
+
973
+ # Analyze columns (vertical strips) to find left and right boundaries
974
+ col_averages = np.mean(normalized, axis=0) # Average each column
975
+ content_cols = col_averages < threshold # True where there's content
976
+
977
+ content_col_indices = np.where(content_cols)[0]
978
+ if len(content_col_indices) == 0:
979
+ # No content found in columns either
980
+ logger.warning(f"Region {self.bbox}: No column content detected during trimming. Returning center point.")
981
+ center_x = (self.x0 + self.x1) / 2
982
+ center_y = (self.top + self.bottom) / 2
983
+ return Region(self.page, (center_x, center_y, center_x, center_y))
984
+
985
+ left_content_col = max(0, content_col_indices[0] - padding)
986
+ right_content_col = min(width - 1, content_col_indices[-1] + padding)
987
+
988
+ # Convert trimmed pixel coordinates back to PDF coordinates
989
+ scale_factor = resolution / 72.0 # Scale factor used in to_image()
990
+
991
+ # Calculate new PDF coordinates and ensure they are Python floats
992
+ trimmed_x0 = float(work_region.x0 + (left_content_col / scale_factor))
993
+ trimmed_top = float(work_region.top + (top_content_row / scale_factor))
994
+ trimmed_x1 = float(work_region.x0 + ((right_content_col + 1) / scale_factor)) # +1 because we want inclusive right edge
995
+ trimmed_bottom = float(work_region.top + ((bottom_content_row + 1) / scale_factor)) # +1 because we want inclusive bottom edge
996
+
997
+ # Ensure the trimmed region doesn't exceed the work region boundaries
998
+ final_x0 = max(work_region.x0, trimmed_x0)
999
+ final_top = max(work_region.top, trimmed_top)
1000
+ final_x1 = min(work_region.x1, trimmed_x1)
1001
+ final_bottom = min(work_region.bottom, trimmed_bottom)
1002
+
1003
+ # Ensure valid coordinates (width > 0, height > 0)
1004
+ if final_x1 <= final_x0 or final_bottom <= final_top:
1005
+ logger.warning(f"Region {self.bbox}: Trimming resulted in invalid dimensions. Returning original region.")
1006
+ return self
1007
+
1008
+ # Create the trimmed region
1009
+ trimmed_region = Region(self.page, (final_x0, final_top, final_x1, final_bottom))
1010
+
1011
+ # Expand back by the pre_shrink amount to restore original positioning
1012
+ if pre_shrink > 0:
1013
+ trimmed_region = trimmed_region.expand(left=pre_shrink, right=pre_shrink, top=pre_shrink, bottom=pre_shrink)
1014
+
1015
+ # Copy relevant metadata
1016
+ trimmed_region.region_type = self.region_type
1017
+ trimmed_region.normalized_type = self.normalized_type
1018
+ trimmed_region.confidence = self.confidence
1019
+ trimmed_region.model = self.model
1020
+ trimmed_region.name = self.name
1021
+ trimmed_region.label = self.label
1022
+ trimmed_region.source = "trimmed" # Indicate this is a derived region
1023
+ trimmed_region.parent_region = self
1024
+
1025
+ logger.debug(f"Region {self.bbox}: Trimmed to {trimmed_region.bbox} (padding={padding}, threshold={threshold}, pre_shrink={pre_shrink})")
1026
+ return trimmed_region
1027
+
1028
+ def clip(
1029
+ self,
1030
+ obj: Optional[Any] = None,
1031
+ left: Optional[float] = None,
1032
+ top: Optional[float] = None,
1033
+ right: Optional[float] = None,
1034
+ bottom: Optional[float] = None,
1035
+ ) -> "Region":
1036
+ """
1037
+ Clip this region to specific bounds, either from another object with bbox or explicit coordinates.
1038
+
1039
+ The clipped region will be constrained to not exceed the specified boundaries.
1040
+ You can provide either an object with bounding box properties, specific coordinates, or both.
1041
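+ When both are provided, the region is clipped to the intersection of all constraints; explicit coordinates can only tighten the object's bounds, never widen them.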
1042
+
1043
+ Args:
1044
+ obj: Optional object with bbox properties (Region, Element, TextElement, etc.)
1045
+ left: Optional left boundary (x0) to clip to
1046
+ top: Optional top boundary to clip to
1047
+ right: Optional right boundary (x1) to clip to
1048
+ bottom: Optional bottom boundary to clip to
1049
+
1050
+ Returns:
1051
+ New Region with bounds clipped to the specified constraints
1052
+
1053
+ Examples:
1054
+ # Clip to another region's bounds
1055
+ clipped = region.clip(container_region)
1056
+
1057
+ # Clip to any element's bounds
1058
+ clipped = region.clip(text_element)
1059
+
1060
+ # Clip to specific coordinates
1061
+ clipped = region.clip(left=100, right=400)
1062
+
1063
+ # Mix object bounds with specific overrides
1064
+ clipped = region.clip(obj=container, bottom=page.height/2)
1065
+ """
1066
+ from natural_pdf.elements.base import extract_bbox
1067
+
1068
+ # Start with current region bounds
1069
+ clip_x0 = self.x0
1070
+ clip_top = self.top
1071
+ clip_x1 = self.x1
1072
+ clip_bottom = self.bottom
1073
+
1074
+ # Apply object constraints if provided
1075
+ if obj is not None:
1076
+ obj_bbox = extract_bbox(obj)
1077
+ if obj_bbox is not None:
1078
+ obj_x0, obj_top, obj_x1, obj_bottom = obj_bbox
1079
+ # Constrain to the intersection with the provided object
1080
+ clip_x0 = max(clip_x0, obj_x0)
1081
+ clip_top = max(clip_top, obj_top)
1082
+ clip_x1 = min(clip_x1, obj_x1)
1083
+ clip_bottom = min(clip_bottom, obj_bottom)
1084
+ else:
1085
+ logger.warning(
1086
+ f"Region {self.bbox}: Cannot extract bbox from clipping object {type(obj)}. "
1087
+ "Object must have bbox property or x0/top/x1/bottom attributes."
1088
+ )
1089
+
1090
+ # Apply explicit coordinate constraints (intersected with the bounds so far; they can only tighten them)
1091
+ if left is not None:
1092
+ clip_x0 = max(clip_x0, left)
1093
+ if top is not None:
1094
+ clip_top = max(clip_top, top)
1095
+ if right is not None:
1096
+ clip_x1 = min(clip_x1, right)
1097
+ if bottom is not None:
1098
+ clip_bottom = min(clip_bottom, bottom)
1099
+
1100
+ # Ensure valid coordinates
1101
+ if clip_x1 <= clip_x0 or clip_bottom <= clip_top:
1102
+ logger.warning(
1103
+ f"Region {self.bbox}: Clipping resulted in invalid dimensions "
1104
+ f"({clip_x0}, {clip_top}, {clip_x1}, {clip_bottom}). Returning minimal region."
1105
+ )
1106
+ # Return a minimal region at the clip area's top-left
1107
+ return Region(self.page, (clip_x0, clip_top, clip_x0, clip_top))
1108
+
1109
+ # Create the clipped region
1110
+ clipped_region = Region(self.page, (clip_x0, clip_top, clip_x1, clip_bottom))
1111
+
1112
+ # Copy relevant metadata
1113
+ clipped_region.region_type = self.region_type
1114
+ clipped_region.normalized_type = self.normalized_type
1115
+ clipped_region.confidence = self.confidence
1116
+ clipped_region.model = self.model
1117
+ clipped_region.name = self.name
1118
+ clipped_region.label = self.label
1119
+ clipped_region.source = "clipped" # Indicate this is a derived region
1120
+ clipped_region.parent_region = self
1121
+
1122
+ logger.debug(
1123
+ f"Region {self.bbox}: Clipped to {clipped_region.bbox} "
1124
+ f"(constraints: obj={type(obj).__name__ if obj else None}, "
1125
+ f"left={left}, top={top}, right={right}, bottom={bottom})"
1126
+ )
1127
+ return clipped_region
1128
+
877
1129
  def get_elements(
878
1130
  self, selector: Optional[str] = None, apply_exclusions=True, **kwargs
879
1131
  ) -> List["Element"]:
@@ -1022,7 +1274,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
1022
1274
  if hasattr(self, "model") and self.model == "tatr" and self.region_type == "table":
1023
1275
  effective_method = "tatr"
1024
1276
  else:
1025
- effective_method = "text"
1277
+ effective_method = "plumber"
1026
1278
 
1027
1279
  logger.debug(f"Region {self.bbox}: Extracting table using method '{effective_method}'")
1028
1280
 
@@ -1045,6 +1297,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
1045
1297
  def _extract_table_plumber(self, table_settings: dict) -> List[List[str]]:
1046
1298
  """
1047
1299
  Extract table using pdfplumber's table extraction.
1300
+ This method extracts the largest table within the region.
1048
1301
 
1049
1302
  Args:
1050
1303
  table_settings: Settings for pdfplumber table extraction
@@ -1055,12 +1308,12 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
1055
1308
  # Create a crop of the page for this region
1056
1309
  cropped = self.page._page.crop(self.bbox)
1057
1310
 
1058
- # Extract table from the cropped area
1059
- tables = cropped.extract_tables(table_settings)
1311
+ # Extract the single largest table from the cropped area
1312
+ table = cropped.extract_table(table_settings)
1060
1313
 
1061
- # Return the first table or an empty list if none found
1062
- if tables:
1063
- return tables[0]
1314
+ # Return the table or an empty list if none found
1315
+ if table:
1316
+ return table
1064
1317
  return []
1065
1318
 
1066
1319
  def _extract_table_tatr(self, use_ocr=False, ocr_config=None) -> List[List[str]]:
@@ -1261,8 +1514,6 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
1261
1514
  unique_tops = cluster_coords(tops)
1262
1515
  unique_lefts = cluster_coords(lefts)
1263
1516
 
1264
- # --- Setup tqdm --- #
1265
- tqdm = get_tqdm()
1266
1517
  # Determine iterable for tqdm
1267
1518
  cell_iterator = cell_dicts
1268
1519
  if show_progress:
@@ -1777,7 +2028,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
1777
2028
 
1778
2029
  def get_sections(
1779
2030
  self, start_elements=None, end_elements=None, boundary_inclusion="both"
1780
- ) -> List["Region"]:
2031
+ ) -> "ElementCollection[Region]":
1781
2032
  """
1782
2033
  Get sections within this region based on start/end elements.
1783
2034
 
@@ -1897,7 +2148,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
1897
2148
  section = self.get_section_between(start_element, end_element, boundary_inclusion)
1898
2149
  sections.append(section)
1899
2150
 
1900
- return sections
2151
+ return ElementCollection(sections)
1901
2152
 
1902
2153
  def create_cells(self):
1903
2154
  """
@@ -2413,3 +2664,94 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
2413
2664
  return ElementCollection(cell_regions)
2414
2665
 
2415
2666
  # --- END NEW METHOD ---
2667
+
2668
+ def to_text_element(
2669
+ self,
2670
+ text_content: Optional[Union[str, Callable[["Region"], Optional[str]]]] = None,
2671
+ source_label: str = "derived_from_region",
2672
+ object_type: str = "word", # Or "char", controls how it's categorized
2673
+ default_font_size: float = 10.0,
2674
+ default_font_name: str = "RegionContent",
2675
+ confidence: Optional[float] = None, # Allow overriding confidence
2676
+ add_to_page: bool = False # NEW: Option to add to page
2677
+ ) -> "TextElement":
2678
+ """
2679
+ Creates a new TextElement object based on this region's geometry.
2680
+
2681
+ The text for the new TextElement can be provided directly,
2682
+ generated by a callback function, or left as None.
2683
+
2684
+ Args:
2685
+ text_content:
2686
+ - If a string, this will be the text of the new TextElement.
2687
+ - If a callable, it will be called with this region instance
2688
+ and its return value (a string or None) will be the text.
2689
+ - If None (default), the TextElement's text will be None.
2690
+ source_label: The 'source' attribute for the new TextElement.
2691
+ object_type: The 'object_type' for the TextElement's data dict
2692
+ (e.g., "word", "char").
2693
+ default_font_size: Placeholder font size if text is generated.
2694
+ default_font_name: Placeholder font name if text is generated.
2695
+ confidence: Confidence score for the text. If not specified, defaults to
2696
+ 1.0 when the resulting text is non-empty after stripping whitespace,
2697
+ and to 0.0 otherwise (no text, blank text, or a failed callback).
2698
+ add_to_page: If True, the created TextElement will be added to the
2699
+ region's parent page. (Default: False)
2700
+
2701
+ Returns:
2702
+ A new TextElement instance.
2703
+
2704
+ Raises:
2705
+ ValueError: If the region does not have a valid 'page' attribute.
2706
+ """
2707
+ actual_text: Optional[str] = None
2708
+ if isinstance(text_content, str):
2709
+ actual_text = text_content
2710
+ elif callable(text_content):
2711
+ try:
2712
+ actual_text = text_content(self)
2713
+ except Exception as e:
2714
+ logger.error(f"Error executing text_content callback for region {self.bbox}: {e}", exc_info=True)
2715
+ actual_text = None # Ensure actual_text is None on error
2716
+
2717
+ final_confidence = confidence
2718
+ if final_confidence is None:
2719
+ final_confidence = 1.0 if actual_text is not None and actual_text.strip() else 0.0
2720
+
2721
+ if not hasattr(self, 'page') or self.page is None:
2722
+ raise ValueError("Region must have a valid 'page' attribute to create a TextElement.")
2723
+
2724
+ elem_data = {
2725
+ "text": actual_text,
2726
+ "x0": self.x0,
2727
+ "top": self.top,
2728
+ "x1": self.x1,
2729
+ "bottom": self.bottom,
2730
+ "width": self.width,
2731
+ "height": self.height,
2732
+ "object_type": object_type,
2733
+ "page_number": self.page.page_number,
2734
+ "stroking_color": getattr(self, 'stroking_color', (0,0,0)),
2735
+ "non_stroking_color": getattr(self, 'non_stroking_color', (0,0,0)),
2736
+ "fontname": default_font_name,
2737
+ "size": default_font_size,
2738
+ "upright": True,
2739
+ "direction": 1,
2740
+ "adv": self.width,
2741
+ "source": source_label,
2742
+ "confidence": final_confidence,
2743
+ "_char_dicts": []
2744
+ }
2745
+ text_element = TextElement(elem_data, self.page)
2746
+
2747
+ if add_to_page:
2748
+ if hasattr(self.page, '_element_mgr') and self.page._element_mgr is not None:
2749
+ add_as_type = "words" if object_type == "word" else "chars" if object_type == "char" else object_type
2750
+ # REMOVED try-except block around add_element
2751
+ self.page._element_mgr.add_element(text_element, element_type=add_as_type)
2752
+ logger.debug(f"TextElement created from region {self.bbox} and added to page {self.page.page_number} as {add_as_type}.")
2753
+ else:
2754
+ page_num_str = str(self.page.page_number) if hasattr(self.page, 'page_number') else 'N/A'
2755
+ logger.warning(f"Cannot add TextElement to page: Page {page_num_str} for region {self.bbox} is missing '_element_mgr'.")
2756
+
2757
+ return text_element