natural-pdf 0.1.15__py3-none-any.whl → 0.1.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44):
  1. natural_pdf/__init__.py +31 -0
  2. natural_pdf/analyzers/layout/gemini.py +137 -162
  3. natural_pdf/analyzers/layout/layout_manager.py +9 -5
  4. natural_pdf/analyzers/layout/layout_options.py +77 -7
  5. natural_pdf/analyzers/layout/paddle.py +318 -165
  6. natural_pdf/analyzers/layout/table_structure_utils.py +78 -0
  7. natural_pdf/analyzers/shape_detection_mixin.py +770 -405
  8. natural_pdf/classification/mixin.py +2 -8
  9. natural_pdf/collections/pdf_collection.py +25 -30
  10. natural_pdf/core/highlighting_service.py +47 -32
  11. natural_pdf/core/page.py +119 -76
  12. natural_pdf/core/pdf.py +19 -22
  13. natural_pdf/describe/__init__.py +21 -0
  14. natural_pdf/describe/base.py +457 -0
  15. natural_pdf/describe/elements.py +411 -0
  16. natural_pdf/describe/mixin.py +84 -0
  17. natural_pdf/describe/summary.py +186 -0
  18. natural_pdf/elements/base.py +11 -10
  19. natural_pdf/elements/collections.py +116 -51
  20. natural_pdf/elements/region.py +204 -127
  21. natural_pdf/exporters/paddleocr.py +38 -13
  22. natural_pdf/flows/__init__.py +3 -3
  23. natural_pdf/flows/collections.py +303 -132
  24. natural_pdf/flows/element.py +277 -132
  25. natural_pdf/flows/flow.py +33 -16
  26. natural_pdf/flows/region.py +142 -79
  27. natural_pdf/ocr/engine_doctr.py +37 -4
  28. natural_pdf/ocr/engine_easyocr.py +23 -3
  29. natural_pdf/ocr/engine_paddle.py +281 -30
  30. natural_pdf/ocr/engine_surya.py +8 -3
  31. natural_pdf/ocr/ocr_manager.py +75 -76
  32. natural_pdf/ocr/ocr_options.py +52 -87
  33. natural_pdf/search/__init__.py +25 -12
  34. natural_pdf/search/lancedb_search_service.py +91 -54
  35. natural_pdf/search/numpy_search_service.py +86 -65
  36. natural_pdf/search/searchable_mixin.py +2 -2
  37. natural_pdf/selectors/parser.py +125 -81
  38. natural_pdf/widgets/__init__.py +1 -1
  39. natural_pdf/widgets/viewer.py +205 -449
  40. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/METADATA +27 -45
  41. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/RECORD +44 -38
  42. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/WHEEL +0 -0
  43. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/licenses/LICENSE +0 -0
  44. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/top_level.txt +0 -0
@@ -5,15 +5,20 @@ from pdfplumber.utils.geometry import get_bbox_overlap, merge_bboxes, objects_to
5
5
 
6
6
  # New Imports
7
7
  from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
8
+ from tqdm.auto import tqdm
8
9
 
9
10
  from natural_pdf.analyzers.layout.pdfplumber_table_finder import find_text_based_tables
11
+
12
+ # --- Shape Detection Mixin --- #
13
+ from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
10
14
  from natural_pdf.classification.manager import ClassificationManager # Keep for type hint
11
15
 
12
16
  # --- Classification Imports --- #
13
17
  from natural_pdf.classification.mixin import ClassificationMixin
18
+ from natural_pdf.describe.mixin import DescribeMixin
14
19
  from natural_pdf.elements.base import DirectionalMixin
20
+ from natural_pdf.elements.text import TextElement # ADDED IMPORT
15
21
  from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
16
- from natural_pdf.elements.text import TextElement # ADDED IMPORT
17
22
  from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements # Import utility
18
23
  from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
19
24
  from natural_pdf.utils.locks import pdf_render_lock # Import the lock
@@ -21,21 +26,19 @@ from natural_pdf.utils.locks import pdf_render_lock # Import the lock
21
26
  # Import new utils
22
27
  from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
23
28
 
24
- from tqdm.auto import tqdm
25
29
  # --- End Classification Imports --- #
26
30
 
27
- # --- Shape Detection Mixin --- #
28
- from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
29
- # --- End Shape Detection Mixin --- #
31
+
32
+ # --- End Shape Detection Mixin --- #
30
33
 
31
34
  if TYPE_CHECKING:
32
35
  # --- NEW: Add Image type hint for classification --- #
33
36
  from PIL.Image import Image
34
37
 
35
38
  from natural_pdf.core.page import Page
39
+ from natural_pdf.elements.base import Element # Added for type hint
36
40
  from natural_pdf.elements.collections import ElementCollection
37
41
  from natural_pdf.elements.text import TextElement
38
- from natural_pdf.elements.base import Element # Added for type hint
39
42
 
40
43
  # Import OCRManager conditionally to avoid circular imports
41
44
  try:
@@ -47,7 +50,7 @@ except ImportError:
47
50
  logger = logging.getLogger(__name__)
48
51
 
49
52
 
50
- class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
53
+ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMixin):
51
54
  """
52
55
  Represents a rectangular region on a page.
53
56
  """
@@ -726,25 +729,32 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
726
729
  # Handle the case where user wants the cropped region to have a specific width
727
730
  page_kwargs = kwargs.copy()
728
731
  effective_resolution = resolution # Start with the provided resolution
729
-
730
- if crop_only and 'width' in kwargs:
731
- target_width = kwargs['width']
732
+
733
+ if crop_only and "width" in kwargs:
734
+ target_width = kwargs["width"]
732
735
  # Calculate what resolution is needed to make the region crop have target_width
733
736
  region_width_points = self.width # Region width in PDF points
734
-
737
+
735
738
  if region_width_points > 0:
736
739
  # Calculate scale needed: target_width / region_width_points
737
740
  required_scale = target_width / region_width_points
738
741
  # Convert scale to resolution: scale * 72 DPI
739
742
  effective_resolution = required_scale * 72.0
740
- page_kwargs.pop('width') # Remove width parameter to avoid conflicts
741
- logger.debug(f"Region {self.bbox}: Calculated required resolution {effective_resolution:.1f} DPI for region crop width {target_width}")
743
+ page_kwargs.pop("width") # Remove width parameter to avoid conflicts
744
+ logger.debug(
745
+ f"Region {self.bbox}: Calculated required resolution {effective_resolution:.1f} DPI for region crop width {target_width}"
746
+ )
742
747
  else:
743
- logger.warning(f"Region {self.bbox}: Invalid region width {region_width_points}, using original resolution")
748
+ logger.warning(
749
+ f"Region {self.bbox}: Invalid region width {region_width_points}, using original resolution"
750
+ )
744
751
 
745
752
  # First get the full page image with highlights if requested
746
753
  page_image = self._page.to_image(
747
- scale=scale, resolution=effective_resolution, include_highlights=include_highlights, **page_kwargs
754
+ scale=scale,
755
+ resolution=effective_resolution,
756
+ include_highlights=include_highlights,
757
+ **page_kwargs,
748
758
  )
749
759
 
750
760
  # Calculate the actual scale factor used by the page image
@@ -899,13 +909,19 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
899
909
  image.save(filename)
900
910
  return self
901
911
 
902
- def trim(self, padding: int = 1, threshold: float = 0.95, resolution: float = 150, pre_shrink: float = 0.5) -> "Region":
912
+ def trim(
913
+ self,
914
+ padding: int = 1,
915
+ threshold: float = 0.95,
916
+ resolution: float = 150,
917
+ pre_shrink: float = 0.5,
918
+ ) -> "Region":
903
919
  """
904
920
  Trim visual whitespace from the edges of this region.
905
-
921
+
906
922
  Similar to Python's string .strip() method, but for visual whitespace in the region image.
907
923
  Uses pixel analysis to detect rows/columns that are predominantly whitespace.
908
-
924
+
909
925
  Args:
910
926
  padding: Number of pixels to keep as padding after trimming (default: 1)
911
927
  threshold: Threshold for considering a row/column as whitespace (0.0-1.0, default: 0.95)
@@ -914,104 +930,126 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
914
930
  resolution: Resolution for image rendering in DPI (default: 150)
915
931
  pre_shrink: Amount to shrink region before trimming, then expand back after (default: 0.5)
916
932
  This helps avoid detecting box borders/slivers as content.
917
-
933
+
918
934
  Returns:
919
935
  New Region with visual whitespace trimmed from all edges
920
-
936
+
921
937
  Example:
922
938
  # Basic trimming with 1 pixel padding and 0.5px pre-shrink
923
939
  trimmed = region.trim()
924
-
940
+
925
941
  # More aggressive trimming with no padding and no pre-shrink
926
942
  tight = region.trim(padding=0, threshold=0.9, pre_shrink=0)
927
-
943
+
928
944
  # Conservative trimming with more padding
929
945
  loose = region.trim(padding=3, threshold=0.98)
930
946
  """
931
947
  # Pre-shrink the region to avoid box slivers
932
- work_region = self.expand(left=-pre_shrink, right=-pre_shrink, top=-pre_shrink, bottom=-pre_shrink) if pre_shrink > 0 else self
933
-
948
+ work_region = (
949
+ self.expand(left=-pre_shrink, right=-pre_shrink, top=-pre_shrink, bottom=-pre_shrink)
950
+ if pre_shrink > 0
951
+ else self
952
+ )
953
+
934
954
  # Get the region image
935
- image = work_region.to_image(resolution=resolution, crop_only=True, include_highlights=False)
936
-
955
+ image = work_region.to_image(
956
+ resolution=resolution, crop_only=True, include_highlights=False
957
+ )
958
+
937
959
  if image is None:
938
- logger.warning(f"Region {self.bbox}: Could not generate image for trimming. Returning original region.")
960
+ logger.warning(
961
+ f"Region {self.bbox}: Could not generate image for trimming. Returning original region."
962
+ )
939
963
  return self
940
-
964
+
941
965
  # Convert to grayscale for easier analysis
942
966
  import numpy as np
943
-
967
+
944
968
  # Convert PIL image to numpy array
945
- img_array = np.array(image.convert('L')) # Convert to grayscale
969
+ img_array = np.array(image.convert("L")) # Convert to grayscale
946
970
  height, width = img_array.shape
947
-
971
+
948
972
  if height == 0 or width == 0:
949
- logger.warning(f"Region {self.bbox}: Image has zero dimensions. Returning original region.")
973
+ logger.warning(
974
+ f"Region {self.bbox}: Image has zero dimensions. Returning original region."
975
+ )
950
976
  return self
951
-
977
+
952
978
  # Normalize pixel values to 0-1 range (255 = white = 1.0, 0 = black = 0.0)
953
979
  normalized = img_array.astype(np.float32) / 255.0
954
-
980
+
955
981
  # Find content boundaries by analyzing row and column averages
956
-
982
+
957
983
  # Analyze rows (horizontal strips) to find top and bottom boundaries
958
984
  row_averages = np.mean(normalized, axis=1) # Average each row
959
985
  content_rows = row_averages < threshold # True where there's content (not whitespace)
960
-
986
+
961
987
  # Find first and last rows with content
962
988
  content_row_indices = np.where(content_rows)[0]
963
989
  if len(content_row_indices) == 0:
964
990
  # No content found, return a minimal region at the center
965
- logger.warning(f"Region {self.bbox}: No content detected during trimming. Returning center point.")
991
+ logger.warning(
992
+ f"Region {self.bbox}: No content detected during trimming. Returning center point."
993
+ )
966
994
  center_x = (self.x0 + self.x1) / 2
967
995
  center_y = (self.top + self.bottom) / 2
968
996
  return Region(self.page, (center_x, center_y, center_x, center_y))
969
-
997
+
970
998
  top_content_row = max(0, content_row_indices[0] - padding)
971
999
  bottom_content_row = min(height - 1, content_row_indices[-1] + padding)
972
-
973
- # Analyze columns (vertical strips) to find left and right boundaries
1000
+
1001
+ # Analyze columns (vertical strips) to find left and right boundaries
974
1002
  col_averages = np.mean(normalized, axis=0) # Average each column
975
1003
  content_cols = col_averages < threshold # True where there's content
976
-
1004
+
977
1005
  content_col_indices = np.where(content_cols)[0]
978
1006
  if len(content_col_indices) == 0:
979
1007
  # No content found in columns either
980
- logger.warning(f"Region {self.bbox}: No column content detected during trimming. Returning center point.")
1008
+ logger.warning(
1009
+ f"Region {self.bbox}: No column content detected during trimming. Returning center point."
1010
+ )
981
1011
  center_x = (self.x0 + self.x1) / 2
982
1012
  center_y = (self.top + self.bottom) / 2
983
1013
  return Region(self.page, (center_x, center_y, center_x, center_y))
984
-
1014
+
985
1015
  left_content_col = max(0, content_col_indices[0] - padding)
986
1016
  right_content_col = min(width - 1, content_col_indices[-1] + padding)
987
-
1017
+
988
1018
  # Convert trimmed pixel coordinates back to PDF coordinates
989
1019
  scale_factor = resolution / 72.0 # Scale factor used in to_image()
990
-
1020
+
991
1021
  # Calculate new PDF coordinates and ensure they are Python floats
992
1022
  trimmed_x0 = float(work_region.x0 + (left_content_col / scale_factor))
993
1023
  trimmed_top = float(work_region.top + (top_content_row / scale_factor))
994
- trimmed_x1 = float(work_region.x0 + ((right_content_col + 1) / scale_factor)) # +1 because we want inclusive right edge
995
- trimmed_bottom = float(work_region.top + ((bottom_content_row + 1) / scale_factor)) # +1 because we want inclusive bottom edge
996
-
1024
+ trimmed_x1 = float(
1025
+ work_region.x0 + ((right_content_col + 1) / scale_factor)
1026
+ ) # +1 because we want inclusive right edge
1027
+ trimmed_bottom = float(
1028
+ work_region.top + ((bottom_content_row + 1) / scale_factor)
1029
+ ) # +1 because we want inclusive bottom edge
1030
+
997
1031
  # Ensure the trimmed region doesn't exceed the work region boundaries
998
1032
  final_x0 = max(work_region.x0, trimmed_x0)
999
1033
  final_top = max(work_region.top, trimmed_top)
1000
1034
  final_x1 = min(work_region.x1, trimmed_x1)
1001
1035
  final_bottom = min(work_region.bottom, trimmed_bottom)
1002
-
1036
+
1003
1037
  # Ensure valid coordinates (width > 0, height > 0)
1004
1038
  if final_x1 <= final_x0 or final_bottom <= final_top:
1005
- logger.warning(f"Region {self.bbox}: Trimming resulted in invalid dimensions. Returning original region.")
1039
+ logger.warning(
1040
+ f"Region {self.bbox}: Trimming resulted in invalid dimensions. Returning original region."
1041
+ )
1006
1042
  return self
1007
-
1043
+
1008
1044
  # Create the trimmed region
1009
1045
  trimmed_region = Region(self.page, (final_x0, final_top, final_x1, final_bottom))
1010
-
1046
+
1011
1047
  # Expand back by the pre_shrink amount to restore original positioning
1012
1048
  if pre_shrink > 0:
1013
- trimmed_region = trimmed_region.expand(left=pre_shrink, right=pre_shrink, top=pre_shrink, bottom=pre_shrink)
1014
-
1049
+ trimmed_region = trimmed_region.expand(
1050
+ left=pre_shrink, right=pre_shrink, top=pre_shrink, bottom=pre_shrink
1051
+ )
1052
+
1015
1053
  # Copy relevant metadata
1016
1054
  trimmed_region.region_type = self.region_type
1017
1055
  trimmed_region.normalized_type = self.normalized_type
@@ -1021,8 +1059,10 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1021
1059
  trimmed_region.label = self.label
1022
1060
  trimmed_region.source = "trimmed" # Indicate this is a derived region
1023
1061
  trimmed_region.parent_region = self
1024
-
1025
- logger.debug(f"Region {self.bbox}: Trimmed to {trimmed_region.bbox} (padding={padding}, threshold={threshold}, pre_shrink={pre_shrink})")
1062
+
1063
+ logger.debug(
1064
+ f"Region {self.bbox}: Trimmed to {trimmed_region.bbox} (padding={padding}, threshold={threshold}, pre_shrink={pre_shrink})"
1065
+ )
1026
1066
  return trimmed_region
1027
1067
 
1028
1068
  def clip(
@@ -1035,42 +1075,42 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1035
1075
  ) -> "Region":
1036
1076
  """
1037
1077
  Clip this region to specific bounds, either from another object with bbox or explicit coordinates.
1038
-
1078
+
1039
1079
  The clipped region will be constrained to not exceed the specified boundaries.
1040
1080
  You can provide either an object with bounding box properties, specific coordinates, or both.
1041
1081
  When both are provided, explicit coordinates take precedence.
1042
-
1082
+
1043
1083
  Args:
1044
1084
  obj: Optional object with bbox properties (Region, Element, TextElement, etc.)
1045
1085
  left: Optional left boundary (x0) to clip to
1046
- top: Optional top boundary to clip to
1086
+ top: Optional top boundary to clip to
1047
1087
  right: Optional right boundary (x1) to clip to
1048
1088
  bottom: Optional bottom boundary to clip to
1049
-
1089
+
1050
1090
  Returns:
1051
1091
  New Region with bounds clipped to the specified constraints
1052
-
1092
+
1053
1093
  Examples:
1054
1094
  # Clip to another region's bounds
1055
1095
  clipped = region.clip(container_region)
1056
-
1096
+
1057
1097
  # Clip to any element's bounds
1058
1098
  clipped = region.clip(text_element)
1059
-
1099
+
1060
1100
  # Clip to specific coordinates
1061
1101
  clipped = region.clip(left=100, right=400)
1062
-
1102
+
1063
1103
  # Mix object bounds with specific overrides
1064
1104
  clipped = region.clip(obj=container, bottom=page.height/2)
1065
1105
  """
1066
1106
  from natural_pdf.elements.base import extract_bbox
1067
-
1107
+
1068
1108
  # Start with current region bounds
1069
1109
  clip_x0 = self.x0
1070
1110
  clip_top = self.top
1071
1111
  clip_x1 = self.x1
1072
1112
  clip_bottom = self.bottom
1073
-
1113
+
1074
1114
  # Apply object constraints if provided
1075
1115
  if obj is not None:
1076
1116
  obj_bbox = extract_bbox(obj)
@@ -1086,7 +1126,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1086
1126
  f"Region {self.bbox}: Cannot extract bbox from clipping object {type(obj)}. "
1087
1127
  "Object must have bbox property or x0/top/x1/bottom attributes."
1088
1128
  )
1089
-
1129
+
1090
1130
  # Apply explicit coordinate constraints (these take precedence)
1091
1131
  if left is not None:
1092
1132
  clip_x0 = max(clip_x0, left)
@@ -1096,7 +1136,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1096
1136
  clip_x1 = min(clip_x1, right)
1097
1137
  if bottom is not None:
1098
1138
  clip_bottom = min(clip_bottom, bottom)
1099
-
1139
+
1100
1140
  # Ensure valid coordinates
1101
1141
  if clip_x1 <= clip_x0 or clip_bottom <= clip_top:
1102
1142
  logger.warning(
@@ -1105,10 +1145,10 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1105
1145
  )
1106
1146
  # Return a minimal region at the clip area's top-left
1107
1147
  return Region(self.page, (clip_x0, clip_top, clip_x0, clip_top))
1108
-
1148
+
1109
1149
  # Create the clipped region
1110
1150
  clipped_region = Region(self.page, (clip_x0, clip_top, clip_x1, clip_bottom))
1111
-
1151
+
1112
1152
  # Copy relevant metadata
1113
1153
  clipped_region.region_type = self.region_type
1114
1154
  clipped_region.normalized_type = self.normalized_type
@@ -1118,7 +1158,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1118
1158
  clipped_region.label = self.label
1119
1159
  clipped_region.source = "clipped" # Indicate this is a derived region
1120
1160
  clipped_region.parent_region = self
1121
-
1161
+
1122
1162
  logger.debug(
1123
1163
  f"Region {self.bbox}: Clipped to {clipped_region.bbox} "
1124
1164
  f"(constraints: obj={type(obj).__name__ if obj else None}, "
@@ -1279,24 +1319,36 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1279
1319
  else:
1280
1320
  # Try lattice first, then fall back to stream if no meaningful results
1281
1321
  logger.debug(f"Region {self.bbox}: Auto-detecting table extraction method...")
1282
-
1322
+
1283
1323
  try:
1284
1324
  logger.debug(f"Region {self.bbox}: Trying 'lattice' method first...")
1285
- lattice_result = self.extract_table('lattice', table_settings=table_settings.copy())
1286
-
1325
+ lattice_result = self.extract_table(
1326
+ "lattice", table_settings=table_settings.copy()
1327
+ )
1328
+
1287
1329
  # Check if lattice found meaningful content
1288
- if (lattice_result and len(lattice_result) > 0 and
1289
- any(any(cell and cell.strip() for cell in row if cell) for row in lattice_result)):
1290
- logger.debug(f"Region {self.bbox}: 'lattice' method found table with {len(lattice_result)} rows")
1330
+ if (
1331
+ lattice_result
1332
+ and len(lattice_result) > 0
1333
+ and any(
1334
+ any(cell and cell.strip() for cell in row if cell)
1335
+ for row in lattice_result
1336
+ )
1337
+ ):
1338
+ logger.debug(
1339
+ f"Region {self.bbox}: 'lattice' method found table with {len(lattice_result)} rows"
1340
+ )
1291
1341
  return lattice_result
1292
1342
  else:
1293
- logger.debug(f"Region {self.bbox}: 'lattice' method found no meaningful content")
1343
+ logger.debug(
1344
+ f"Region {self.bbox}: 'lattice' method found no meaningful content"
1345
+ )
1294
1346
  except Exception as e:
1295
1347
  logger.debug(f"Region {self.bbox}: 'lattice' method failed: {e}")
1296
-
1348
+
1297
1349
  # Fall back to stream
1298
1350
  logger.debug(f"Region {self.bbox}: Falling back to 'stream' method...")
1299
- return self.extract_table('stream', table_settings=table_settings.copy())
1351
+ return self.extract_table("stream", table_settings=table_settings.copy())
1300
1352
  else:
1301
1353
  effective_method = method
1302
1354
 
@@ -1308,7 +1360,9 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1308
1360
  table_settings.setdefault("vertical_strategy", "text")
1309
1361
  table_settings.setdefault("horizontal_strategy", "text")
1310
1362
  elif effective_method == "lattice":
1311
- logger.debug("Using 'lattice' method alias for 'pdfplumber' with line-based strategies.")
1363
+ logger.debug(
1364
+ "Using 'lattice' method alias for 'pdfplumber' with line-based strategies."
1365
+ )
1312
1366
  effective_method = "pdfplumber"
1313
1367
  # Set default line strategies if not already provided by the user
1314
1368
  table_settings.setdefault("vertical_strategy", "lines")
@@ -1331,7 +1385,6 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1331
1385
  f"Unknown table extraction method: '{method}'. Choose from 'tatr', 'pdfplumber', 'text', 'stream', 'lattice'."
1332
1386
  )
1333
1387
 
1334
-
1335
1388
  def extract_tables(
1336
1389
  self,
1337
1390
  method: Optional[str] = None,
@@ -1357,33 +1410,45 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1357
1410
  # Auto-detect method if not specified (try lattice first, then stream)
1358
1411
  if method is None:
1359
1412
  logger.debug(f"Region {self.bbox}: Auto-detecting tables extraction method...")
1360
-
1413
+
1361
1414
  # Try lattice first
1362
1415
  try:
1363
1416
  lattice_settings = table_settings.copy()
1364
1417
  lattice_settings.setdefault("vertical_strategy", "lines")
1365
1418
  lattice_settings.setdefault("horizontal_strategy", "lines")
1366
-
1419
+
1367
1420
  logger.debug(f"Region {self.bbox}: Trying 'lattice' method first for tables...")
1368
1421
  lattice_result = self._extract_tables_plumber(lattice_settings)
1369
-
1422
+
1370
1423
  # Check if lattice found meaningful tables
1371
- if (lattice_result and len(lattice_result) > 0 and
1372
- any(any(any(cell and cell.strip() for cell in row if cell) for row in table if table) for table in lattice_result)):
1373
- logger.debug(f"Region {self.bbox}: 'lattice' method found {len(lattice_result)} tables")
1424
+ if (
1425
+ lattice_result
1426
+ and len(lattice_result) > 0
1427
+ and any(
1428
+ any(
1429
+ any(cell and cell.strip() for cell in row if cell)
1430
+ for row in table
1431
+ if table
1432
+ )
1433
+ for table in lattice_result
1434
+ )
1435
+ ):
1436
+ logger.debug(
1437
+ f"Region {self.bbox}: 'lattice' method found {len(lattice_result)} tables"
1438
+ )
1374
1439
  return lattice_result
1375
1440
  else:
1376
1441
  logger.debug(f"Region {self.bbox}: 'lattice' method found no meaningful tables")
1377
-
1442
+
1378
1443
  except Exception as e:
1379
1444
  logger.debug(f"Region {self.bbox}: 'lattice' method failed: {e}")
1380
-
1445
+
1381
1446
  # Fall back to stream
1382
1447
  logger.debug(f"Region {self.bbox}: Falling back to 'stream' method for tables...")
1383
1448
  stream_settings = table_settings.copy()
1384
1449
  stream_settings.setdefault("vertical_strategy", "text")
1385
1450
  stream_settings.setdefault("horizontal_strategy", "text")
1386
-
1451
+
1387
1452
  return self._extract_tables_plumber(stream_settings)
1388
1453
 
1389
1454
  effective_method = method
@@ -1395,7 +1460,9 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1395
1460
  table_settings.setdefault("vertical_strategy", "text")
1396
1461
  table_settings.setdefault("horizontal_strategy", "text")
1397
1462
  elif effective_method == "lattice":
1398
- logger.debug("Using 'lattice' method alias for 'pdfplumber' with line-based strategies.")
1463
+ logger.debug(
1464
+ "Using 'lattice' method alias for 'pdfplumber' with line-based strategies."
1465
+ )
1399
1466
  effective_method = "pdfplumber"
1400
1467
  table_settings.setdefault("vertical_strategy", "lines")
1401
1468
  table_settings.setdefault("horizontal_strategy", "lines")
@@ -1844,7 +1911,9 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1844
1911
 
1845
1912
  # Validate contains parameter
1846
1913
  if contains not in ["all", "any", "center"]:
1847
- raise ValueError(f"Invalid contains value: {contains}. Must be 'all', 'any', or 'center'")
1914
+ raise ValueError(
1915
+ f"Invalid contains value: {contains}. Must be 'all', 'any', or 'center'"
1916
+ )
1848
1917
 
1849
1918
  # Construct selector if 'text' is provided
1850
1919
  effective_selector = ""
@@ -1894,24 +1963,21 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1894
1963
  # Filter these elements based on the specified containment method
1895
1964
  region_bbox = self.bbox
1896
1965
  matching_elements = []
1897
-
1966
+
1898
1967
  if contains == "all": # Fully inside (strict)
1899
1968
  matching_elements = [
1900
- el for el in potential_elements
1969
+ el
1970
+ for el in potential_elements
1901
1971
  if el.x0 >= region_bbox[0]
1902
1972
  and el.top >= region_bbox[1]
1903
1973
  and el.x1 <= region_bbox[2]
1904
1974
  and el.bottom <= region_bbox[3]
1905
1975
  ]
1906
1976
  elif contains == "any": # Any overlap
1907
- matching_elements = [
1908
- el for el in potential_elements
1909
- if self.intersects(el)
1910
- ]
1977
+ matching_elements = [el for el in potential_elements if self.intersects(el)]
1911
1978
  elif contains == "center": # Center point inside
1912
1979
  matching_elements = [
1913
- el for el in potential_elements
1914
- if self.is_element_center_inside(el)
1980
+ el for el in potential_elements if self.is_element_center_inside(el)
1915
1981
  ]
1916
1982
 
1917
1983
  return ElementCollection(matching_elements)
@@ -2001,17 +2067,13 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
2001
2067
  manager_args = {k: v for k, v in manager_args.items() if v is not None}
2002
2068
 
2003
2069
  # Run OCR on this region's image using the manager
2004
- try:
2005
- results = ocr_mgr.apply_ocr(**manager_args)
2006
- if not isinstance(results, list):
2007
- logger.error(
2008
- f"OCRManager returned unexpected type for single region image: {type(results)}"
2009
- )
2010
- return self
2011
- logger.debug(f"Region OCR processing returned {len(results)} results.")
2012
- except Exception as e:
2013
- logger.error(f"Error during OCRManager processing for region: {e}", exc_info=True)
2070
+ results = ocr_mgr.apply_ocr(**manager_args)
2071
+ if not isinstance(results, list):
2072
+ logger.error(
2073
+ f"OCRManager returned unexpected type for single region image: {type(results)}"
2074
+ )
2014
2075
  return self
2076
+ logger.debug(f"Region OCR processing returned {len(results)} results.")
2015
2077
 
2016
2078
  # Convert results to TextElements
2017
2079
  scale_x = self.width / region_image.width if region_image.width > 0 else 1.0
@@ -2802,11 +2864,11 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
2802
2864
  self,
2803
2865
  text_content: Optional[Union[str, Callable[["Region"], Optional[str]]]] = None,
2804
2866
  source_label: str = "derived_from_region",
2805
- object_type: str = "word", # Or "char", controls how it's categorized
2867
+ object_type: str = "word", # Or "char", controls how it's categorized
2806
2868
  default_font_size: float = 10.0,
2807
2869
  default_font_name: str = "RegionContent",
2808
- confidence: Optional[float] = None, # Allow overriding confidence
2809
- add_to_page: bool = False # NEW: Option to add to page
2870
+ confidence: Optional[float] = None, # Allow overriding confidence
2871
+ add_to_page: bool = False, # NEW: Option to add to page
2810
2872
  ) -> "TextElement":
2811
2873
  """
2812
2874
  Creates a new TextElement object based on this region's geometry.
@@ -2833,7 +2895,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
2833
2895
 
2834
2896
  Returns:
2835
2897
  A new TextElement instance.
2836
-
2898
+
2837
2899
  Raises:
2838
2900
  ValueError: If the region does not have a valid 'page' attribute.
2839
2901
  """
@@ -2844,14 +2906,17 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
2844
2906
  try:
2845
2907
  actual_text = text_content(self)
2846
2908
  except Exception as e:
2847
- logger.error(f"Error executing text_content callback for region {self.bbox}: {e}", exc_info=True)
2848
- actual_text = None # Ensure actual_text is None on error
2909
+ logger.error(
2910
+ f"Error executing text_content callback for region {self.bbox}: {e}",
2911
+ exc_info=True,
2912
+ )
2913
+ actual_text = None # Ensure actual_text is None on error
2849
2914
 
2850
2915
  final_confidence = confidence
2851
2916
  if final_confidence is None:
2852
2917
  final_confidence = 1.0 if actual_text is not None and actual_text.strip() else 0.0
2853
2918
 
2854
- if not hasattr(self, 'page') or self.page is None:
2919
+ if not hasattr(self, "page") or self.page is None:
2855
2920
  raise ValueError("Region must have a valid 'page' attribute to create a TextElement.")
2856
2921
 
2857
2922
  elem_data = {
@@ -2864,8 +2929,8 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
2864
2929
  "height": self.height,
2865
2930
  "object_type": object_type,
2866
2931
  "page_number": self.page.page_number,
2867
- "stroking_color": getattr(self, 'stroking_color', (0,0,0)),
2868
- "non_stroking_color": getattr(self, 'non_stroking_color', (0,0,0)),
2932
+ "stroking_color": getattr(self, "stroking_color", (0, 0, 0)),
2933
+ "non_stroking_color": getattr(self, "non_stroking_color", (0, 0, 0)),
2869
2934
  "fontname": default_font_name,
2870
2935
  "size": default_font_size,
2871
2936
  "upright": True,
@@ -2873,18 +2938,30 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
2873
2938
  "adv": self.width,
2874
2939
  "source": source_label,
2875
2940
  "confidence": final_confidence,
2876
- "_char_dicts": []
2941
+ "_char_dicts": [],
2877
2942
  }
2878
2943
  text_element = TextElement(elem_data, self.page)
2879
2944
 
2880
2945
  if add_to_page:
2881
- if hasattr(self.page, '_element_mgr') and self.page._element_mgr is not None:
2882
- add_as_type = "words" if object_type == "word" else "chars" if object_type == "char" else object_type
2946
+ if hasattr(self.page, "_element_mgr") and self.page._element_mgr is not None:
2947
+ add_as_type = (
2948
+ "words"
2949
+ if object_type == "word"
2950
+ else "chars" if object_type == "char" else object_type
2951
+ )
2883
2952
  # REMOVED try-except block around add_element
2884
2953
  self.page._element_mgr.add_element(text_element, element_type=add_as_type)
2885
- logger.debug(f"TextElement created from region {self.bbox} and added to page {self.page.page_number} as {add_as_type}.")
2954
+ logger.debug(
2955
+ f"TextElement created from region {self.bbox} and added to page {self.page.page_number} as {add_as_type}."
2956
+ )
2886
2957
  else:
2887
- page_num_str = str(self.page.page_number) if hasattr(self.page, 'page_number') else 'N/A'
2888
- logger.warning(f"Cannot add TextElement to page: Page {page_num_str} for region {self.bbox} is missing '_element_mgr'.")
2889
-
2958
+ page_num_str = (
2959
+ str(self.page.page_number) if hasattr(self.page, "page_number") else "N/A"
2960
+ )
2961
+ logger.warning(
2962
+ f"Cannot add TextElement to page: Page {page_num_str} for region {self.bbox} is missing '_element_mgr'."
2963
+ )
2964
+
2890
2965
  return text_element
2966
+
2967
+