natural-pdf 0.1.15__py3-none-any.whl → 0.1.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. natural_pdf/__init__.py +31 -0
  2. natural_pdf/analyzers/layout/gemini.py +137 -162
  3. natural_pdf/analyzers/layout/layout_manager.py +9 -5
  4. natural_pdf/analyzers/layout/layout_options.py +77 -7
  5. natural_pdf/analyzers/layout/paddle.py +318 -165
  6. natural_pdf/analyzers/layout/table_structure_utils.py +78 -0
  7. natural_pdf/analyzers/shape_detection_mixin.py +770 -405
  8. natural_pdf/classification/mixin.py +2 -8
  9. natural_pdf/collections/pdf_collection.py +25 -30
  10. natural_pdf/core/highlighting_service.py +47 -32
  11. natural_pdf/core/page.py +117 -75
  12. natural_pdf/core/pdf.py +19 -22
  13. natural_pdf/elements/base.py +9 -9
  14. natural_pdf/elements/collections.py +105 -50
  15. natural_pdf/elements/region.py +200 -126
  16. natural_pdf/exporters/paddleocr.py +38 -13
  17. natural_pdf/flows/__init__.py +3 -3
  18. natural_pdf/flows/collections.py +303 -132
  19. natural_pdf/flows/element.py +277 -132
  20. natural_pdf/flows/flow.py +33 -16
  21. natural_pdf/flows/region.py +142 -79
  22. natural_pdf/ocr/engine_doctr.py +37 -4
  23. natural_pdf/ocr/engine_easyocr.py +23 -3
  24. natural_pdf/ocr/engine_paddle.py +281 -30
  25. natural_pdf/ocr/engine_surya.py +8 -3
  26. natural_pdf/ocr/ocr_manager.py +75 -76
  27. natural_pdf/ocr/ocr_options.py +52 -87
  28. natural_pdf/search/__init__.py +25 -12
  29. natural_pdf/search/lancedb_search_service.py +91 -54
  30. natural_pdf/search/numpy_search_service.py +86 -65
  31. natural_pdf/search/searchable_mixin.py +2 -2
  32. natural_pdf/selectors/parser.py +125 -81
  33. natural_pdf/widgets/__init__.py +1 -1
  34. natural_pdf/widgets/viewer.py +205 -449
  35. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/METADATA +27 -45
  36. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/RECORD +39 -38
  37. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/WHEEL +0 -0
  38. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/licenses/LICENSE +0 -0
  39. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/top_level.txt +0 -0
@@ -5,15 +5,19 @@ from pdfplumber.utils.geometry import get_bbox_overlap, merge_bboxes, objects_to
5
5
 
6
6
  # New Imports
7
7
  from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
8
+ from tqdm.auto import tqdm
8
9
 
9
10
  from natural_pdf.analyzers.layout.pdfplumber_table_finder import find_text_based_tables
11
+
12
+ # --- Shape Detection Mixin --- #
13
+ from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
10
14
  from natural_pdf.classification.manager import ClassificationManager # Keep for type hint
11
15
 
12
16
  # --- Classification Imports --- #
13
17
  from natural_pdf.classification.mixin import ClassificationMixin
14
18
  from natural_pdf.elements.base import DirectionalMixin
19
+ from natural_pdf.elements.text import TextElement # ADDED IMPORT
15
20
  from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
16
- from natural_pdf.elements.text import TextElement # ADDED IMPORT
17
21
  from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements # Import utility
18
22
  from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
19
23
  from natural_pdf.utils.locks import pdf_render_lock # Import the lock
@@ -21,21 +25,19 @@ from natural_pdf.utils.locks import pdf_render_lock # Import the lock
21
25
  # Import new utils
22
26
  from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
23
27
 
24
- from tqdm.auto import tqdm
25
28
  # --- End Classification Imports --- #
26
29
 
27
- # --- Shape Detection Mixin --- #
28
- from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
29
- # --- End Shape Detection Mixin --- #
30
+
31
+ # --- End Shape Detection Mixin --- #
30
32
 
31
33
  if TYPE_CHECKING:
32
34
  # --- NEW: Add Image type hint for classification --- #
33
35
  from PIL.Image import Image
34
36
 
35
37
  from natural_pdf.core.page import Page
38
+ from natural_pdf.elements.base import Element # Added for type hint
36
39
  from natural_pdf.elements.collections import ElementCollection
37
40
  from natural_pdf.elements.text import TextElement
38
- from natural_pdf.elements.base import Element # Added for type hint
39
41
 
40
42
  # Import OCRManager conditionally to avoid circular imports
41
43
  try:
@@ -726,25 +728,32 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
726
728
  # Handle the case where user wants the cropped region to have a specific width
727
729
  page_kwargs = kwargs.copy()
728
730
  effective_resolution = resolution # Start with the provided resolution
729
-
730
- if crop_only and 'width' in kwargs:
731
- target_width = kwargs['width']
731
+
732
+ if crop_only and "width" in kwargs:
733
+ target_width = kwargs["width"]
732
734
  # Calculate what resolution is needed to make the region crop have target_width
733
735
  region_width_points = self.width # Region width in PDF points
734
-
736
+
735
737
  if region_width_points > 0:
736
738
  # Calculate scale needed: target_width / region_width_points
737
739
  required_scale = target_width / region_width_points
738
740
  # Convert scale to resolution: scale * 72 DPI
739
741
  effective_resolution = required_scale * 72.0
740
- page_kwargs.pop('width') # Remove width parameter to avoid conflicts
741
- logger.debug(f"Region {self.bbox}: Calculated required resolution {effective_resolution:.1f} DPI for region crop width {target_width}")
742
+ page_kwargs.pop("width") # Remove width parameter to avoid conflicts
743
+ logger.debug(
744
+ f"Region {self.bbox}: Calculated required resolution {effective_resolution:.1f} DPI for region crop width {target_width}"
745
+ )
742
746
  else:
743
- logger.warning(f"Region {self.bbox}: Invalid region width {region_width_points}, using original resolution")
747
+ logger.warning(
748
+ f"Region {self.bbox}: Invalid region width {region_width_points}, using original resolution"
749
+ )
744
750
 
745
751
  # First get the full page image with highlights if requested
746
752
  page_image = self._page.to_image(
747
- scale=scale, resolution=effective_resolution, include_highlights=include_highlights, **page_kwargs
753
+ scale=scale,
754
+ resolution=effective_resolution,
755
+ include_highlights=include_highlights,
756
+ **page_kwargs,
748
757
  )
749
758
 
750
759
  # Calculate the actual scale factor used by the page image
@@ -899,13 +908,19 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
899
908
  image.save(filename)
900
909
  return self
901
910
 
902
- def trim(self, padding: int = 1, threshold: float = 0.95, resolution: float = 150, pre_shrink: float = 0.5) -> "Region":
911
+ def trim(
912
+ self,
913
+ padding: int = 1,
914
+ threshold: float = 0.95,
915
+ resolution: float = 150,
916
+ pre_shrink: float = 0.5,
917
+ ) -> "Region":
903
918
  """
904
919
  Trim visual whitespace from the edges of this region.
905
-
920
+
906
921
  Similar to Python's string .strip() method, but for visual whitespace in the region image.
907
922
  Uses pixel analysis to detect rows/columns that are predominantly whitespace.
908
-
923
+
909
924
  Args:
910
925
  padding: Number of pixels to keep as padding after trimming (default: 1)
911
926
  threshold: Threshold for considering a row/column as whitespace (0.0-1.0, default: 0.95)
@@ -914,104 +929,126 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
914
929
  resolution: Resolution for image rendering in DPI (default: 150)
915
930
  pre_shrink: Amount to shrink region before trimming, then expand back after (default: 0.5)
916
931
  This helps avoid detecting box borders/slivers as content.
917
-
932
+
918
933
  Returns:
919
934
  New Region with visual whitespace trimmed from all edges
920
-
935
+
921
936
  Example:
922
937
  # Basic trimming with 1 pixel padding and 0.5px pre-shrink
923
938
  trimmed = region.trim()
924
-
939
+
925
940
  # More aggressive trimming with no padding and no pre-shrink
926
941
  tight = region.trim(padding=0, threshold=0.9, pre_shrink=0)
927
-
942
+
928
943
  # Conservative trimming with more padding
929
944
  loose = region.trim(padding=3, threshold=0.98)
930
945
  """
931
946
  # Pre-shrink the region to avoid box slivers
932
- work_region = self.expand(left=-pre_shrink, right=-pre_shrink, top=-pre_shrink, bottom=-pre_shrink) if pre_shrink > 0 else self
933
-
947
+ work_region = (
948
+ self.expand(left=-pre_shrink, right=-pre_shrink, top=-pre_shrink, bottom=-pre_shrink)
949
+ if pre_shrink > 0
950
+ else self
951
+ )
952
+
934
953
  # Get the region image
935
- image = work_region.to_image(resolution=resolution, crop_only=True, include_highlights=False)
936
-
954
+ image = work_region.to_image(
955
+ resolution=resolution, crop_only=True, include_highlights=False
956
+ )
957
+
937
958
  if image is None:
938
- logger.warning(f"Region {self.bbox}: Could not generate image for trimming. Returning original region.")
959
+ logger.warning(
960
+ f"Region {self.bbox}: Could not generate image for trimming. Returning original region."
961
+ )
939
962
  return self
940
-
963
+
941
964
  # Convert to grayscale for easier analysis
942
965
  import numpy as np
943
-
966
+
944
967
  # Convert PIL image to numpy array
945
- img_array = np.array(image.convert('L')) # Convert to grayscale
968
+ img_array = np.array(image.convert("L")) # Convert to grayscale
946
969
  height, width = img_array.shape
947
-
970
+
948
971
  if height == 0 or width == 0:
949
- logger.warning(f"Region {self.bbox}: Image has zero dimensions. Returning original region.")
972
+ logger.warning(
973
+ f"Region {self.bbox}: Image has zero dimensions. Returning original region."
974
+ )
950
975
  return self
951
-
976
+
952
977
  # Normalize pixel values to 0-1 range (255 = white = 1.0, 0 = black = 0.0)
953
978
  normalized = img_array.astype(np.float32) / 255.0
954
-
979
+
955
980
  # Find content boundaries by analyzing row and column averages
956
-
981
+
957
982
  # Analyze rows (horizontal strips) to find top and bottom boundaries
958
983
  row_averages = np.mean(normalized, axis=1) # Average each row
959
984
  content_rows = row_averages < threshold # True where there's content (not whitespace)
960
-
985
+
961
986
  # Find first and last rows with content
962
987
  content_row_indices = np.where(content_rows)[0]
963
988
  if len(content_row_indices) == 0:
964
989
  # No content found, return a minimal region at the center
965
- logger.warning(f"Region {self.bbox}: No content detected during trimming. Returning center point.")
990
+ logger.warning(
991
+ f"Region {self.bbox}: No content detected during trimming. Returning center point."
992
+ )
966
993
  center_x = (self.x0 + self.x1) / 2
967
994
  center_y = (self.top + self.bottom) / 2
968
995
  return Region(self.page, (center_x, center_y, center_x, center_y))
969
-
996
+
970
997
  top_content_row = max(0, content_row_indices[0] - padding)
971
998
  bottom_content_row = min(height - 1, content_row_indices[-1] + padding)
972
-
973
- # Analyze columns (vertical strips) to find left and right boundaries
999
+
1000
+ # Analyze columns (vertical strips) to find left and right boundaries
974
1001
  col_averages = np.mean(normalized, axis=0) # Average each column
975
1002
  content_cols = col_averages < threshold # True where there's content
976
-
1003
+
977
1004
  content_col_indices = np.where(content_cols)[0]
978
1005
  if len(content_col_indices) == 0:
979
1006
  # No content found in columns either
980
- logger.warning(f"Region {self.bbox}: No column content detected during trimming. Returning center point.")
1007
+ logger.warning(
1008
+ f"Region {self.bbox}: No column content detected during trimming. Returning center point."
1009
+ )
981
1010
  center_x = (self.x0 + self.x1) / 2
982
1011
  center_y = (self.top + self.bottom) / 2
983
1012
  return Region(self.page, (center_x, center_y, center_x, center_y))
984
-
1013
+
985
1014
  left_content_col = max(0, content_col_indices[0] - padding)
986
1015
  right_content_col = min(width - 1, content_col_indices[-1] + padding)
987
-
1016
+
988
1017
  # Convert trimmed pixel coordinates back to PDF coordinates
989
1018
  scale_factor = resolution / 72.0 # Scale factor used in to_image()
990
-
1019
+
991
1020
  # Calculate new PDF coordinates and ensure they are Python floats
992
1021
  trimmed_x0 = float(work_region.x0 + (left_content_col / scale_factor))
993
1022
  trimmed_top = float(work_region.top + (top_content_row / scale_factor))
994
- trimmed_x1 = float(work_region.x0 + ((right_content_col + 1) / scale_factor)) # +1 because we want inclusive right edge
995
- trimmed_bottom = float(work_region.top + ((bottom_content_row + 1) / scale_factor)) # +1 because we want inclusive bottom edge
996
-
1023
+ trimmed_x1 = float(
1024
+ work_region.x0 + ((right_content_col + 1) / scale_factor)
1025
+ ) # +1 because we want inclusive right edge
1026
+ trimmed_bottom = float(
1027
+ work_region.top + ((bottom_content_row + 1) / scale_factor)
1028
+ ) # +1 because we want inclusive bottom edge
1029
+
997
1030
  # Ensure the trimmed region doesn't exceed the work region boundaries
998
1031
  final_x0 = max(work_region.x0, trimmed_x0)
999
1032
  final_top = max(work_region.top, trimmed_top)
1000
1033
  final_x1 = min(work_region.x1, trimmed_x1)
1001
1034
  final_bottom = min(work_region.bottom, trimmed_bottom)
1002
-
1035
+
1003
1036
  # Ensure valid coordinates (width > 0, height > 0)
1004
1037
  if final_x1 <= final_x0 or final_bottom <= final_top:
1005
- logger.warning(f"Region {self.bbox}: Trimming resulted in invalid dimensions. Returning original region.")
1038
+ logger.warning(
1039
+ f"Region {self.bbox}: Trimming resulted in invalid dimensions. Returning original region."
1040
+ )
1006
1041
  return self
1007
-
1042
+
1008
1043
  # Create the trimmed region
1009
1044
  trimmed_region = Region(self.page, (final_x0, final_top, final_x1, final_bottom))
1010
-
1045
+
1011
1046
  # Expand back by the pre_shrink amount to restore original positioning
1012
1047
  if pre_shrink > 0:
1013
- trimmed_region = trimmed_region.expand(left=pre_shrink, right=pre_shrink, top=pre_shrink, bottom=pre_shrink)
1014
-
1048
+ trimmed_region = trimmed_region.expand(
1049
+ left=pre_shrink, right=pre_shrink, top=pre_shrink, bottom=pre_shrink
1050
+ )
1051
+
1015
1052
  # Copy relevant metadata
1016
1053
  trimmed_region.region_type = self.region_type
1017
1054
  trimmed_region.normalized_type = self.normalized_type
@@ -1021,8 +1058,10 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1021
1058
  trimmed_region.label = self.label
1022
1059
  trimmed_region.source = "trimmed" # Indicate this is a derived region
1023
1060
  trimmed_region.parent_region = self
1024
-
1025
- logger.debug(f"Region {self.bbox}: Trimmed to {trimmed_region.bbox} (padding={padding}, threshold={threshold}, pre_shrink={pre_shrink})")
1061
+
1062
+ logger.debug(
1063
+ f"Region {self.bbox}: Trimmed to {trimmed_region.bbox} (padding={padding}, threshold={threshold}, pre_shrink={pre_shrink})"
1064
+ )
1026
1065
  return trimmed_region
1027
1066
 
1028
1067
  def clip(
@@ -1035,42 +1074,42 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1035
1074
  ) -> "Region":
1036
1075
  """
1037
1076
  Clip this region to specific bounds, either from another object with bbox or explicit coordinates.
1038
-
1077
+
1039
1078
  The clipped region will be constrained to not exceed the specified boundaries.
1040
1079
  You can provide either an object with bounding box properties, specific coordinates, or both.
1041
1080
  When both are provided, explicit coordinates take precedence.
1042
-
1081
+
1043
1082
  Args:
1044
1083
  obj: Optional object with bbox properties (Region, Element, TextElement, etc.)
1045
1084
  left: Optional left boundary (x0) to clip to
1046
- top: Optional top boundary to clip to
1085
+ top: Optional top boundary to clip to
1047
1086
  right: Optional right boundary (x1) to clip to
1048
1087
  bottom: Optional bottom boundary to clip to
1049
-
1088
+
1050
1089
  Returns:
1051
1090
  New Region with bounds clipped to the specified constraints
1052
-
1091
+
1053
1092
  Examples:
1054
1093
  # Clip to another region's bounds
1055
1094
  clipped = region.clip(container_region)
1056
-
1095
+
1057
1096
  # Clip to any element's bounds
1058
1097
  clipped = region.clip(text_element)
1059
-
1098
+
1060
1099
  # Clip to specific coordinates
1061
1100
  clipped = region.clip(left=100, right=400)
1062
-
1101
+
1063
1102
  # Mix object bounds with specific overrides
1064
1103
  clipped = region.clip(obj=container, bottom=page.height/2)
1065
1104
  """
1066
1105
  from natural_pdf.elements.base import extract_bbox
1067
-
1106
+
1068
1107
  # Start with current region bounds
1069
1108
  clip_x0 = self.x0
1070
1109
  clip_top = self.top
1071
1110
  clip_x1 = self.x1
1072
1111
  clip_bottom = self.bottom
1073
-
1112
+
1074
1113
  # Apply object constraints if provided
1075
1114
  if obj is not None:
1076
1115
  obj_bbox = extract_bbox(obj)
@@ -1086,7 +1125,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1086
1125
  f"Region {self.bbox}: Cannot extract bbox from clipping object {type(obj)}. "
1087
1126
  "Object must have bbox property or x0/top/x1/bottom attributes."
1088
1127
  )
1089
-
1128
+
1090
1129
  # Apply explicit coordinate constraints (these take precedence)
1091
1130
  if left is not None:
1092
1131
  clip_x0 = max(clip_x0, left)
@@ -1096,7 +1135,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1096
1135
  clip_x1 = min(clip_x1, right)
1097
1136
  if bottom is not None:
1098
1137
  clip_bottom = min(clip_bottom, bottom)
1099
-
1138
+
1100
1139
  # Ensure valid coordinates
1101
1140
  if clip_x1 <= clip_x0 or clip_bottom <= clip_top:
1102
1141
  logger.warning(
@@ -1105,10 +1144,10 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1105
1144
  )
1106
1145
  # Return a minimal region at the clip area's top-left
1107
1146
  return Region(self.page, (clip_x0, clip_top, clip_x0, clip_top))
1108
-
1147
+
1109
1148
  # Create the clipped region
1110
1149
  clipped_region = Region(self.page, (clip_x0, clip_top, clip_x1, clip_bottom))
1111
-
1150
+
1112
1151
  # Copy relevant metadata
1113
1152
  clipped_region.region_type = self.region_type
1114
1153
  clipped_region.normalized_type = self.normalized_type
@@ -1118,7 +1157,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1118
1157
  clipped_region.label = self.label
1119
1158
  clipped_region.source = "clipped" # Indicate this is a derived region
1120
1159
  clipped_region.parent_region = self
1121
-
1160
+
1122
1161
  logger.debug(
1123
1162
  f"Region {self.bbox}: Clipped to {clipped_region.bbox} "
1124
1163
  f"(constraints: obj={type(obj).__name__ if obj else None}, "
@@ -1279,24 +1318,36 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1279
1318
  else:
1280
1319
  # Try lattice first, then fall back to stream if no meaningful results
1281
1320
  logger.debug(f"Region {self.bbox}: Auto-detecting table extraction method...")
1282
-
1321
+
1283
1322
  try:
1284
1323
  logger.debug(f"Region {self.bbox}: Trying 'lattice' method first...")
1285
- lattice_result = self.extract_table('lattice', table_settings=table_settings.copy())
1286
-
1324
+ lattice_result = self.extract_table(
1325
+ "lattice", table_settings=table_settings.copy()
1326
+ )
1327
+
1287
1328
  # Check if lattice found meaningful content
1288
- if (lattice_result and len(lattice_result) > 0 and
1289
- any(any(cell and cell.strip() for cell in row if cell) for row in lattice_result)):
1290
- logger.debug(f"Region {self.bbox}: 'lattice' method found table with {len(lattice_result)} rows")
1329
+ if (
1330
+ lattice_result
1331
+ and len(lattice_result) > 0
1332
+ and any(
1333
+ any(cell and cell.strip() for cell in row if cell)
1334
+ for row in lattice_result
1335
+ )
1336
+ ):
1337
+ logger.debug(
1338
+ f"Region {self.bbox}: 'lattice' method found table with {len(lattice_result)} rows"
1339
+ )
1291
1340
  return lattice_result
1292
1341
  else:
1293
- logger.debug(f"Region {self.bbox}: 'lattice' method found no meaningful content")
1342
+ logger.debug(
1343
+ f"Region {self.bbox}: 'lattice' method found no meaningful content"
1344
+ )
1294
1345
  except Exception as e:
1295
1346
  logger.debug(f"Region {self.bbox}: 'lattice' method failed: {e}")
1296
-
1347
+
1297
1348
  # Fall back to stream
1298
1349
  logger.debug(f"Region {self.bbox}: Falling back to 'stream' method...")
1299
- return self.extract_table('stream', table_settings=table_settings.copy())
1350
+ return self.extract_table("stream", table_settings=table_settings.copy())
1300
1351
  else:
1301
1352
  effective_method = method
1302
1353
 
@@ -1308,7 +1359,9 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1308
1359
  table_settings.setdefault("vertical_strategy", "text")
1309
1360
  table_settings.setdefault("horizontal_strategy", "text")
1310
1361
  elif effective_method == "lattice":
1311
- logger.debug("Using 'lattice' method alias for 'pdfplumber' with line-based strategies.")
1362
+ logger.debug(
1363
+ "Using 'lattice' method alias for 'pdfplumber' with line-based strategies."
1364
+ )
1312
1365
  effective_method = "pdfplumber"
1313
1366
  # Set default line strategies if not already provided by the user
1314
1367
  table_settings.setdefault("vertical_strategy", "lines")
@@ -1331,7 +1384,6 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1331
1384
  f"Unknown table extraction method: '{method}'. Choose from 'tatr', 'pdfplumber', 'text', 'stream', 'lattice'."
1332
1385
  )
1333
1386
 
1334
-
1335
1387
  def extract_tables(
1336
1388
  self,
1337
1389
  method: Optional[str] = None,
@@ -1357,33 +1409,45 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1357
1409
  # Auto-detect method if not specified (try lattice first, then stream)
1358
1410
  if method is None:
1359
1411
  logger.debug(f"Region {self.bbox}: Auto-detecting tables extraction method...")
1360
-
1412
+
1361
1413
  # Try lattice first
1362
1414
  try:
1363
1415
  lattice_settings = table_settings.copy()
1364
1416
  lattice_settings.setdefault("vertical_strategy", "lines")
1365
1417
  lattice_settings.setdefault("horizontal_strategy", "lines")
1366
-
1418
+
1367
1419
  logger.debug(f"Region {self.bbox}: Trying 'lattice' method first for tables...")
1368
1420
  lattice_result = self._extract_tables_plumber(lattice_settings)
1369
-
1421
+
1370
1422
  # Check if lattice found meaningful tables
1371
- if (lattice_result and len(lattice_result) > 0 and
1372
- any(any(any(cell and cell.strip() for cell in row if cell) for row in table if table) for table in lattice_result)):
1373
- logger.debug(f"Region {self.bbox}: 'lattice' method found {len(lattice_result)} tables")
1423
+ if (
1424
+ lattice_result
1425
+ and len(lattice_result) > 0
1426
+ and any(
1427
+ any(
1428
+ any(cell and cell.strip() for cell in row if cell)
1429
+ for row in table
1430
+ if table
1431
+ )
1432
+ for table in lattice_result
1433
+ )
1434
+ ):
1435
+ logger.debug(
1436
+ f"Region {self.bbox}: 'lattice' method found {len(lattice_result)} tables"
1437
+ )
1374
1438
  return lattice_result
1375
1439
  else:
1376
1440
  logger.debug(f"Region {self.bbox}: 'lattice' method found no meaningful tables")
1377
-
1441
+
1378
1442
  except Exception as e:
1379
1443
  logger.debug(f"Region {self.bbox}: 'lattice' method failed: {e}")
1380
-
1444
+
1381
1445
  # Fall back to stream
1382
1446
  logger.debug(f"Region {self.bbox}: Falling back to 'stream' method for tables...")
1383
1447
  stream_settings = table_settings.copy()
1384
1448
  stream_settings.setdefault("vertical_strategy", "text")
1385
1449
  stream_settings.setdefault("horizontal_strategy", "text")
1386
-
1450
+
1387
1451
  return self._extract_tables_plumber(stream_settings)
1388
1452
 
1389
1453
  effective_method = method
@@ -1395,7 +1459,9 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1395
1459
  table_settings.setdefault("vertical_strategy", "text")
1396
1460
  table_settings.setdefault("horizontal_strategy", "text")
1397
1461
  elif effective_method == "lattice":
1398
- logger.debug("Using 'lattice' method alias for 'pdfplumber' with line-based strategies.")
1462
+ logger.debug(
1463
+ "Using 'lattice' method alias for 'pdfplumber' with line-based strategies."
1464
+ )
1399
1465
  effective_method = "pdfplumber"
1400
1466
  table_settings.setdefault("vertical_strategy", "lines")
1401
1467
  table_settings.setdefault("horizontal_strategy", "lines")
@@ -1844,7 +1910,9 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1844
1910
 
1845
1911
  # Validate contains parameter
1846
1912
  if contains not in ["all", "any", "center"]:
1847
- raise ValueError(f"Invalid contains value: {contains}. Must be 'all', 'any', or 'center'")
1913
+ raise ValueError(
1914
+ f"Invalid contains value: {contains}. Must be 'all', 'any', or 'center'"
1915
+ )
1848
1916
 
1849
1917
  # Construct selector if 'text' is provided
1850
1918
  effective_selector = ""
@@ -1894,24 +1962,21 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1894
1962
  # Filter these elements based on the specified containment method
1895
1963
  region_bbox = self.bbox
1896
1964
  matching_elements = []
1897
-
1965
+
1898
1966
  if contains == "all": # Fully inside (strict)
1899
1967
  matching_elements = [
1900
- el for el in potential_elements
1968
+ el
1969
+ for el in potential_elements
1901
1970
  if el.x0 >= region_bbox[0]
1902
1971
  and el.top >= region_bbox[1]
1903
1972
  and el.x1 <= region_bbox[2]
1904
1973
  and el.bottom <= region_bbox[3]
1905
1974
  ]
1906
1975
  elif contains == "any": # Any overlap
1907
- matching_elements = [
1908
- el for el in potential_elements
1909
- if self.intersects(el)
1910
- ]
1976
+ matching_elements = [el for el in potential_elements if self.intersects(el)]
1911
1977
  elif contains == "center": # Center point inside
1912
1978
  matching_elements = [
1913
- el for el in potential_elements
1914
- if self.is_element_center_inside(el)
1979
+ el for el in potential_elements if self.is_element_center_inside(el)
1915
1980
  ]
1916
1981
 
1917
1982
  return ElementCollection(matching_elements)
@@ -2001,17 +2066,13 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
2001
2066
  manager_args = {k: v for k, v in manager_args.items() if v is not None}
2002
2067
 
2003
2068
  # Run OCR on this region's image using the manager
2004
- try:
2005
- results = ocr_mgr.apply_ocr(**manager_args)
2006
- if not isinstance(results, list):
2007
- logger.error(
2008
- f"OCRManager returned unexpected type for single region image: {type(results)}"
2009
- )
2010
- return self
2011
- logger.debug(f"Region OCR processing returned {len(results)} results.")
2012
- except Exception as e:
2013
- logger.error(f"Error during OCRManager processing for region: {e}", exc_info=True)
2069
+ results = ocr_mgr.apply_ocr(**manager_args)
2070
+ if not isinstance(results, list):
2071
+ logger.error(
2072
+ f"OCRManager returned unexpected type for single region image: {type(results)}"
2073
+ )
2014
2074
  return self
2075
+ logger.debug(f"Region OCR processing returned {len(results)} results.")
2015
2076
 
2016
2077
  # Convert results to TextElements
2017
2078
  scale_x = self.width / region_image.width if region_image.width > 0 else 1.0
@@ -2802,11 +2863,11 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
2802
2863
  self,
2803
2864
  text_content: Optional[Union[str, Callable[["Region"], Optional[str]]]] = None,
2804
2865
  source_label: str = "derived_from_region",
2805
- object_type: str = "word", # Or "char", controls how it's categorized
2866
+ object_type: str = "word", # Or "char", controls how it's categorized
2806
2867
  default_font_size: float = 10.0,
2807
2868
  default_font_name: str = "RegionContent",
2808
- confidence: Optional[float] = None, # Allow overriding confidence
2809
- add_to_page: bool = False # NEW: Option to add to page
2869
+ confidence: Optional[float] = None, # Allow overriding confidence
2870
+ add_to_page: bool = False, # NEW: Option to add to page
2810
2871
  ) -> "TextElement":
2811
2872
  """
2812
2873
  Creates a new TextElement object based on this region's geometry.
@@ -2833,7 +2894,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
2833
2894
 
2834
2895
  Returns:
2835
2896
  A new TextElement instance.
2836
-
2897
+
2837
2898
  Raises:
2838
2899
  ValueError: If the region does not have a valid 'page' attribute.
2839
2900
  """
@@ -2844,14 +2905,17 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
2844
2905
  try:
2845
2906
  actual_text = text_content(self)
2846
2907
  except Exception as e:
2847
- logger.error(f"Error executing text_content callback for region {self.bbox}: {e}", exc_info=True)
2848
- actual_text = None # Ensure actual_text is None on error
2908
+ logger.error(
2909
+ f"Error executing text_content callback for region {self.bbox}: {e}",
2910
+ exc_info=True,
2911
+ )
2912
+ actual_text = None # Ensure actual_text is None on error
2849
2913
 
2850
2914
  final_confidence = confidence
2851
2915
  if final_confidence is None:
2852
2916
  final_confidence = 1.0 if actual_text is not None and actual_text.strip() else 0.0
2853
2917
 
2854
- if not hasattr(self, 'page') or self.page is None:
2918
+ if not hasattr(self, "page") or self.page is None:
2855
2919
  raise ValueError("Region must have a valid 'page' attribute to create a TextElement.")
2856
2920
 
2857
2921
  elem_data = {
@@ -2864,8 +2928,8 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
2864
2928
  "height": self.height,
2865
2929
  "object_type": object_type,
2866
2930
  "page_number": self.page.page_number,
2867
- "stroking_color": getattr(self, 'stroking_color', (0,0,0)),
2868
- "non_stroking_color": getattr(self, 'non_stroking_color', (0,0,0)),
2931
+ "stroking_color": getattr(self, "stroking_color", (0, 0, 0)),
2932
+ "non_stroking_color": getattr(self, "non_stroking_color", (0, 0, 0)),
2869
2933
  "fontname": default_font_name,
2870
2934
  "size": default_font_size,
2871
2935
  "upright": True,
@@ -2873,18 +2937,28 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
2873
2937
  "adv": self.width,
2874
2938
  "source": source_label,
2875
2939
  "confidence": final_confidence,
2876
- "_char_dicts": []
2940
+ "_char_dicts": [],
2877
2941
  }
2878
2942
  text_element = TextElement(elem_data, self.page)
2879
2943
 
2880
2944
  if add_to_page:
2881
- if hasattr(self.page, '_element_mgr') and self.page._element_mgr is not None:
2882
- add_as_type = "words" if object_type == "word" else "chars" if object_type == "char" else object_type
2945
+ if hasattr(self.page, "_element_mgr") and self.page._element_mgr is not None:
2946
+ add_as_type = (
2947
+ "words"
2948
+ if object_type == "word"
2949
+ else "chars" if object_type == "char" else object_type
2950
+ )
2883
2951
  # REMOVED try-except block around add_element
2884
2952
  self.page._element_mgr.add_element(text_element, element_type=add_as_type)
2885
- logger.debug(f"TextElement created from region {self.bbox} and added to page {self.page.page_number} as {add_as_type}.")
2953
+ logger.debug(
2954
+ f"TextElement created from region {self.bbox} and added to page {self.page.page_number} as {add_as_type}."
2955
+ )
2886
2956
  else:
2887
- page_num_str = str(self.page.page_number) if hasattr(self.page, 'page_number') else 'N/A'
2888
- logger.warning(f"Cannot add TextElement to page: Page {page_num_str} for region {self.bbox} is missing '_element_mgr'.")
2889
-
2957
+ page_num_str = (
2958
+ str(self.page.page_number) if hasattr(self.page, "page_number") else "N/A"
2959
+ )
2960
+ logger.warning(
2961
+ f"Cannot add TextElement to page: Page {page_num_str} for region {self.bbox} is missing '_element_mgr'."
2962
+ )
2963
+
2890
2964
  return text_element