natural-pdf: natural_pdf-0.1.14-py3-none-any.whl → natural_pdf-0.1.16-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39):
  1. natural_pdf/__init__.py +31 -0
  2. natural_pdf/analyzers/layout/gemini.py +137 -162
  3. natural_pdf/analyzers/layout/layout_manager.py +9 -5
  4. natural_pdf/analyzers/layout/layout_options.py +77 -7
  5. natural_pdf/analyzers/layout/paddle.py +318 -165
  6. natural_pdf/analyzers/layout/table_structure_utils.py +78 -0
  7. natural_pdf/analyzers/shape_detection_mixin.py +770 -405
  8. natural_pdf/classification/mixin.py +2 -8
  9. natural_pdf/collections/pdf_collection.py +25 -30
  10. natural_pdf/core/highlighting_service.py +47 -32
  11. natural_pdf/core/page.py +226 -70
  12. natural_pdf/core/pdf.py +19 -22
  13. natural_pdf/elements/base.py +9 -9
  14. natural_pdf/elements/collections.py +105 -50
  15. natural_pdf/elements/region.py +320 -113
  16. natural_pdf/exporters/paddleocr.py +38 -13
  17. natural_pdf/flows/__init__.py +3 -3
  18. natural_pdf/flows/collections.py +303 -132
  19. natural_pdf/flows/element.py +277 -132
  20. natural_pdf/flows/flow.py +33 -16
  21. natural_pdf/flows/region.py +142 -79
  22. natural_pdf/ocr/engine_doctr.py +37 -4
  23. natural_pdf/ocr/engine_easyocr.py +23 -3
  24. natural_pdf/ocr/engine_paddle.py +281 -30
  25. natural_pdf/ocr/engine_surya.py +8 -3
  26. natural_pdf/ocr/ocr_manager.py +75 -76
  27. natural_pdf/ocr/ocr_options.py +52 -87
  28. natural_pdf/search/__init__.py +25 -12
  29. natural_pdf/search/lancedb_search_service.py +91 -54
  30. natural_pdf/search/numpy_search_service.py +86 -65
  31. natural_pdf/search/searchable_mixin.py +2 -2
  32. natural_pdf/selectors/parser.py +125 -81
  33. natural_pdf/widgets/__init__.py +1 -1
  34. natural_pdf/widgets/viewer.py +205 -449
  35. {natural_pdf-0.1.14.dist-info → natural_pdf-0.1.16.dist-info}/METADATA +27 -45
  36. {natural_pdf-0.1.14.dist-info → natural_pdf-0.1.16.dist-info}/RECORD +39 -38
  37. {natural_pdf-0.1.14.dist-info → natural_pdf-0.1.16.dist-info}/WHEEL +0 -0
  38. {natural_pdf-0.1.14.dist-info → natural_pdf-0.1.16.dist-info}/licenses/LICENSE +0 -0
  39. {natural_pdf-0.1.14.dist-info → natural_pdf-0.1.16.dist-info}/top_level.txt +0 -0
File: natural_pdf/elements/region.py (inferred from the `class Region` hunk context below)
@@ -5,15 +5,19 @@ from pdfplumber.utils.geometry import get_bbox_overlap, merge_bboxes, objects_to
5
5
 
6
6
  # New Imports
7
7
  from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
8
+ from tqdm.auto import tqdm
8
9
 
9
10
  from natural_pdf.analyzers.layout.pdfplumber_table_finder import find_text_based_tables
11
+
12
+ # --- Shape Detection Mixin --- #
13
+ from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
10
14
  from natural_pdf.classification.manager import ClassificationManager # Keep for type hint
11
15
 
12
16
  # --- Classification Imports --- #
13
17
  from natural_pdf.classification.mixin import ClassificationMixin
14
18
  from natural_pdf.elements.base import DirectionalMixin
19
+ from natural_pdf.elements.text import TextElement # ADDED IMPORT
15
20
  from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
16
- from natural_pdf.elements.text import TextElement # ADDED IMPORT
17
21
  from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements # Import utility
18
22
  from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
19
23
  from natural_pdf.utils.locks import pdf_render_lock # Import the lock
@@ -21,21 +25,19 @@ from natural_pdf.utils.locks import pdf_render_lock # Import the lock
21
25
  # Import new utils
22
26
  from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
23
27
 
24
- from tqdm.auto import tqdm
25
28
  # --- End Classification Imports --- #
26
29
 
27
- # --- Shape Detection Mixin --- #
28
- from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
29
- # --- End Shape Detection Mixin --- #
30
+
31
+ # --- End Shape Detection Mixin --- #
30
32
 
31
33
  if TYPE_CHECKING:
32
34
  # --- NEW: Add Image type hint for classification --- #
33
35
  from PIL.Image import Image
34
36
 
35
37
  from natural_pdf.core.page import Page
38
+ from natural_pdf.elements.base import Element # Added for type hint
36
39
  from natural_pdf.elements.collections import ElementCollection
37
40
  from natural_pdf.elements.text import TextElement
38
- from natural_pdf.elements.base import Element # Added for type hint
39
41
 
40
42
  # Import OCRManager conditionally to avoid circular imports
41
43
  try:
@@ -726,25 +728,32 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
726
728
  # Handle the case where user wants the cropped region to have a specific width
727
729
  page_kwargs = kwargs.copy()
728
730
  effective_resolution = resolution # Start with the provided resolution
729
-
730
- if crop_only and 'width' in kwargs:
731
- target_width = kwargs['width']
731
+
732
+ if crop_only and "width" in kwargs:
733
+ target_width = kwargs["width"]
732
734
  # Calculate what resolution is needed to make the region crop have target_width
733
735
  region_width_points = self.width # Region width in PDF points
734
-
736
+
735
737
  if region_width_points > 0:
736
738
  # Calculate scale needed: target_width / region_width_points
737
739
  required_scale = target_width / region_width_points
738
740
  # Convert scale to resolution: scale * 72 DPI
739
741
  effective_resolution = required_scale * 72.0
740
- page_kwargs.pop('width') # Remove width parameter to avoid conflicts
741
- logger.debug(f"Region {self.bbox}: Calculated required resolution {effective_resolution:.1f} DPI for region crop width {target_width}")
742
+ page_kwargs.pop("width") # Remove width parameter to avoid conflicts
743
+ logger.debug(
744
+ f"Region {self.bbox}: Calculated required resolution {effective_resolution:.1f} DPI for region crop width {target_width}"
745
+ )
742
746
  else:
743
- logger.warning(f"Region {self.bbox}: Invalid region width {region_width_points}, using original resolution")
747
+ logger.warning(
748
+ f"Region {self.bbox}: Invalid region width {region_width_points}, using original resolution"
749
+ )
744
750
 
745
751
  # First get the full page image with highlights if requested
746
752
  page_image = self._page.to_image(
747
- scale=scale, resolution=effective_resolution, include_highlights=include_highlights, **page_kwargs
753
+ scale=scale,
754
+ resolution=effective_resolution,
755
+ include_highlights=include_highlights,
756
+ **page_kwargs,
748
757
  )
749
758
 
750
759
  # Calculate the actual scale factor used by the page image
@@ -899,13 +908,19 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
899
908
  image.save(filename)
900
909
  return self
901
910
 
902
- def trim(self, padding: int = 1, threshold: float = 0.95, resolution: float = 150, pre_shrink: float = 0.5) -> "Region":
911
+ def trim(
912
+ self,
913
+ padding: int = 1,
914
+ threshold: float = 0.95,
915
+ resolution: float = 150,
916
+ pre_shrink: float = 0.5,
917
+ ) -> "Region":
903
918
  """
904
919
  Trim visual whitespace from the edges of this region.
905
-
920
+
906
921
  Similar to Python's string .strip() method, but for visual whitespace in the region image.
907
922
  Uses pixel analysis to detect rows/columns that are predominantly whitespace.
908
-
923
+
909
924
  Args:
910
925
  padding: Number of pixels to keep as padding after trimming (default: 1)
911
926
  threshold: Threshold for considering a row/column as whitespace (0.0-1.0, default: 0.95)
@@ -914,104 +929,126 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
914
929
  resolution: Resolution for image rendering in DPI (default: 150)
915
930
  pre_shrink: Amount to shrink region before trimming, then expand back after (default: 0.5)
916
931
  This helps avoid detecting box borders/slivers as content.
917
-
932
+
918
933
  Returns:
919
934
  New Region with visual whitespace trimmed from all edges
920
-
935
+
921
936
  Example:
922
937
  # Basic trimming with 1 pixel padding and 0.5px pre-shrink
923
938
  trimmed = region.trim()
924
-
939
+
925
940
  # More aggressive trimming with no padding and no pre-shrink
926
941
  tight = region.trim(padding=0, threshold=0.9, pre_shrink=0)
927
-
942
+
928
943
  # Conservative trimming with more padding
929
944
  loose = region.trim(padding=3, threshold=0.98)
930
945
  """
931
946
  # Pre-shrink the region to avoid box slivers
932
- work_region = self.expand(left=-pre_shrink, right=-pre_shrink, top=-pre_shrink, bottom=-pre_shrink) if pre_shrink > 0 else self
933
-
947
+ work_region = (
948
+ self.expand(left=-pre_shrink, right=-pre_shrink, top=-pre_shrink, bottom=-pre_shrink)
949
+ if pre_shrink > 0
950
+ else self
951
+ )
952
+
934
953
  # Get the region image
935
- image = work_region.to_image(resolution=resolution, crop_only=True, include_highlights=False)
936
-
954
+ image = work_region.to_image(
955
+ resolution=resolution, crop_only=True, include_highlights=False
956
+ )
957
+
937
958
  if image is None:
938
- logger.warning(f"Region {self.bbox}: Could not generate image for trimming. Returning original region.")
959
+ logger.warning(
960
+ f"Region {self.bbox}: Could not generate image for trimming. Returning original region."
961
+ )
939
962
  return self
940
-
963
+
941
964
  # Convert to grayscale for easier analysis
942
965
  import numpy as np
943
-
966
+
944
967
  # Convert PIL image to numpy array
945
- img_array = np.array(image.convert('L')) # Convert to grayscale
968
+ img_array = np.array(image.convert("L")) # Convert to grayscale
946
969
  height, width = img_array.shape
947
-
970
+
948
971
  if height == 0 or width == 0:
949
- logger.warning(f"Region {self.bbox}: Image has zero dimensions. Returning original region.")
972
+ logger.warning(
973
+ f"Region {self.bbox}: Image has zero dimensions. Returning original region."
974
+ )
950
975
  return self
951
-
976
+
952
977
  # Normalize pixel values to 0-1 range (255 = white = 1.0, 0 = black = 0.0)
953
978
  normalized = img_array.astype(np.float32) / 255.0
954
-
979
+
955
980
  # Find content boundaries by analyzing row and column averages
956
-
981
+
957
982
  # Analyze rows (horizontal strips) to find top and bottom boundaries
958
983
  row_averages = np.mean(normalized, axis=1) # Average each row
959
984
  content_rows = row_averages < threshold # True where there's content (not whitespace)
960
-
985
+
961
986
  # Find first and last rows with content
962
987
  content_row_indices = np.where(content_rows)[0]
963
988
  if len(content_row_indices) == 0:
964
989
  # No content found, return a minimal region at the center
965
- logger.warning(f"Region {self.bbox}: No content detected during trimming. Returning center point.")
990
+ logger.warning(
991
+ f"Region {self.bbox}: No content detected during trimming. Returning center point."
992
+ )
966
993
  center_x = (self.x0 + self.x1) / 2
967
994
  center_y = (self.top + self.bottom) / 2
968
995
  return Region(self.page, (center_x, center_y, center_x, center_y))
969
-
996
+
970
997
  top_content_row = max(0, content_row_indices[0] - padding)
971
998
  bottom_content_row = min(height - 1, content_row_indices[-1] + padding)
972
-
973
- # Analyze columns (vertical strips) to find left and right boundaries
999
+
1000
+ # Analyze columns (vertical strips) to find left and right boundaries
974
1001
  col_averages = np.mean(normalized, axis=0) # Average each column
975
1002
  content_cols = col_averages < threshold # True where there's content
976
-
1003
+
977
1004
  content_col_indices = np.where(content_cols)[0]
978
1005
  if len(content_col_indices) == 0:
979
1006
  # No content found in columns either
980
- logger.warning(f"Region {self.bbox}: No column content detected during trimming. Returning center point.")
1007
+ logger.warning(
1008
+ f"Region {self.bbox}: No column content detected during trimming. Returning center point."
1009
+ )
981
1010
  center_x = (self.x0 + self.x1) / 2
982
1011
  center_y = (self.top + self.bottom) / 2
983
1012
  return Region(self.page, (center_x, center_y, center_x, center_y))
984
-
1013
+
985
1014
  left_content_col = max(0, content_col_indices[0] - padding)
986
1015
  right_content_col = min(width - 1, content_col_indices[-1] + padding)
987
-
1016
+
988
1017
  # Convert trimmed pixel coordinates back to PDF coordinates
989
1018
  scale_factor = resolution / 72.0 # Scale factor used in to_image()
990
-
1019
+
991
1020
  # Calculate new PDF coordinates and ensure they are Python floats
992
1021
  trimmed_x0 = float(work_region.x0 + (left_content_col / scale_factor))
993
1022
  trimmed_top = float(work_region.top + (top_content_row / scale_factor))
994
- trimmed_x1 = float(work_region.x0 + ((right_content_col + 1) / scale_factor)) # +1 because we want inclusive right edge
995
- trimmed_bottom = float(work_region.top + ((bottom_content_row + 1) / scale_factor)) # +1 because we want inclusive bottom edge
996
-
1023
+ trimmed_x1 = float(
1024
+ work_region.x0 + ((right_content_col + 1) / scale_factor)
1025
+ ) # +1 because we want inclusive right edge
1026
+ trimmed_bottom = float(
1027
+ work_region.top + ((bottom_content_row + 1) / scale_factor)
1028
+ ) # +1 because we want inclusive bottom edge
1029
+
997
1030
  # Ensure the trimmed region doesn't exceed the work region boundaries
998
1031
  final_x0 = max(work_region.x0, trimmed_x0)
999
1032
  final_top = max(work_region.top, trimmed_top)
1000
1033
  final_x1 = min(work_region.x1, trimmed_x1)
1001
1034
  final_bottom = min(work_region.bottom, trimmed_bottom)
1002
-
1035
+
1003
1036
  # Ensure valid coordinates (width > 0, height > 0)
1004
1037
  if final_x1 <= final_x0 or final_bottom <= final_top:
1005
- logger.warning(f"Region {self.bbox}: Trimming resulted in invalid dimensions. Returning original region.")
1038
+ logger.warning(
1039
+ f"Region {self.bbox}: Trimming resulted in invalid dimensions. Returning original region."
1040
+ )
1006
1041
  return self
1007
-
1042
+
1008
1043
  # Create the trimmed region
1009
1044
  trimmed_region = Region(self.page, (final_x0, final_top, final_x1, final_bottom))
1010
-
1045
+
1011
1046
  # Expand back by the pre_shrink amount to restore original positioning
1012
1047
  if pre_shrink > 0:
1013
- trimmed_region = trimmed_region.expand(left=pre_shrink, right=pre_shrink, top=pre_shrink, bottom=pre_shrink)
1014
-
1048
+ trimmed_region = trimmed_region.expand(
1049
+ left=pre_shrink, right=pre_shrink, top=pre_shrink, bottom=pre_shrink
1050
+ )
1051
+
1015
1052
  # Copy relevant metadata
1016
1053
  trimmed_region.region_type = self.region_type
1017
1054
  trimmed_region.normalized_type = self.normalized_type
@@ -1021,8 +1058,10 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1021
1058
  trimmed_region.label = self.label
1022
1059
  trimmed_region.source = "trimmed" # Indicate this is a derived region
1023
1060
  trimmed_region.parent_region = self
1024
-
1025
- logger.debug(f"Region {self.bbox}: Trimmed to {trimmed_region.bbox} (padding={padding}, threshold={threshold}, pre_shrink={pre_shrink})")
1061
+
1062
+ logger.debug(
1063
+ f"Region {self.bbox}: Trimmed to {trimmed_region.bbox} (padding={padding}, threshold={threshold}, pre_shrink={pre_shrink})"
1064
+ )
1026
1065
  return trimmed_region
1027
1066
 
1028
1067
  def clip(
@@ -1035,42 +1074,42 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1035
1074
  ) -> "Region":
1036
1075
  """
1037
1076
  Clip this region to specific bounds, either from another object with bbox or explicit coordinates.
1038
-
1077
+
1039
1078
  The clipped region will be constrained to not exceed the specified boundaries.
1040
1079
  You can provide either an object with bounding box properties, specific coordinates, or both.
1041
1080
  When both are provided, explicit coordinates take precedence.
1042
-
1081
+
1043
1082
  Args:
1044
1083
  obj: Optional object with bbox properties (Region, Element, TextElement, etc.)
1045
1084
  left: Optional left boundary (x0) to clip to
1046
- top: Optional top boundary to clip to
1085
+ top: Optional top boundary to clip to
1047
1086
  right: Optional right boundary (x1) to clip to
1048
1087
  bottom: Optional bottom boundary to clip to
1049
-
1088
+
1050
1089
  Returns:
1051
1090
  New Region with bounds clipped to the specified constraints
1052
-
1091
+
1053
1092
  Examples:
1054
1093
  # Clip to another region's bounds
1055
1094
  clipped = region.clip(container_region)
1056
-
1095
+
1057
1096
  # Clip to any element's bounds
1058
1097
  clipped = region.clip(text_element)
1059
-
1098
+
1060
1099
  # Clip to specific coordinates
1061
1100
  clipped = region.clip(left=100, right=400)
1062
-
1101
+
1063
1102
  # Mix object bounds with specific overrides
1064
1103
  clipped = region.clip(obj=container, bottom=page.height/2)
1065
1104
  """
1066
1105
  from natural_pdf.elements.base import extract_bbox
1067
-
1106
+
1068
1107
  # Start with current region bounds
1069
1108
  clip_x0 = self.x0
1070
1109
  clip_top = self.top
1071
1110
  clip_x1 = self.x1
1072
1111
  clip_bottom = self.bottom
1073
-
1112
+
1074
1113
  # Apply object constraints if provided
1075
1114
  if obj is not None:
1076
1115
  obj_bbox = extract_bbox(obj)
@@ -1086,7 +1125,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1086
1125
  f"Region {self.bbox}: Cannot extract bbox from clipping object {type(obj)}. "
1087
1126
  "Object must have bbox property or x0/top/x1/bottom attributes."
1088
1127
  )
1089
-
1128
+
1090
1129
  # Apply explicit coordinate constraints (these take precedence)
1091
1130
  if left is not None:
1092
1131
  clip_x0 = max(clip_x0, left)
@@ -1096,7 +1135,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1096
1135
  clip_x1 = min(clip_x1, right)
1097
1136
  if bottom is not None:
1098
1137
  clip_bottom = min(clip_bottom, bottom)
1099
-
1138
+
1100
1139
  # Ensure valid coordinates
1101
1140
  if clip_x1 <= clip_x0 or clip_bottom <= clip_top:
1102
1141
  logger.warning(
@@ -1105,10 +1144,10 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1105
1144
  )
1106
1145
  # Return a minimal region at the clip area's top-left
1107
1146
  return Region(self.page, (clip_x0, clip_top, clip_x0, clip_top))
1108
-
1147
+
1109
1148
  # Create the clipped region
1110
1149
  clipped_region = Region(self.page, (clip_x0, clip_top, clip_x1, clip_bottom))
1111
-
1150
+
1112
1151
  # Copy relevant metadata
1113
1152
  clipped_region.region_type = self.region_type
1114
1153
  clipped_region.normalized_type = self.normalized_type
@@ -1118,7 +1157,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1118
1157
  clipped_region.label = self.label
1119
1158
  clipped_region.source = "clipped" # Indicate this is a derived region
1120
1159
  clipped_region.parent_region = self
1121
-
1160
+
1122
1161
  logger.debug(
1123
1162
  f"Region {self.bbox}: Clipped to {clipped_region.bbox} "
1124
1163
  f"(constraints: obj={type(obj).__name__ if obj else None}, "
@@ -1247,8 +1286,12 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1247
1286
  Extract a table from this region.
1248
1287
 
1249
1288
  Args:
1250
- method: Method to use: 'tatr', 'plumber', 'text', or None (auto-detect).
1251
- table_settings: Settings for pdfplumber table extraction (used only with 'plumber' method).
1289
+ method: Method to use: 'tatr', 'pdfplumber', 'text', 'stream', 'lattice', or None (auto-detect).
1290
+ 'stream' is an alias for 'pdfplumber' with text-based strategies (equivalent to
1291
+ setting `vertical_strategy` and `horizontal_strategy` to 'text').
1292
+ 'lattice' is an alias for 'pdfplumber' with line-based strategies (equivalent to
1293
+ setting `vertical_strategy` and `horizontal_strategy` to 'lines').
1294
+ table_settings: Settings for pdfplumber table extraction (used with 'pdfplumber', 'stream', or 'lattice' methods).
1252
1295
  use_ocr: Whether to use OCR for text extraction (currently only applicable with 'tatr' method).
1253
1296
  ocr_config: OCR configuration parameters.
1254
1297
  text_options: Dictionary of options for the 'text' method, corresponding to arguments
@@ -1268,13 +1311,61 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1268
1311
  text_options = {} # Initialize empty dict
1269
1312
 
1270
1313
  # Auto-detect method if not specified
1271
- effective_method = method
1272
- if effective_method is None:
1314
+ if method is None:
1273
1315
  # If this is a TATR-detected region, use TATR method
1274
1316
  if hasattr(self, "model") and self.model == "tatr" and self.region_type == "table":
1275
1317
  effective_method = "tatr"
1276
1318
  else:
1277
- effective_method = "plumber"
1319
+ # Try lattice first, then fall back to stream if no meaningful results
1320
+ logger.debug(f"Region {self.bbox}: Auto-detecting table extraction method...")
1321
+
1322
+ try:
1323
+ logger.debug(f"Region {self.bbox}: Trying 'lattice' method first...")
1324
+ lattice_result = self.extract_table(
1325
+ "lattice", table_settings=table_settings.copy()
1326
+ )
1327
+
1328
+ # Check if lattice found meaningful content
1329
+ if (
1330
+ lattice_result
1331
+ and len(lattice_result) > 0
1332
+ and any(
1333
+ any(cell and cell.strip() for cell in row if cell)
1334
+ for row in lattice_result
1335
+ )
1336
+ ):
1337
+ logger.debug(
1338
+ f"Region {self.bbox}: 'lattice' method found table with {len(lattice_result)} rows"
1339
+ )
1340
+ return lattice_result
1341
+ else:
1342
+ logger.debug(
1343
+ f"Region {self.bbox}: 'lattice' method found no meaningful content"
1344
+ )
1345
+ except Exception as e:
1346
+ logger.debug(f"Region {self.bbox}: 'lattice' method failed: {e}")
1347
+
1348
+ # Fall back to stream
1349
+ logger.debug(f"Region {self.bbox}: Falling back to 'stream' method...")
1350
+ return self.extract_table("stream", table_settings=table_settings.copy())
1351
+ else:
1352
+ effective_method = method
1353
+
1354
+ # Handle method aliases for pdfplumber
1355
+ if effective_method == "stream":
1356
+ logger.debug("Using 'stream' method alias for 'pdfplumber' with text-based strategies.")
1357
+ effective_method = "pdfplumber"
1358
+ # Set default text strategies if not already provided by the user
1359
+ table_settings.setdefault("vertical_strategy", "text")
1360
+ table_settings.setdefault("horizontal_strategy", "text")
1361
+ elif effective_method == "lattice":
1362
+ logger.debug(
1363
+ "Using 'lattice' method alias for 'pdfplumber' with line-based strategies."
1364
+ )
1365
+ effective_method = "pdfplumber"
1366
+ # Set default line strategies if not already provided by the user
1367
+ table_settings.setdefault("vertical_strategy", "lines")
1368
+ table_settings.setdefault("horizontal_strategy", "lines")
1278
1369
 
1279
1370
  logger.debug(f"Region {self.bbox}: Extracting table using method '{effective_method}'")
1280
1371
 
@@ -1284,16 +1375,124 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1284
1375
  elif effective_method == "text":
1285
1376
  current_text_options = text_options.copy()
1286
1377
  current_text_options["cell_extraction_func"] = cell_extraction_func
1287
- # --- Pass show_progress to the helper --- #
1288
1378
  current_text_options["show_progress"] = show_progress
1289
1379
  return self._extract_table_text(**current_text_options)
1290
- elif effective_method == "plumber":
1380
+ elif effective_method == "pdfplumber":
1291
1381
  return self._extract_table_plumber(table_settings)
1292
1382
  else:
1293
1383
  raise ValueError(
1294
- f"Unknown table extraction method: '{effective_method}'. Choose from 'tatr', 'plumber', 'text'."
1384
+ f"Unknown table extraction method: '{method}'. Choose from 'tatr', 'pdfplumber', 'text', 'stream', 'lattice'."
1295
1385
  )
1296
1386
 
1387
+ def extract_tables(
1388
+ self,
1389
+ method: Optional[str] = None,
1390
+ table_settings: Optional[dict] = None,
1391
+ ) -> List[List[List[str]]]:
1392
+ """
1393
+ Extract all tables from this region using pdfplumber-based methods.
1394
+
1395
+ Note: Only 'pdfplumber', 'stream', and 'lattice' methods are supported for extract_tables.
1396
+ 'tatr' and 'text' methods are designed for single table extraction only.
1397
+
1398
+ Args:
1399
+ method: Method to use: 'pdfplumber', 'stream', 'lattice', or None (auto-detect).
1400
+ 'stream' uses text-based strategies, 'lattice' uses line-based strategies.
1401
+ table_settings: Settings for pdfplumber table extraction.
1402
+
1403
+ Returns:
1404
+ List of tables, where each table is a list of rows, and each row is a list of cell values.
1405
+ """
1406
+ if table_settings is None:
1407
+ table_settings = {}
1408
+
1409
+ # Auto-detect method if not specified (try lattice first, then stream)
1410
+ if method is None:
1411
+ logger.debug(f"Region {self.bbox}: Auto-detecting tables extraction method...")
1412
+
1413
+ # Try lattice first
1414
+ try:
1415
+ lattice_settings = table_settings.copy()
1416
+ lattice_settings.setdefault("vertical_strategy", "lines")
1417
+ lattice_settings.setdefault("horizontal_strategy", "lines")
1418
+
1419
+ logger.debug(f"Region {self.bbox}: Trying 'lattice' method first for tables...")
1420
+ lattice_result = self._extract_tables_plumber(lattice_settings)
1421
+
1422
+ # Check if lattice found meaningful tables
1423
+ if (
1424
+ lattice_result
1425
+ and len(lattice_result) > 0
1426
+ and any(
1427
+ any(
1428
+ any(cell and cell.strip() for cell in row if cell)
1429
+ for row in table
1430
+ if table
1431
+ )
1432
+ for table in lattice_result
1433
+ )
1434
+ ):
1435
+ logger.debug(
1436
+ f"Region {self.bbox}: 'lattice' method found {len(lattice_result)} tables"
1437
+ )
1438
+ return lattice_result
1439
+ else:
1440
+ logger.debug(f"Region {self.bbox}: 'lattice' method found no meaningful tables")
1441
+
1442
+ except Exception as e:
1443
+ logger.debug(f"Region {self.bbox}: 'lattice' method failed: {e}")
1444
+
1445
+ # Fall back to stream
1446
+ logger.debug(f"Region {self.bbox}: Falling back to 'stream' method for tables...")
1447
+ stream_settings = table_settings.copy()
1448
+ stream_settings.setdefault("vertical_strategy", "text")
1449
+ stream_settings.setdefault("horizontal_strategy", "text")
1450
+
1451
+ return self._extract_tables_plumber(stream_settings)
1452
+
1453
+ effective_method = method
1454
+
1455
+ # Handle method aliases
1456
+ if effective_method == "stream":
1457
+ logger.debug("Using 'stream' method alias for 'pdfplumber' with text-based strategies.")
1458
+ effective_method = "pdfplumber"
1459
+ table_settings.setdefault("vertical_strategy", "text")
1460
+ table_settings.setdefault("horizontal_strategy", "text")
1461
+ elif effective_method == "lattice":
1462
+ logger.debug(
1463
+ "Using 'lattice' method alias for 'pdfplumber' with line-based strategies."
1464
+ )
1465
+ effective_method = "pdfplumber"
1466
+ table_settings.setdefault("vertical_strategy", "lines")
1467
+ table_settings.setdefault("horizontal_strategy", "lines")
1468
+
1469
+ # Use the selected method
1470
+ if effective_method == "pdfplumber":
1471
+ return self._extract_tables_plumber(table_settings)
1472
+ else:
1473
+ raise ValueError(
1474
+ f"Unknown tables extraction method: '{method}'. Choose from 'pdfplumber', 'stream', 'lattice'."
1475
+ )
1476
+
1477
+ def _extract_tables_plumber(self, table_settings: dict) -> List[List[List[str]]]:
1478
+ """
1479
+ Extract all tables using pdfplumber's table extraction.
1480
+
1481
+ Args:
1482
+ table_settings: Settings for pdfplumber table extraction
1483
+
1484
+ Returns:
1485
+ List of tables, where each table is a list of rows, and each row is a list of cell values
1486
+ """
1487
+ # Create a crop of the page for this region
1488
+ cropped = self.page._page.crop(self.bbox)
1489
+
1490
+ # Extract all tables from the cropped area
1491
+ tables = cropped.extract_tables(table_settings)
1492
+
1493
+ # Return the tables or an empty list if none found
1494
+ return tables if tables else []
1495
+
1297
1496
  def _extract_table_plumber(self, table_settings: dict) -> List[List[str]]:
1298
1497
  """
1299
1498
  Extract table using pdfplumber's table extraction.
@@ -1711,7 +1910,9 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1711
1910
 
1712
1911
  # Validate contains parameter
1713
1912
  if contains not in ["all", "any", "center"]:
1714
- raise ValueError(f"Invalid contains value: {contains}. Must be 'all', 'any', or 'center'")
1913
+ raise ValueError(
1914
+ f"Invalid contains value: {contains}. Must be 'all', 'any', or 'center'"
1915
+ )
1715
1916
 
1716
1917
  # Construct selector if 'text' is provided
1717
1918
  effective_selector = ""
@@ -1761,24 +1962,21 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1761
1962
  # Filter these elements based on the specified containment method
1762
1963
  region_bbox = self.bbox
1763
1964
  matching_elements = []
1764
-
1965
+
1765
1966
  if contains == "all": # Fully inside (strict)
1766
1967
  matching_elements = [
1767
- el for el in potential_elements
1968
+ el
1969
+ for el in potential_elements
1768
1970
  if el.x0 >= region_bbox[0]
1769
1971
  and el.top >= region_bbox[1]
1770
1972
  and el.x1 <= region_bbox[2]
1771
1973
  and el.bottom <= region_bbox[3]
1772
1974
  ]
1773
1975
  elif contains == "any": # Any overlap
1774
- matching_elements = [
1775
- el for el in potential_elements
1776
- if self.intersects(el)
1777
- ]
1976
+ matching_elements = [el for el in potential_elements if self.intersects(el)]
1778
1977
  elif contains == "center": # Center point inside
1779
1978
  matching_elements = [
1780
- el for el in potential_elements
1781
- if self.is_element_center_inside(el)
1979
+ el for el in potential_elements if self.is_element_center_inside(el)
1782
1980
  ]
1783
1981
 
1784
1982
  return ElementCollection(matching_elements)
@@ -1868,17 +2066,13 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1868
2066
  manager_args = {k: v for k, v in manager_args.items() if v is not None}
1869
2067
 
1870
2068
  # Run OCR on this region's image using the manager
1871
- try:
1872
- results = ocr_mgr.apply_ocr(**manager_args)
1873
- if not isinstance(results, list):
1874
- logger.error(
1875
- f"OCRManager returned unexpected type for single region image: {type(results)}"
1876
- )
1877
- return self
1878
- logger.debug(f"Region OCR processing returned {len(results)} results.")
1879
- except Exception as e:
1880
- logger.error(f"Error during OCRManager processing for region: {e}", exc_info=True)
2069
+ results = ocr_mgr.apply_ocr(**manager_args)
2070
+ if not isinstance(results, list):
2071
+ logger.error(
2072
+ f"OCRManager returned unexpected type for single region image: {type(results)}"
2073
+ )
1881
2074
  return self
2075
+ logger.debug(f"Region OCR processing returned {len(results)} results.")
1882
2076
 
1883
2077
  # Convert results to TextElements
1884
2078
  scale_x = self.width / region_image.width if region_image.width > 0 else 1.0
@@ -2669,11 +2863,11 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
2669
2863
  self,
2670
2864
  text_content: Optional[Union[str, Callable[["Region"], Optional[str]]]] = None,
2671
2865
  source_label: str = "derived_from_region",
2672
- object_type: str = "word", # Or "char", controls how it's categorized
2866
+ object_type: str = "word", # Or "char", controls how it's categorized
2673
2867
  default_font_size: float = 10.0,
2674
2868
  default_font_name: str = "RegionContent",
2675
- confidence: Optional[float] = None, # Allow overriding confidence
2676
- add_to_page: bool = False # NEW: Option to add to page
2869
+ confidence: Optional[float] = None, # Allow overriding confidence
2870
+ add_to_page: bool = False, # NEW: Option to add to page
2677
2871
  ) -> "TextElement":
2678
2872
  """
2679
2873
  Creates a new TextElement object based on this region's geometry.
@@ -2700,7 +2894,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
2700
2894
 
2701
2895
  Returns:
2702
2896
  A new TextElement instance.
2703
-
2897
+
2704
2898
  Raises:
2705
2899
  ValueError: If the region does not have a valid 'page' attribute.
2706
2900
  """
@@ -2711,14 +2905,17 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
2711
2905
  try:
2712
2906
  actual_text = text_content(self)
2713
2907
  except Exception as e:
2714
- logger.error(f"Error executing text_content callback for region {self.bbox}: {e}", exc_info=True)
2715
- actual_text = None # Ensure actual_text is None on error
2908
+ logger.error(
2909
+ f"Error executing text_content callback for region {self.bbox}: {e}",
2910
+ exc_info=True,
2911
+ )
2912
+ actual_text = None # Ensure actual_text is None on error
2716
2913
 
2717
2914
  final_confidence = confidence
2718
2915
  if final_confidence is None:
2719
2916
  final_confidence = 1.0 if actual_text is not None and actual_text.strip() else 0.0
2720
2917
 
2721
- if not hasattr(self, 'page') or self.page is None:
2918
+ if not hasattr(self, "page") or self.page is None:
2722
2919
  raise ValueError("Region must have a valid 'page' attribute to create a TextElement.")
2723
2920
 
2724
2921
  elem_data = {
@@ -2731,8 +2928,8 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
2731
2928
  "height": self.height,
2732
2929
  "object_type": object_type,
2733
2930
  "page_number": self.page.page_number,
2734
- "stroking_color": getattr(self, 'stroking_color', (0,0,0)),
2735
- "non_stroking_color": getattr(self, 'non_stroking_color', (0,0,0)),
2931
+ "stroking_color": getattr(self, "stroking_color", (0, 0, 0)),
2932
+ "non_stroking_color": getattr(self, "non_stroking_color", (0, 0, 0)),
2736
2933
  "fontname": default_font_name,
2737
2934
  "size": default_font_size,
2738
2935
  "upright": True,
@@ -2740,18 +2937,28 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
2740
2937
  "adv": self.width,
2741
2938
  "source": source_label,
2742
2939
  "confidence": final_confidence,
2743
- "_char_dicts": []
2940
+ "_char_dicts": [],
2744
2941
  }
2745
2942
  text_element = TextElement(elem_data, self.page)
2746
2943
 
2747
2944
  if add_to_page:
2748
- if hasattr(self.page, '_element_mgr') and self.page._element_mgr is not None:
2749
- add_as_type = "words" if object_type == "word" else "chars" if object_type == "char" else object_type
2945
+ if hasattr(self.page, "_element_mgr") and self.page._element_mgr is not None:
2946
+ add_as_type = (
2947
+ "words"
2948
+ if object_type == "word"
2949
+ else "chars" if object_type == "char" else object_type
2950
+ )
2750
2951
  # REMOVED try-except block around add_element
2751
2952
  self.page._element_mgr.add_element(text_element, element_type=add_as_type)
2752
- logger.debug(f"TextElement created from region {self.bbox} and added to page {self.page.page_number} as {add_as_type}.")
2953
+ logger.debug(
2954
+ f"TextElement created from region {self.bbox} and added to page {self.page.page_number} as {add_as_type}."
2955
+ )
2753
2956
  else:
2754
- page_num_str = str(self.page.page_number) if hasattr(self.page, 'page_number') else 'N/A'
2755
- logger.warning(f"Cannot add TextElement to page: Page {page_num_str} for region {self.bbox} is missing '_element_mgr'.")
2756
-
2957
+ page_num_str = (
2958
+ str(self.page.page_number) if hasattr(self.page, "page_number") else "N/A"
2959
+ )
2960
+ logger.warning(
2961
+ f"Cannot add TextElement to page: Page {page_num_str} for region {self.bbox} is missing '_element_mgr'."
2962
+ )
2963
+
2757
2964
  return text_element