natural-pdf 0.2.16__py3-none-any.whl → 0.2.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
natural_pdf/__init__.py CHANGED
@@ -62,14 +62,59 @@ class Options:
62
62
  # Text extraction defaults (empty for now)
63
63
  self.text = ConfigSection()
64
64
 
65
+ # Layout and navigation defaults
66
+ self.layout = ConfigSection(
67
+ directional_offset=0.01, # Offset in points when using directional methods
68
+ auto_multipage=False, # Whether directional methods span pages by default
69
+ )
70
+
65
71
 
66
72
  # Create global options instance
67
73
  options = Options()
68
74
 
69
75
 
76
def set_option(name: str, value):
    """
    Assign a new value to an existing global Natural PDF option.

    The dotted ``name`` is resolved one attribute at a time, starting from the
    module-level ``options`` object. Only attributes that already exist can be
    updated; nothing new is created.

    Args:
        name: Option name in dot notation (e.g., 'layout.auto_multipage')
        value: New value for the option

    Raises:
        KeyError: If an intermediate section or the final option name does
            not exist.

    Example:
        import natural_pdf as npdf
        npdf.set_option('layout.auto_multipage', True)
        npdf.set_option('ocr.engine', 'surya')
    """
    *section_names, option_name = name.split(".")
    missing = object()  # private sentinel so that a stored None is not mistaken for "absent"

    # Descend through every section named before the final component
    target = options
    for section_name in section_names:
        target = getattr(target, section_name, missing)
        if target is missing:
            raise KeyError(f"Unknown option section: {section_name}")

    # Only existing options may be overwritten
    if not hasattr(target, option_name):
        raise KeyError(f"Unknown option: {name}")
    setattr(target, option_name, value)
105
+
106
+
70
107
  # Version
71
108
  __version__ = "0.1.1"
72
109
 
110
+ # Apply pdfminer patches for known bugs
111
+ try:
112
+ from natural_pdf.utils.pdfminer_patches import apply_patches
113
+
114
+ apply_patches()
115
+ except Exception as e:
116
+ logger.warning(f"Failed to apply pdfminer patches: {e}")
117
+
73
118
  from natural_pdf.analyzers.guides import Guides
74
119
  from natural_pdf.core.page import Page
75
120
  from natural_pdf.core.page_collection import PageCollection
@@ -941,6 +941,337 @@ class GuidesList(UserList):
941
941
  self.data.clear()
942
942
  return self._parent
943
943
 
944
def from_headers(
    self,
    headers: Union["ElementCollection", List["Element"]],
    obj: Optional[Union["Page", "Region"]] = None,
    method: Literal["min_crossings", "seam_carving"] = "min_crossings",
    min_width: Optional[float] = None,
    max_width: Optional[float] = None,
    margin: float = 0.5,
    row_stabilization: bool = True,
    num_samples: int = 400,
    *,
    append: bool = False,
) -> "Guides":
    """Create vertical guides for columns based on headers and whitespace valleys.

    For every pair of horizontally adjacent headers, a separator is searched
    for in the band between the left header's right edge and the right
    header's left edge, using the text elements located below the headers.
    The bounds of the analysed object are added as the outermost guides.

    Args:
        headers: Column header elements (ElementCollection or list of Elements)
        obj: Page/Region to analyze (uses parent's context if None)
        method: Detection method:
            - 'min_crossings': Fast vector-based minimum intersection count
            - 'seam_carving': Dynamic programming for curved boundaries
        min_width: Minimum column width constraint, in the same units as the
            element coordinates
        max_width: Maximum column width constraint, in the same units as the
            element coordinates
        margin: Buffer space from header edges when searching for separators (default: 0.5)
        row_stabilization: Whether to use row-wise median for stability
        num_samples: Number of x-positions to test per gap (for min_crossings)
        append: Whether to append to existing guides

    Returns:
        Parent Guides object for chaining

    Raises:
        ValueError: If called on a non-vertical guide list, or if neither
            ``obj`` nor a parent context is available.

    Examples:
        # Create column guides from headers
        headers = page.find_all('text[size=16]')
        guides.vertical.from_headers(headers)

        # With width constraints
        guides.vertical.from_headers(headers, min_width=50, max_width=200)

        # Seam carving for complex layouts
        guides.vertical.from_headers(headers, method='seam_carving')
    """
    if self._axis != "vertical":
        raise ValueError("from_headers() only works for vertical guides (columns)")

    # Explicit None check: a valid but falsy object must not be silently
    # replaced by the parent's context.
    target_obj = obj if obj is not None else self._parent.context
    if target_obj is None:
        raise ValueError("No object provided and no context available")

    # Convert headers to list if ElementCollection
    if hasattr(headers, "elements"):
        header_elements = list(headers.elements)
    else:
        header_elements = list(headers)

    # Sort headers by x-position
    header_elements.sort(key=lambda h: h.x0 if hasattr(h, "x0") else 0)

    # Need at least 2 headers
    if len(header_elements) < 2:
        logger.warning("Need at least 2 headers for column detection")
        return self._parent

    # Get page bounds
    if hasattr(target_obj, "bbox"):
        page_bounds = target_obj.bbox
    elif hasattr(target_obj, "width") and hasattr(target_obj, "height"):
        # Create bbox from width/height
        page_bounds = (0, 0, target_obj.width, target_obj.height)
    else:
        page_bounds = None

    if not page_bounds:
        logger.warning("Could not determine page bounds")
        return self._parent

    # Get text below headers for occupancy analysis
    header_bottom = max(h.bottom for h in header_elements)
    all_text = target_obj.find_all("text")
    body_elements = [elem for elem in all_text if elem.top > header_bottom]

    # Extract bounding boxes as (x0, top, x1, bottom)
    bboxes = [(elem.x0, elem.top, elem.x1, elem.bottom) for elem in body_elements]

    # Find separators between each header pair
    separators = []
    logger.debug(f"Processing {len(header_elements)} headers for column detection")
    for h_left, h_right in zip(header_elements, header_elements[1:]):
        # Define search band
        left_edge = h_left.x1 if hasattr(h_left, "x1") else h_left.right
        right_edge = h_right.x0 if hasattr(h_right, "x0") else h_right.left
        gap = right_edge - left_edge

        # If gap is too small, place separator in the middle of the gap
        if gap <= 2 * margin:
            separators.append((left_edge + right_edge) / 2)
            continue

        # Normal case - search within the band
        x0 = left_edge + margin
        x1 = right_edge - margin

        # Band narrower than the minimum width: center the separator
        if min_width and (x1 - x0) < min_width:
            separators.append((x0 + x1) / 2)
            continue

        if method == "min_crossings":
            separator = self._find_min_crossing_separator(x0, x1, bboxes, num_samples)
        else:  # seam_carving
            separator = self._find_seam_carving_separator(
                x0, x1, target_obj, header_bottom, page_bounds[3], bboxes
            )

        # Apply width constraints only if they don't conflict with header positions
        if separators:
            if min_width and separator - separators[-1] < min_width:
                # Only enforce if it doesn't push into next header
                proposed = separators[-1] + min_width
                if proposed < right_edge:
                    separator = proposed
            if max_width and separator - separators[-1] > max_width:
                # Only enforce if it doesn't pull the separator back over the
                # left header (mirrors the min_width guard above)
                proposed = separators[-1] + max_width
                if proposed > left_edge:
                    separator = proposed

        separators.append(separator)

    # Ensure we have page boundaries
    if separators:
        if not any(abs(sep - page_bounds[0]) < 0.1 for sep in separators):
            separators.insert(0, page_bounds[0])
        if not any(abs(sep - page_bounds[2]) < 0.1 for sep in separators):
            separators.append(page_bounds[2])

    # Apply row stabilization if requested
    if row_stabilization and separators:
        separators = self._stabilize_with_rows(separators, target_obj, bboxes, header_bottom)

    # Update guides
    if append:
        self.extend(separators)
    else:
        self.data = separators

    return self._parent
1099
+
1100
def _find_min_crossing_separator(
    self,
    x0: float,
    x1: float,
    bboxes: List[Tuple[float, float, float, float]],
    num_samples: int,
) -> float:
    """Find x-coordinate with minimum text crossings in band.

    ``num_samples`` evenly spaced candidates between ``x0`` and ``x1``
    (inclusive) are scored by how many boxes they strictly cross
    (``box.x0 < x < box.x1``). Ties are broken in favour of the larger gap:

    - for a candidate that crosses boxes, the gap is the distance to the
      nearest left/right edge of the boxes it crosses;
    - for a candidate that crosses nothing, the gap is its clearance to the
      nearest left/right edge of any box, so the separator is placed as far
      from text as the band allows rather than at the first clear sample.

    Args:
        x0: Left end of the search band.
        x1: Right end of the search band.
        bboxes: Boxes as (x0, top, x1, bottom); only x0 and x1 are used.
        num_samples: Number of candidate positions to evaluate.

    Returns:
        The best candidate position. ``x0`` is returned when there are no
        candidates, and the first candidate wins when nothing distinguishes
        them (e.g. ``bboxes`` is empty).
    """
    best_x = x0
    min_crossings = float("inf")
    best_gap = 0

    for x in np.linspace(x0, x1, num_samples):
        crossed = [bbox for bbox in bboxes if bbox[0] < x < bbox[2]]
        crossings = len(crossed)

        if crossed:
            # Distance to the nearest edge among the boxes this line cuts
            min_gap = min(min(abs(x - bbox[0]), abs(x - bbox[2])) for bbox in crossed)
        elif bboxes:
            # Clear line: measure how far it is from the nearest text edge so
            # that ties between clear candidates can actually be broken
            min_gap = min(min(abs(x - bbox[0]), abs(x - bbox[2])) for bbox in bboxes)
        else:
            min_gap = float("inf")

        # Update best if fewer crossings or same crossings but larger gap
        if crossings < min_crossings or (crossings == min_crossings and min_gap > best_gap):
            min_crossings = crossings
            best_x = x
            best_gap = min_gap

    return best_x
1135
+
1136
def _find_seam_carving_separator(
    self,
    x0: float,
    x1: float,
    obj,
    header_y: float,
    page_bottom: float,
    bboxes: List[Tuple[float, float, float, float]],
) -> float:
    """Locate a separator in the band [x0, x1] with a minimum-cost vertical seam.

    The band below ``header_y`` is rasterised into a grid with one cell per
    coordinate unit. Cells covered by a box cost 100, and every column gets a
    small extra cost proportional to its distance from the middle column. A
    seam moving at most one column per row is found by dynamic programming,
    and the median column of that seam is returned as an absolute
    x-coordinate.

    Args:
        x0: Left end of the search band.
        x1: Right end of the search band.
        obj: Not used by this implementation.
        header_y: Top of the analysed area.
        page_bottom: Bottom of the analysed area.
        bboxes: Boxes as (x0, top, x1, bottom).

    Returns:
        ``x0`` plus the median seam column, or the band midpoint when the
        grid would have no rows or no columns.
    """
    n_cols = int(x1 - x0)
    n_rows = int(page_bottom - header_y)
    if n_cols <= 0 or n_rows <= 0:
        return (x0 + x1) / 2

    # Rasterise text occupancy (1 cell = 1 coordinate unit)
    cost = np.zeros((n_rows, n_cols))
    for bbox in bboxes:
        # bbox format is (x0, top, x1, bottom); skip boxes outside the band
        if not (bbox[2] > x0 and bbox[0] < x1 and bbox[3] > header_y):
            continue
        col_start = max(0, int(bbox[0] - x0))
        col_end = min(n_cols, int(bbox[2] - x0))
        row_start = max(0, int(bbox[1] - header_y))
        row_end = min(n_rows, int(bbox[3] - header_y))
        cost[row_start:row_end, col_start:col_end] = 100

    # Per-column penalty growing with the distance from the middle column
    cost += np.abs(np.arange(n_cols) - n_cols // 2) * 0.1

    # Forward pass: each cell extends the cheapest of its (up to) three upper neighbours
    dp = np.full_like(cost, np.inf)
    dp[0] = cost[0]
    for row in range(1, n_rows):
        above = dp[row - 1]
        cheapest = above.copy()
        cheapest[1:] = np.minimum(cheapest[1:], above[:-1])  # from upper-left
        cheapest[:-1] = np.minimum(cheapest[:-1], above[1:])  # from upper-right
        dp[row] = cost[row] + cheapest

    # Backward pass from the cheapest bottom cell; on ties prefer staying in
    # the same column, then the left neighbour, then the right neighbour
    column = np.argmin(dp[-1])
    seam = [column]
    for row in range(n_rows - 2, -1, -1):
        choice = column
        if column > 0 and dp[row, column - 1] < dp[row, choice]:
            choice = column - 1
        if column < n_cols - 1 and dp[row, column + 1] < dp[row, choice]:
            choice = column + 1
        column = choice
        seam.append(column)

    # Collapse the seam to a single x-position
    return x0 + np.median(seam)
1209
+
1210
def _stabilize_with_rows(
    self,
    separators: List[float],
    obj,
    bboxes: List[Tuple[float, float, float, float]],
    header_y: float,
) -> List[float]:
    """Re-estimate each separator as the median of per-row estimates.

    Vertical bands are taken between consecutive distinct top/bottom
    coordinates of the boxes that are more than 5 units apart. The boxes
    lying entirely inside a band form one row. For each separator, a
    minimum-crossing position is searched per row within 20 units of the
    separator, and the median over the rows replaces the separator when at
    least three rows contributed.

    Args:
        separators: Current separator x-positions.
        obj: Not used by this implementation.
        bboxes: Boxes as (x0, top, x1, bottom).
        header_y: Not used by this implementation.

    Returns:
        The input list itself when there are no boxes or no bands, otherwise
        a new list with one (possibly adjusted) value per separator.
    """
    if not bboxes:
        return separators

    # Candidate row bands: gaps between consecutive distinct y-coordinates
    y_coords = sorted({bbox[1] for bbox in bboxes} | {bbox[3] for bbox in bboxes})
    bands = [
        (upper, lower)
        for upper, lower in zip(y_coords, y_coords[1:])
        if lower - upper > 5  # Minimum gap to consider a row boundary
    ]
    if not bands:
        return separators

    # Group boxes into rows once; the grouping does not depend on the separator
    rows = []
    for band_top, band_bottom in bands:
        members = [bbox for bbox in bboxes if bbox[1] >= band_top and bbox[3] <= band_bottom]
        if members:
            rows.append(members)

    stabilized = []
    for index, sep in enumerate(separators):
        # Search window of +/-20 around the separator; the first separator's
        # window is additionally clamped so it does not start below 0
        search_lo = max(0, sep - 20) if index == 0 else sep - 20
        search_hi = sep + 20

        positions = [
            self._find_min_crossing_separator(search_lo, search_hi, row, 50) for row in rows
        ]

        # Use median of row positions if we have enough samples
        if len(positions) >= 3:
            stabilized.append(np.median(positions))
        else:
            stabilized.append(sep)

    return stabilized
1274
+
944
1275
  def from_stripes(
945
1276
  self,
946
1277
  stripes=None,
@@ -4143,6 +4474,34 @@ class Guides:
4143
4474
  else:
4144
4475
  raise ValueError(f"Target object {target_obj} is not a Page or Region")
4145
4476
 
4477
+ # Check if we have guides in only one dimension
4478
+ has_verticals = len(self.vertical) > 0
4479
+ has_horizontals = len(self.horizontal) > 0
4480
+
4481
+ # If we have guides in only one dimension, use direct extraction with explicit lines
4482
+ if (has_verticals and not has_horizontals) or (has_horizontals and not has_verticals):
4483
+ logger.debug(
4484
+ f"Partial guides detected - using direct extraction (v={has_verticals}, h={has_horizontals})"
4485
+ )
4486
+
4487
+ # Extract directly from the target using explicit lines
4488
+ if hasattr(target_obj, "extract_table"):
4489
+ return target_obj.extract_table(
4490
+ method=method, # Let auto-detection work when None
4491
+ table_settings=table_settings,
4492
+ use_ocr=use_ocr,
4493
+ ocr_config=ocr_config,
4494
+ text_options=text_options,
4495
+ cell_extraction_func=cell_extraction_func,
4496
+ show_progress=show_progress,
4497
+ content_filter=content_filter,
4498
+ verticals=list(self.vertical) if has_verticals else None,
4499
+ horizontals=list(self.horizontal) if has_horizontals else None,
4500
+ )
4501
+ else:
4502
+ raise ValueError(f"Target object {type(target_obj)} does not support extract_table")
4503
+
4504
+ # Both dimensions have guides - use normal grid-based extraction
4146
4505
  try:
4147
4506
  # Step 1: Build grid structure (creates temporary regions)
4148
4507
  grid_result = self.build_grid(
@@ -1286,6 +1286,10 @@ class ElementManager:
1286
1286
 
1287
1287
  fill_col = rc.get("non_stroking_color")
1288
1288
  # We keep colour as metadata but no longer filter on it
1289
+ # Note: pdfminer.six has a bug where it may report incorrect colors
1290
+ # when no explicit color space is set. E.g., '1 1 0 sc' (RGB yellow)
1291
+ # is parsed as 0.0 (grayscale black) because pdfminer defaults to
1292
+ # DeviceGray and only reads 1 component from the stack.
1289
1293
  if fill_col is None:
1290
1294
  continue
1291
1295
 
natural_pdf/core/page.py CHANGED
@@ -30,6 +30,7 @@ from tqdm.auto import tqdm # Added tqdm import
30
30
  from natural_pdf.elements.element_collection import ElementCollection
31
31
  from natural_pdf.elements.region import Region
32
32
  from natural_pdf.selectors.parser import parse_selector
33
+ from natural_pdf.tables.result import TableResult
33
34
  from natural_pdf.utils.locks import pdf_render_lock # Import from utils instead
34
35
  from natural_pdf.utils.visualization import render_plain_page
35
36
 
@@ -866,26 +867,33 @@ class Page(
866
867
  if debug:
867
868
  print(f" - Added direct region '{label}': {exclusion_item}")
868
869
 
869
- # Process direct Element objects - convert to Region
870
+ # Process direct Element objects - only convert to Region if method is "region"
870
871
  elif hasattr(exclusion_item, "bbox") and hasattr(exclusion_item, "expand"):
871
- try:
872
- # Convert Element to Region using expand()
873
- expanded_region = exclusion_item.expand()
874
- if isinstance(expanded_region, Region):
875
- expanded_region.label = label
876
- regions.append(expanded_region)
877
- if debug:
878
- print(
879
- f" - Converted direct Element to Region '{label}': {expanded_region}"
880
- )
881
- else:
872
+ if method == "region":
873
+ try:
874
+ # Convert Element to Region using expand()
875
+ expanded_region = exclusion_item.expand()
876
+ if isinstance(expanded_region, Region):
877
+ expanded_region.label = label
878
+ regions.append(expanded_region)
879
+ if debug:
880
+ print(
881
+ f" - Converted direct Element to Region '{label}': {expanded_region}"
882
+ )
883
+ else:
884
+ if debug:
885
+ print(
886
+ f" - Element.expand() did not return a Region: {type(expanded_region)}"
887
+ )
888
+ except Exception as e:
882
889
  if debug:
883
- print(
884
- f" - Element.expand() did not return a Region: {type(expanded_region)}"
885
- )
886
- except Exception as e:
890
+ print(f" - Failed to convert Element to Region: {e}")
891
+ else:
892
+ # method == "element" - will be handled in _filter_elements_by_exclusions
887
893
  if debug:
888
- print(f" - Failed to convert Element to Region: {e}")
894
+ print(
895
+ f" - Skipping element '{label}' (will be handled as element-based exclusion)"
896
+ )
889
897
 
890
898
  # Process string selectors (from PDF-level exclusions)
891
899
  elif isinstance(exclusion_item, str):
@@ -1245,15 +1253,46 @@ class Page(
1245
1253
  Returns:
1246
1254
  ElementCollection of matching elements (unfiltered by exclusions)
1247
1255
  """
1248
- from natural_pdf.selectors.parser import selector_to_filter_func
1256
+ from natural_pdf.selectors.parser import _calculate_aggregates, selector_to_filter_func
1249
1257
 
1250
1258
  # Handle compound OR selectors
1251
1259
  if selector_obj.get("type") == "or":
1252
1260
  # For OR selectors, search all elements and let the filter function decide
1253
1261
  elements_to_search = self._element_mgr.get_all_elements()
1254
1262
 
1263
+ # Check if any sub-selector contains aggregate functions
1264
+ has_aggregates = False
1265
+ for sub_selector in selector_obj.get("selectors", []):
1266
+ for attr in sub_selector.get("attributes", []):
1267
+ value = attr.get("value")
1268
+ if isinstance(value, dict) and value.get("type") == "aggregate":
1269
+ has_aggregates = True
1270
+ break
1271
+ if has_aggregates:
1272
+ break
1273
+
1274
+ # Calculate aggregates if needed - for OR selectors we calculate on ALL elements
1275
+ aggregates = {}
1276
+ if has_aggregates:
1277
+ # Need to calculate aggregates for each sub-selector type
1278
+ for sub_selector in selector_obj.get("selectors", []):
1279
+ sub_type = sub_selector.get("type", "any").lower()
1280
+ if sub_type == "text":
1281
+ sub_elements = self._element_mgr.words
1282
+ elif sub_type == "rect":
1283
+ sub_elements = self._element_mgr.rects
1284
+ elif sub_type == "line":
1285
+ sub_elements = self._element_mgr.lines
1286
+ elif sub_type == "region":
1287
+ sub_elements = self._element_mgr.regions
1288
+ else:
1289
+ sub_elements = elements_to_search
1290
+
1291
+ sub_aggregates = _calculate_aggregates(sub_elements, sub_selector)
1292
+ aggregates.update(sub_aggregates)
1293
+
1255
1294
  # Create filter function from compound selector
1256
- filter_func = selector_to_filter_func(selector_obj, **kwargs)
1295
+ filter_func = selector_to_filter_func(selector_obj, aggregates=aggregates, **kwargs)
1257
1296
 
1258
1297
  # Apply the filter to all elements
1259
1298
  matching_elements = [element for element in elements_to_search if filter_func(element)]
@@ -1309,8 +1348,23 @@ class Page(
1309
1348
  else:
1310
1349
  elements_to_search = self._element_mgr.get_all_elements()
1311
1350
 
1351
+ # Check if selector contains aggregate functions
1352
+ has_aggregates = False
1353
+ for attr in selector_obj.get("attributes", []):
1354
+ value = attr.get("value")
1355
+ if isinstance(value, dict) and value.get("type") == "aggregate":
1356
+ has_aggregates = True
1357
+ break
1358
+
1359
+ # Calculate aggregates if needed
1360
+ aggregates = {}
1361
+ if has_aggregates:
1362
+ # For aggregates, we need to calculate based on ALL elements of the same type
1363
+ # not just the filtered subset
1364
+ aggregates = _calculate_aggregates(elements_to_search, selector_obj)
1365
+
1312
1366
  # Create filter function from selector, passing any additional parameters
1313
- filter_func = selector_to_filter_func(selector_obj, **kwargs)
1367
+ filter_func = selector_to_filter_func(selector_obj, aggregates=aggregates, **kwargs)
1314
1368
 
1315
1369
  # Apply the filter to matching elements
1316
1370
  matching_elements = [element for element in elements_to_search if filter_func(element)]
@@ -1857,7 +1911,9 @@ class Page(
1857
1911
  cell_extraction_func: Optional[Callable[["Region"], Optional[str]]] = None,
1858
1912
  show_progress: bool = False,
1859
1913
  content_filter=None,
1860
- ) -> List[List[Optional[str]]]:
1914
+ verticals: Optional[List[float]] = None,
1915
+ horizontals: Optional[List[float]] = None,
1916
+ ) -> TableResult:
1861
1917
  """
1862
1918
  Extract the largest table from this page using enhanced region-based extraction.
1863
1919
 
@@ -1874,9 +1930,11 @@ class Page(
1874
1930
  - A regex pattern string (characters matching the pattern are EXCLUDED)
1875
1931
  - A callable that takes text and returns True to KEEP the character
1876
1932
  - A list of regex patterns (characters matching ANY pattern are EXCLUDED)
1933
+ verticals: Optional list of x-coordinates for explicit vertical table lines.
1934
+ horizontals: Optional list of y-coordinates for explicit horizontal table lines.
1877
1935
 
1878
1936
  Returns:
1879
- Table data as a list of rows, where each row is a list of cell values (str or None).
1937
+ TableResult: A sequence-like object containing table rows that also provides .to_df() for pandas conversion.
1880
1938
  """
1881
1939
  # Create a full-page region and delegate to its enhanced extract_table method
1882
1940
  page_region = self.create_region(0, 0, self.width, self.height)
@@ -1889,6 +1947,8 @@ class Page(
1889
1947
  cell_extraction_func=cell_extraction_func,
1890
1948
  show_progress=show_progress,
1891
1949
  content_filter=content_filter,
1950
+ verticals=verticals,
1951
+ horizontals=horizontals,
1892
1952
  )
1893
1953
 
1894
1954
  def extract_tables(
@@ -2768,6 +2828,7 @@ class Page(
2768
2828
  region.start_element = current_start_element
2769
2829
  region.end_element = end_boundary_el # Mark the element that ended it
2770
2830
  region.is_end_next_start = True # Mark how it ended
2831
+ region._boundary_exclusions = include_boundaries
2771
2832
  regions.append(region)
2772
2833
  else: # horizontal
2773
2834
  sec_left = (
@@ -2787,6 +2848,7 @@ class Page(
2787
2848
  region.start_element = current_start_element
2788
2849
  region.end_element = end_boundary_el # Mark the element that ended it
2789
2850
  region.is_end_next_start = True # Mark how it ended
2851
+ region._boundary_exclusions = include_boundaries
2790
2852
  regions.append(region)
2791
2853
  active_section_started = False # Reset for the new start
2792
2854
 
@@ -2815,6 +2877,7 @@ class Page(
2815
2877
  region.start_element = current_start_element
2816
2878
  region.end_element = end_boundary_el
2817
2879
  region.is_end_next_start = False
2880
+ region._boundary_exclusions = include_boundaries
2818
2881
  regions.append(region)
2819
2882
  else: # horizontal
2820
2883
  sec_left = (
@@ -2834,6 +2897,7 @@ class Page(
2834
2897
  region.start_element = current_start_element
2835
2898
  region.end_element = end_boundary_el
2836
2899
  region.is_end_next_start = False
2900
+ region._boundary_exclusions = include_boundaries
2837
2901
  regions.append(region)
2838
2902
 
2839
2903
  # Reset: section ended explicitly
@@ -2854,6 +2918,7 @@ class Page(
2854
2918
  region.start_element = current_start_element
2855
2919
  region.end_element = None # Ended by page end
2856
2920
  region.is_end_next_start = False
2921
+ region._boundary_exclusions = include_boundaries
2857
2922
  regions.append(region)
2858
2923
  else: # horizontal
2859
2924
  sec_left = (
@@ -2867,6 +2932,7 @@ class Page(
2867
2932
  region.start_element = current_start_element
2868
2933
  region.end_element = None # Ended by page end
2869
2934
  region.is_end_next_start = False
2935
+ region._boundary_exclusions = include_boundaries
2870
2936
  regions.append(region)
2871
2937
 
2872
2938
  return ElementCollection(regions)