natural-pdf 0.2.16__py3-none-any.whl → 0.2.18__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in the public registry. It is provided for informational purposes only.
natural_pdf/__init__.py CHANGED
@@ -62,14 +62,59 @@ class Options:
         # Text extraction defaults (empty for now)
         self.text = ConfigSection()

+        # Layout and navigation defaults
+        self.layout = ConfigSection(
+            directional_offset=0.01,  # Offset in points when using directional methods
+            auto_multipage=False,  # Whether directional methods span pages by default
+        )
+

 # Create global options instance
 options = Options()


+def set_option(name: str, value):
+    """
+    Set a global Natural PDF option.
+
+    Args:
+        name: Option name in dot notation (e.g., 'layout.auto_multipage')
+        value: New value for the option
+
+    Example:
+        import natural_pdf as npdf
+        npdf.set_option('layout.auto_multipage', True)
+        npdf.set_option('ocr.engine', 'surya')
+    """
+    parts = name.split(".")
+    obj = options
+
+    # Navigate to the right section
+    for part in parts[:-1]:
+        if hasattr(obj, part):
+            obj = getattr(obj, part)
+        else:
+            raise KeyError(f"Unknown option section: {part}")
+
+    # Set the final value
+    final_key = parts[-1]
+    if hasattr(obj, final_key):
+        setattr(obj, final_key, value)
+    else:
+        raise KeyError(f"Unknown option: {name}")
+
+
 # Version
 __version__ = "0.1.1"

+# Apply pdfminer patches for known bugs
+try:
+    from natural_pdf.utils.pdfminer_patches import apply_patches
+
+    apply_patches()
+except Exception as e:
+    logger.warning(f"Failed to apply pdfminer patches: {e}")
+
 from natural_pdf.analyzers.guides import Guides
 from natural_pdf.core.page import Page
 from natural_pdf.core.page_collection import PageCollection
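
The hunk above adds a layout config section and a module-level set_option helper that walks dot-separated option names. A minimal usage sketch (option names are taken from the diff; behavior is otherwise unverified here):

    import natural_pdf as npdf

    # Let directional navigation methods cross page boundaries by default
    npdf.set_option("layout.auto_multipage", True)

    # Offset, in points, applied by directional methods
    npdf.set_option("layout.directional_offset", 0.5)

    # Unknown sections or names raise KeyError instead of silently creating options
    # npdf.set_option("layout.no_such_option", 1)  # -> KeyError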
@@ -941,6 +941,337 @@ class GuidesList(UserList):
         self.data.clear()
         return self._parent

+    def from_headers(
+        self,
+        headers: Union["ElementCollection", List["Element"]],
+        obj: Optional[Union["Page", "Region"]] = None,
+        method: Literal["min_crossings", "seam_carving"] = "min_crossings",
+        min_width: Optional[float] = None,
+        max_width: Optional[float] = None,
+        margin: float = 0.5,
+        row_stabilization: bool = True,
+        num_samples: int = 400,
+        *,
+        append: bool = False,
+    ) -> "Guides":
+        """Create vertical guides for columns based on headers and whitespace valleys.
+
+        This method detects column boundaries by finding optimal vertical separators
+        between headers that minimize text crossings, regardless of text alignment.
+
+        Args:
+            headers: Column header elements (ElementCollection or list of Elements)
+            obj: Page/Region to analyze (uses parent's context if None)
+            method: Detection method:
+                - 'min_crossings': Fast vector-based minimum intersection count
+                - 'seam_carving': Dynamic programming for curved boundaries
+            min_width: Minimum column width constraint (pixels)
+            max_width: Maximum column width constraint (pixels)
+            margin: Buffer space from header edges when searching for separators (default: 0.5)
+            row_stabilization: Whether to use row-wise median for stability
+            num_samples: Number of x-positions to test per gap (for min_crossings)
+            append: Whether to append to existing guides
+
+        Returns:
+            Parent Guides object for chaining
+
+        Examples:
+            # Create column guides from headers
+            headers = page.find_all('text[size=16]')
+            guides.vertical.from_headers(headers)
+
+            # With width constraints
+            guides.vertical.from_headers(headers, min_width=50, max_width=200)
+
+            # Seam carving for complex layouts
+            guides.vertical.from_headers(headers, method='seam_carving')
+        """
+
+        if self._axis != "vertical":
+            raise ValueError("from_headers() only works for vertical guides (columns)")
+
+        target_obj = obj or self._parent.context
+        if target_obj is None:
+            raise ValueError("No object provided and no context available")
+
+        # Convert headers to list if ElementCollection
+        if hasattr(headers, "elements"):
+            header_elements = list(headers.elements)
+        else:
+            header_elements = list(headers)
+
+        # Sort headers by x-position
+        header_elements.sort(key=lambda h: h.x0 if hasattr(h, "x0") else 0)
+
+        # Need at least 2 headers
+        if len(header_elements) < 2:
+            logger.warning("Need at least 2 headers for column detection")
+            return self._parent
+
+        # Get page bounds
+        if hasattr(target_obj, "bbox"):
+            page_bounds = target_obj.bbox
+        elif hasattr(target_obj, "width") and hasattr(target_obj, "height"):
+            # Create bbox from width/height
+            page_bounds = (0, 0, target_obj.width, target_obj.height)
+        else:
+            page_bounds = None
+
+        if not page_bounds:
+            logger.warning("Could not determine page bounds")
+            return self._parent
+
+        # Get text below headers for occupancy analysis
+        header_bottom = max(h.bottom for h in header_elements)
+        all_text = target_obj.find_all("text")
+        body_elements = [elem for elem in all_text if elem.top > header_bottom]
+
+        # Extract bounding boxes
+        bboxes = [(elem.x0, elem.top, elem.x1, elem.bottom) for elem in body_elements]
+
+        # Find separators between each header pair
+        separators = []
+        logger.debug(f"Processing {len(header_elements)} headers for column detection")
+        for i in range(len(header_elements) - 1):
+            h_left = header_elements[i]
+            h_right = header_elements[i + 1]
+
+            # Define search band
+            left_edge = h_left.x1 if hasattr(h_left, "x1") else h_left.right
+            right_edge = h_right.x0 if hasattr(h_right, "x0") else h_right.left
+            gap = right_edge - left_edge
+
+            # If gap is too small, place separator in the middle
+            if gap <= 2 * margin:
+                # Place separator in the middle of the gap
+                separator = (left_edge + right_edge) / 2
+                separators.append(separator)
+                continue
+
+            # Normal case - search within the band
+            x0 = left_edge + margin
+            x1 = right_edge - margin
+
+            # Apply width constraints if provided
+            if min_width and (x1 - x0) < min_width:
+                # Center the separator
+                center = (x0 + x1) / 2
+                separators.append(center)
+                continue
+
+            if method == "min_crossings":
+                separator = self._find_min_crossing_separator(x0, x1, bboxes, num_samples)
+            else:  # seam_carving
+                separator = self._find_seam_carving_separator(
+                    x0, x1, target_obj, header_bottom, page_bounds[3], bboxes
+                )
+
+            # Apply width constraints only if they don't conflict with header positions
+            if separators:
+                if min_width and separator - separators[-1] < min_width:
+                    # Only enforce if it doesn't push into next header
+                    proposed = separators[-1] + min_width
+                    if proposed < right_edge:
+                        separator = proposed
+                if max_width and separator - separators[-1] > max_width:
+                    separator = separators[-1] + max_width
+
+            separators.append(separator)
+
+        # Ensure we have page boundaries
+        if separators:
+            if not any(abs(sep - page_bounds[0]) < 0.1 for sep in separators):
+                separators.insert(0, page_bounds[0])
+            if not any(abs(sep - page_bounds[2]) < 0.1 for sep in separators):
+                separators.append(page_bounds[2])
+
+        # Apply row stabilization if requested
+        if row_stabilization and separators:
+            separators = self._stabilize_with_rows(separators, target_obj, bboxes, header_bottom)
+
+        # Update guides
+        if append:
+            self.extend(separators)
+        else:
+            self.data = separators
+
+        return self._parent
+
+    def _find_min_crossing_separator(
+        self,
+        x0: float,
+        x1: float,
+        bboxes: List[Tuple[float, float, float, float]],
+        num_samples: int,
+    ) -> float:
+        """Find x-coordinate with minimum text crossings in band."""
+        candidates = np.linspace(x0, x1, num_samples)
+
+        best_x = x0
+        min_crossings = float("inf")
+        best_gap = 0
+
+        for x in candidates:
+            # Count how many bboxes this x-line crosses
+            crossings = sum(1 for bbox in bboxes if bbox[0] < x < bbox[2])
+
+            # Calculate minimum gap to any edge (for tie-breaking)
+            if crossings > 0:
+                gaps = []
+                for bbox in bboxes:
+                    if bbox[0] < x < bbox[2]:
+                        gaps.extend([abs(x - bbox[0]), abs(x - bbox[2])])
+                min_gap = min(gaps) if gaps else float("inf")
+            else:
+                min_gap = float("inf")
+
+            # Update best if fewer crossings or same crossings but larger gap
+            if crossings < min_crossings or (crossings == min_crossings and min_gap > best_gap):
+                min_crossings = crossings
+                best_x = x
+                best_gap = min_gap
+
+        return best_x
+
+    def _find_seam_carving_separator(
+        self,
+        x0: float,
+        x1: float,
+        obj,
+        header_y: float,
+        page_bottom: float,
+        bboxes: List[Tuple[float, float, float, float]],
+    ) -> float:
+        """Find optimal separator using seam carving (dynamic programming)."""
+        # Create cost matrix
+        band_width = int(x1 - x0)
+        band_height = int(page_bottom - header_y)
+
+        if band_width <= 0 or band_height <= 0:
+            return (x0 + x1) / 2
+
+        # Resolution for cost matrix (1 pixel = 1 point for now)
+        cost_matrix = np.zeros((band_height, band_width))
+
+        # Fill cost matrix - high cost where text exists
+        for bbox in bboxes:
+            # Check if bbox intersects with our band
+            # bbox format is (x0, top, x1, bottom)
+            if bbox[2] > x0 and bbox[0] < x1 and bbox[3] > header_y:
+                # Convert to band coordinates
+                left = max(0, int(bbox[0] - x0))
+                right = min(band_width, int(bbox[2] - x0))
+                top = max(0, int(bbox[1] - header_y))
+                bottom = min(band_height, int(bbox[3] - header_y))
+
+                # Set high cost for text regions
+                cost_matrix[top:bottom, left:right] = 100
+
+        # Add small gradient cost to prefer straight lines
+        for i in range(band_width):
+            cost_matrix[:, i] += abs(i - band_width // 2) * 0.1
+
+        # Dynamic programming to find minimum cost path
+        dp = np.full_like(cost_matrix, np.inf)
+        dp[0, :] = cost_matrix[0, :]
+
+        # Fill DP table
+        for y in range(1, band_height):
+            for x in range(band_width):
+                # Can come from directly above or diagonally
+                dp[y, x] = cost_matrix[y, x] + dp[y - 1, x]
+                if x > 0:
+                    dp[y, x] = min(dp[y, x], cost_matrix[y, x] + dp[y - 1, x - 1])
+                if x < band_width - 1:
+                    dp[y, x] = min(dp[y, x], cost_matrix[y, x] + dp[y - 1, x + 1])
+
+        # Find minimum cost at bottom
+        min_x = np.argmin(dp[-1, :])
+
+        # Trace back to get path
+        path_x_coords = [min_x]
+        for y in range(band_height - 2, -1, -1):
+            x = path_x_coords[-1]
+
+            # Find which direction we came from
+            candidates = [(x, dp[y, x])]
+            if x > 0:
+                candidates.append((x - 1, dp[y, x - 1]))
+            if x < band_width - 1:
+                candidates.append((x + 1, dp[y, x + 1]))
+
+            next_x = min(candidates, key=lambda c: c[1])[0]
+            path_x_coords.append(next_x)
+
+        # Return median x-coordinate of the path
+        median_x = np.median(path_x_coords)
+        return x0 + median_x
+
+    def _stabilize_with_rows(
+        self,
+        separators: List[float],
+        obj,
+        bboxes: List[Tuple[float, float, float, float]],
+        header_y: float,
+    ) -> List[float]:
+        """Stabilize separators using row-wise analysis."""
+        if not bboxes:
+            return separators
+
+        # Detect rows by finding horizontal gaps
+        # bbox format is (x0, top, x1, bottom)
+        y_coords = sorted(set([bbox[1] for bbox in bboxes] + [bbox[3] for bbox in bboxes]))
+
+        # Find gaps larger than typical line height
+        gaps = []
+        for i in range(len(y_coords) - 1):
+            gap_size = y_coords[i + 1] - y_coords[i]
+            if gap_size > 5:  # Minimum gap to consider a row boundary
+                gaps.append((y_coords[i], y_coords[i + 1]))
+
+        if not gaps:
+            return separators
+
+        # For each separator, collect positions across rows
+        stabilized = []
+        for i, sep in enumerate(separators):
+            row_positions = []
+
+            for gap_start, gap_end in gaps:
+                # Get elements in this row
+                row_elements = [
+                    bbox for bbox in bboxes if bbox[1] >= gap_start and bbox[3] <= gap_end
+                ]
+
+                if row_elements:
+                    # Find best position in this row
+                    if i == 0:
+                        # First separator - look left of content
+                        x0 = 0
+                        x1 = sep + 20
+                    elif i == len(separators) - 1:
+                        # Last separator - look right of content
+                        x0 = sep - 20
+                        x1 = float("inf")
+                    else:
+                        # Middle separator - look around current position
+                        x0 = sep - 20
+                        x1 = sep + 20
+
+                    # Find minimum crossing position in this range
+                    best_x = self._find_min_crossing_separator(
+                        max(x0, sep - 20), min(x1, sep + 20), row_elements, 50
+                    )
+                    row_positions.append(best_x)
+
+            # Use median of row positions if we have enough samples
+            if len(row_positions) >= 3:
+                stabilized.append(np.median(row_positions))
+            else:
+                stabilized.append(sep)
+
+        return stabilized
+
     def from_stripes(
         self,
         stripes=None,
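
In short, from_headers places one vertical separator per header gap: min_crossings samples num_samples candidate x-positions inside each gap and keeps the one crossing the fewest body-text boxes (ties broken by distance to the nearest box edge), while seam_carving builds a text-occupancy cost matrix below the headers and traces a minimum-cost vertical seam through it, returning the seam's median x. A rough usage sketch, assuming Guides can be bound directly to a page as the top-level import suggests (the file path is illustrative, the selector comes from the docstring example):

    from natural_pdf import PDF, Guides

    pdf = PDF("report.pdf")                    # illustrative path
    page = pdf.pages[0]

    headers = page.find_all('text[size=16]')   # selector from the docstring example
    guides = Guides(page)                      # assumes page-bound construction
    guides.vertical.from_headers(headers, method="min_crossings", row_stabilization=True)

    # Only vertical guides are defined; the extract_table hunk below shows how this
    # still yields a table via direct extraction with explicit column lines.
    table = guides.extract_table()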
@@ -4143,6 +4474,34 @@ class Guides:
         else:
             raise ValueError(f"Target object {target_obj} is not a Page or Region")

+        # Check if we have guides in only one dimension
+        has_verticals = len(self.vertical) > 0
+        has_horizontals = len(self.horizontal) > 0
+
+        # If we have guides in only one dimension, use direct extraction with explicit lines
+        if (has_verticals and not has_horizontals) or (has_horizontals and not has_verticals):
+            logger.debug(
+                f"Partial guides detected - using direct extraction (v={has_verticals}, h={has_horizontals})"
+            )
+
+            # Extract directly from the target using explicit lines
+            if hasattr(target_obj, "extract_table"):
+                return target_obj.extract_table(
+                    method=method,  # Let auto-detection work when None
+                    table_settings=table_settings,
+                    use_ocr=use_ocr,
+                    ocr_config=ocr_config,
+                    text_options=text_options,
+                    cell_extraction_func=cell_extraction_func,
+                    show_progress=show_progress,
+                    content_filter=content_filter,
+                    verticals=list(self.vertical) if has_verticals else None,
+                    horizontals=list(self.horizontal) if has_horizontals else None,
+                )
+            else:
+                raise ValueError(f"Target object {type(target_obj)} does not support extract_table")
+
+        # Both dimensions have guides - use normal grid-based extraction
         try:
             # Step 1: Build grid structure (creates temporary regions)
             grid_result = self.build_grid(
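
This fallback means a Guides object no longer needs both axes populated before extraction: whichever axis is present is handed to the target's extract_table as explicit verticals/horizontals, and the missing axis is left to auto-detection. A sketch of the horizontal-only case, under the same assumptions as above (row positions are illustrative; GuidesList supports extend since it is a UserList):

    guides = Guides(page)
    guides.horizontal.extend([120.0, 240.0, 360.0])   # explicit row boundaries, in points

    # Per the hunk, this now delegates to roughly:
    # page.extract_table(horizontals=[120.0, 240.0, 360.0], verticals=None, ...)
    rows_table = guides.extract_table()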
@@ -1286,6 +1286,10 @@ class ElementManager:

             fill_col = rc.get("non_stroking_color")
             # We keep colour as metadata but no longer filter on it
+            # Note: pdfminer.six has a bug where it may report incorrect colors
+            # when no explicit color space is set. E.g., '1 1 0 sc' (RGB yellow)
+            # is parsed as 0.0 (grayscale black) because pdfminer defaults to
+            # DeviceGray and only reads 1 component from the stack.
             if fill_col is None:
                 continue
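
The comment added here describes a real pdfminer.six pitfall: an RGB fill written as '1 1 0 sc' with no preceding color-space selection is read back as a single grayscale component. Natural PDF's actual workaround is the pdfminer_patches module imported in the first hunk; purely as an illustration of handling such ambiguous values downstream, a defensive normalizer might look like this (hypothetical helper, not part of the package):

    def normalize_fill_color(raw):
        # Hypothetical helper: coerce pdfminer's non_stroking_color metadata
        # into an (r, g, b) tuple where possible, else None.
        if raw is None:
            return None
        if isinstance(raw, (int, float)):
            # A single component normally means grayscale, but per the bug noted
            # above, an RGB color set without an explicit color space can also
            # surface here as one component.
            return (raw, raw, raw)
        if isinstance(raw, (list, tuple)):
            if len(raw) == 1:
                return (raw[0], raw[0], raw[0])
            if len(raw) == 3:
                return tuple(raw)
            if len(raw) == 4:  # naive CMYK -> RGB conversion
                c, m, y, k = raw
                return (1 - min(1, c + k), 1 - min(1, m + k), 1 - min(1, y + k))
        return None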