natural-pdf 0.2.15__py3-none-any.whl → 0.2.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +45 -0
- natural_pdf/analyzers/guides.py +359 -0
- natural_pdf/core/element_manager.py +4 -0
- natural_pdf/core/page.py +88 -22
- natural_pdf/core/page_collection.py +75 -0
- natural_pdf/core/pdf.py +33 -0
- natural_pdf/describe/base.py +48 -7
- natural_pdf/elements/base.py +408 -43
- natural_pdf/elements/element_collection.py +83 -10
- natural_pdf/elements/region.py +217 -178
- natural_pdf/elements/text.py +5 -3
- natural_pdf/flows/element.py +48 -46
- natural_pdf/flows/flow.py +175 -480
- natural_pdf/flows/region.py +76 -0
- natural_pdf/selectors/parser.py +180 -9
- natural_pdf/utils/pdfminer_patches.py +136 -0
- natural_pdf/utils/sections.py +346 -0
- natural_pdf/utils/spatial.py +169 -0
- {natural_pdf-0.2.15.dist-info → natural_pdf-0.2.17.dist-info}/METADATA +1 -1
- {natural_pdf-0.2.15.dist-info → natural_pdf-0.2.17.dist-info}/RECORD +24 -21
- {natural_pdf-0.2.15.dist-info → natural_pdf-0.2.17.dist-info}/WHEEL +0 -0
- {natural_pdf-0.2.15.dist-info → natural_pdf-0.2.17.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.2.15.dist-info → natural_pdf-0.2.17.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.2.15.dist-info → natural_pdf-0.2.17.dist-info}/top_level.txt +0 -0
natural_pdf/__init__.py
CHANGED
@@ -62,14 +62,59 @@ class Options:
|
|
62
62
|
# Text extraction defaults (empty for now)
|
63
63
|
self.text = ConfigSection()
|
64
64
|
|
65
|
+
# Layout and navigation defaults
|
66
|
+
self.layout = ConfigSection(
|
67
|
+
directional_offset=0.01, # Offset in points when using directional methods
|
68
|
+
auto_multipage=False, # Whether directional methods span pages by default
|
69
|
+
)
|
70
|
+
|
65
71
|
|
66
72
|
# Create global options instance
|
67
73
|
options = Options()
|
68
74
|
|
69
75
|
|
76
|
+
def set_option(name: str, value):
    """
    Assign a new value to an existing global Natural PDF option.

    The dotted ``name`` is resolved attribute by attribute, starting from the
    module-level ``options`` object. Every component of the path, including
    the final one, must already exist; this function never creates options.

    Args:
        name: Option path in dot notation (e.g., 'layout.auto_multipage')
        value: Value to store under that option

    Raises:
        KeyError: If an intermediate section or the final option is unknown.

    Example:
        import natural_pdf as npdf
        npdf.set_option('layout.auto_multipage', True)
    """
    # "a.b.c" -> sections ["a", "b"] and option "c"; a name without dots
    # yields no sections and is looked up directly on ``options``.
    *section_names, option_name = name.split(".")

    # Walk down through the nested sections
    target = options
    for section_name in section_names:
        if not hasattr(target, section_name):
            raise KeyError(f"Unknown option section: {section_name}")
        target = getattr(target, section_name)

    # Only overwrite options that are already defined
    if not hasattr(target, option_name):
        raise KeyError(f"Unknown option: {name}")
    setattr(target, option_name, value)
|
105
|
+
|
106
|
+
|
70
107
|
# Version
|
71
108
|
__version__ = "0.1.1"
|
72
109
|
|
110
|
+
# Apply pdfminer patches for known bugs.
# This runs at import time and is best-effort: any failure (including the
# import of the patch module itself) is logged as a warning so that importing
# natural_pdf still succeeds without the patches.
try:
    from natural_pdf.utils.pdfminer_patches import apply_patches

    apply_patches()
except Exception as e:
    logger.warning(f"Failed to apply pdfminer patches: {e}")
|
117
|
+
|
73
118
|
from natural_pdf.analyzers.guides import Guides
|
74
119
|
from natural_pdf.core.page import Page
|
75
120
|
from natural_pdf.core.page_collection import PageCollection
|
natural_pdf/analyzers/guides.py
CHANGED
@@ -941,6 +941,337 @@ class GuidesList(UserList):
|
|
941
941
|
self.data.clear()
|
942
942
|
return self._parent
|
943
943
|
|
944
|
+
def from_headers(
    self,
    headers: Union["ElementCollection", List["Element"]],
    obj: Optional[Union["Page", "Region"]] = None,
    method: Literal["min_crossings", "seam_carving"] = "min_crossings",
    min_width: Optional[float] = None,
    max_width: Optional[float] = None,
    margin: float = 0.5,
    row_stabilization: bool = True,
    num_samples: int = 400,
    *,
    append: bool = False,
) -> "Guides":
    """Create vertical guides for columns based on headers and whitespace valleys.

    This method detects column boundaries by finding optimal vertical separators
    between headers that minimize text crossings, regardless of text alignment.
    One separator is placed between each pair of horizontally adjacent headers,
    and the left/right bounds of the target are added as outer guides.

    Args:
        headers: Column header elements (ElementCollection or list of Elements)
        obj: Page/Region to analyze (uses parent's context if None)
        method: Detection method:
            - 'min_crossings': Fast vector-based minimum intersection count
            - 'seam_carving': Dynamic programming for curved boundaries
        min_width: Minimum column width constraint, in the same units as the
            element coordinates
        max_width: Maximum column width constraint, in the same units as the
            element coordinates
        margin: Buffer space from header edges when searching for separators (default: 0.5)
        row_stabilization: Whether to use row-wise median for stability
        num_samples: Number of x-positions to test per gap (for min_crossings)
        append: Whether to append to existing guides

    Returns:
        Parent Guides object for chaining. The guides are left untouched when
        fewer than two headers are given or the target's bounds cannot be
        determined (a warning is logged in both cases).

    Raises:
        ValueError: If called on a non-vertical guide list, if ``method`` is
            not one of the supported values, or if no target object is
            available.

    Examples:
        # Create column guides from headers
        headers = page.find_all('text[size=16]')
        guides.vertical.from_headers(headers)

        # With width constraints
        guides.vertical.from_headers(headers, min_width=50, max_width=200)

        # Seam carving for complex layouts
        guides.vertical.from_headers(headers, method='seam_carving')
    """

    if self._axis != "vertical":
        raise ValueError("from_headers() only works for vertical guides (columns)")

    # Reject unknown methods up front; otherwise a typo would silently fall
    # through to the seam carving branch below.
    if method not in ("min_crossings", "seam_carving"):
        raise ValueError(
            f"Unknown method {method!r}; expected 'min_crossings' or 'seam_carving'"
        )

    target_obj = obj or self._parent.context
    if target_obj is None:
        raise ValueError("No object provided and no context available")

    # Convert headers to list if ElementCollection
    if hasattr(headers, "elements"):
        header_elements = list(headers.elements)
    else:
        header_elements = list(headers)

    # Sort headers by x-position
    header_elements.sort(key=lambda h: h.x0 if hasattr(h, "x0") else 0)

    # Need at least 2 headers
    if len(header_elements) < 2:
        logger.warning("Need at least 2 headers for column detection")
        return self._parent

    # Get page bounds
    if hasattr(target_obj, "bbox"):
        page_bounds = target_obj.bbox
    elif hasattr(target_obj, "width") and hasattr(target_obj, "height"):
        # Create bbox from width/height
        page_bounds = (0, 0, target_obj.width, target_obj.height)
    else:
        page_bounds = None

    if not page_bounds:
        logger.warning("Could not determine page bounds")
        return self._parent

    # Get text below headers for occupancy analysis
    header_bottom = max(h.bottom for h in header_elements)
    all_text = target_obj.find_all("text")
    body_elements = [elem for elem in all_text if elem.top > header_bottom]

    # Extract bounding boxes as (x0, top, x1, bottom)
    bboxes = [(elem.x0, elem.top, elem.x1, elem.bottom) for elem in body_elements]

    # Find separators between each header pair
    separators = []
    logger.debug(f"Processing {len(header_elements)} headers for column detection")
    for i in range(len(header_elements) - 1):
        h_left = header_elements[i]
        h_right = header_elements[i + 1]

        # Define search band: the horizontal gap between the two headers
        left_edge = h_left.x1 if hasattr(h_left, "x1") else h_left.right
        right_edge = h_right.x0 if hasattr(h_right, "x0") else h_right.left
        gap = right_edge - left_edge

        # If gap is too small (or the headers overlap), there is no band left
        # to search once the margins are removed: use the middle of the gap
        if gap <= 2 * margin:
            separator = (left_edge + right_edge) / 2
            separators.append(separator)
            continue

        # Normal case - search within the band
        x0 = left_edge + margin
        x1 = right_edge - margin

        # Apply width constraints if provided
        if min_width and (x1 - x0) < min_width:
            # Center the separator
            center = (x0 + x1) / 2
            separators.append(center)
            continue

        if method == "min_crossings":
            separator = self._find_min_crossing_separator(x0, x1, bboxes, num_samples)
        else:  # seam_carving
            separator = self._find_seam_carving_separator(
                x0, x1, target_obj, header_bottom, page_bounds[3], bboxes
            )

        # Apply width constraints only if they don't conflict with header positions
        if separators:
            if min_width and separator - separators[-1] < min_width:
                # Only enforce if it doesn't push into next header
                proposed = separators[-1] + min_width
                if proposed < right_edge:
                    separator = proposed
            if max_width and separator - separators[-1] > max_width:
                separator = separators[-1] + max_width

        separators.append(separator)

    # Ensure we have page boundaries (skip one that is already present
    # within a 0.1 tolerance)
    if separators:
        if not any(abs(sep - page_bounds[0]) < 0.1 for sep in separators):
            separators.insert(0, page_bounds[0])
        if not any(abs(sep - page_bounds[2]) < 0.1 for sep in separators):
            separators.append(page_bounds[2])

    # Apply row stabilization if requested
    if row_stabilization and separators:
        separators = self._stabilize_with_rows(separators, target_obj, bboxes, header_bottom)

    # Update guides
    if append:
        self.extend(separators)
    else:
        self.data = separators

    return self._parent
|
1099
|
+
|
1100
|
+
def _find_min_crossing_separator(
|
1101
|
+
self,
|
1102
|
+
x0: float,
|
1103
|
+
x1: float,
|
1104
|
+
bboxes: List[Tuple[float, float, float, float]],
|
1105
|
+
num_samples: int,
|
1106
|
+
) -> float:
|
1107
|
+
"""Find x-coordinate with minimum text crossings in band."""
|
1108
|
+
candidates = np.linspace(x0, x1, num_samples)
|
1109
|
+
|
1110
|
+
best_x = x0
|
1111
|
+
min_crossings = float("inf")
|
1112
|
+
best_gap = 0
|
1113
|
+
|
1114
|
+
for x in candidates:
|
1115
|
+
# Count how many bboxes this x-line crosses
|
1116
|
+
crossings = sum(1 for bbox in bboxes if bbox[0] < x < bbox[2])
|
1117
|
+
|
1118
|
+
# Calculate minimum gap to any edge (for tie-breaking)
|
1119
|
+
if crossings > 0:
|
1120
|
+
gaps = []
|
1121
|
+
for bbox in bboxes:
|
1122
|
+
if bbox[0] < x < bbox[2]:
|
1123
|
+
gaps.extend([abs(x - bbox[0]), abs(x - bbox[2])])
|
1124
|
+
min_gap = min(gaps) if gaps else float("inf")
|
1125
|
+
else:
|
1126
|
+
min_gap = float("inf")
|
1127
|
+
|
1128
|
+
# Update best if fewer crossings or same crossings but larger gap
|
1129
|
+
if crossings < min_crossings or (crossings == min_crossings and min_gap > best_gap):
|
1130
|
+
min_crossings = crossings
|
1131
|
+
best_x = x
|
1132
|
+
best_gap = min_gap
|
1133
|
+
|
1134
|
+
return best_x
|
1135
|
+
|
1136
|
+
def _find_seam_carving_separator(
|
1137
|
+
self,
|
1138
|
+
x0: float,
|
1139
|
+
x1: float,
|
1140
|
+
obj,
|
1141
|
+
header_y: float,
|
1142
|
+
page_bottom: float,
|
1143
|
+
bboxes: List[Tuple[float, float, float, float]],
|
1144
|
+
) -> float:
|
1145
|
+
"""Find optimal separator using seam carving (dynamic programming)."""
|
1146
|
+
# Create cost matrix
|
1147
|
+
band_width = int(x1 - x0)
|
1148
|
+
band_height = int(page_bottom - header_y)
|
1149
|
+
|
1150
|
+
if band_width <= 0 or band_height <= 0:
|
1151
|
+
return (x0 + x1) / 2
|
1152
|
+
|
1153
|
+
# Resolution for cost matrix (1 pixel = 1 point for now)
|
1154
|
+
cost_matrix = np.zeros((band_height, band_width))
|
1155
|
+
|
1156
|
+
# Fill cost matrix - high cost where text exists
|
1157
|
+
for bbox in bboxes:
|
1158
|
+
# Check if bbox intersects with our band
|
1159
|
+
# bbox format is (x0, top, x1, bottom)
|
1160
|
+
if bbox[2] > x0 and bbox[0] < x1 and bbox[3] > header_y:
|
1161
|
+
# Convert to band coordinates
|
1162
|
+
left = max(0, int(bbox[0] - x0))
|
1163
|
+
right = min(band_width, int(bbox[2] - x0))
|
1164
|
+
top = max(0, int(bbox[1] - header_y))
|
1165
|
+
bottom = min(band_height, int(bbox[3] - header_y))
|
1166
|
+
|
1167
|
+
# Set high cost for text regions
|
1168
|
+
cost_matrix[top:bottom, left:right] = 100
|
1169
|
+
|
1170
|
+
# Add small gradient cost to prefer straight lines
|
1171
|
+
for i in range(band_width):
|
1172
|
+
cost_matrix[:, i] += abs(i - band_width // 2) * 0.1
|
1173
|
+
|
1174
|
+
# Dynamic programming to find minimum cost path
|
1175
|
+
dp = np.full_like(cost_matrix, np.inf)
|
1176
|
+
dp[0, :] = cost_matrix[0, :]
|
1177
|
+
|
1178
|
+
# Fill DP table
|
1179
|
+
for y in range(1, band_height):
|
1180
|
+
for x in range(band_width):
|
1181
|
+
# Can come from directly above or diagonally
|
1182
|
+
dp[y, x] = cost_matrix[y, x] + dp[y - 1, x]
|
1183
|
+
if x > 0:
|
1184
|
+
dp[y, x] = min(dp[y, x], cost_matrix[y, x] + dp[y - 1, x - 1])
|
1185
|
+
if x < band_width - 1:
|
1186
|
+
dp[y, x] = min(dp[y, x], cost_matrix[y, x] + dp[y - 1, x + 1])
|
1187
|
+
|
1188
|
+
# Find minimum cost at bottom
|
1189
|
+
min_x = np.argmin(dp[-1, :])
|
1190
|
+
|
1191
|
+
# Trace back to get path
|
1192
|
+
path_x_coords = [min_x]
|
1193
|
+
for y in range(band_height - 2, -1, -1):
|
1194
|
+
x = path_x_coords[-1]
|
1195
|
+
|
1196
|
+
# Find which direction we came from
|
1197
|
+
candidates = [(x, dp[y, x])]
|
1198
|
+
if x > 0:
|
1199
|
+
candidates.append((x - 1, dp[y, x - 1]))
|
1200
|
+
if x < band_width - 1:
|
1201
|
+
candidates.append((x + 1, dp[y, x + 1]))
|
1202
|
+
|
1203
|
+
next_x = min(candidates, key=lambda c: c[1])[0]
|
1204
|
+
path_x_coords.append(next_x)
|
1205
|
+
|
1206
|
+
# Return median x-coordinate of the path
|
1207
|
+
median_x = np.median(path_x_coords)
|
1208
|
+
return x0 + median_x
|
1209
|
+
|
1210
|
+
def _stabilize_with_rows(
|
1211
|
+
self,
|
1212
|
+
separators: List[float],
|
1213
|
+
obj,
|
1214
|
+
bboxes: List[Tuple[float, float, float, float]],
|
1215
|
+
header_y: float,
|
1216
|
+
) -> List[float]:
|
1217
|
+
"""Stabilize separators using row-wise analysis."""
|
1218
|
+
if not bboxes:
|
1219
|
+
return separators
|
1220
|
+
|
1221
|
+
# Detect rows by finding horizontal gaps
|
1222
|
+
# bbox format is (x0, top, x1, bottom)
|
1223
|
+
y_coords = sorted(set([bbox[1] for bbox in bboxes] + [bbox[3] for bbox in bboxes]))
|
1224
|
+
|
1225
|
+
# Find gaps larger than typical line height
|
1226
|
+
gaps = []
|
1227
|
+
for i in range(len(y_coords) - 1):
|
1228
|
+
gap_size = y_coords[i + 1] - y_coords[i]
|
1229
|
+
if gap_size > 5: # Minimum gap to consider a row boundary
|
1230
|
+
gaps.append((y_coords[i], y_coords[i + 1]))
|
1231
|
+
|
1232
|
+
if not gaps:
|
1233
|
+
return separators
|
1234
|
+
|
1235
|
+
# For each separator, collect positions across rows
|
1236
|
+
stabilized = []
|
1237
|
+
for i, sep in enumerate(separators):
|
1238
|
+
row_positions = []
|
1239
|
+
|
1240
|
+
for gap_start, gap_end in gaps:
|
1241
|
+
# Get elements in this row
|
1242
|
+
row_elements = [
|
1243
|
+
bbox for bbox in bboxes if bbox[1] >= gap_start and bbox[3] <= gap_end
|
1244
|
+
]
|
1245
|
+
|
1246
|
+
if row_elements:
|
1247
|
+
# Find best position in this row
|
1248
|
+
if i == 0:
|
1249
|
+
# First separator - look left of content
|
1250
|
+
x0 = 0
|
1251
|
+
x1 = sep + 20
|
1252
|
+
elif i == len(separators) - 1:
|
1253
|
+
# Last separator - look right of content
|
1254
|
+
x0 = sep - 20
|
1255
|
+
x1 = float("inf")
|
1256
|
+
else:
|
1257
|
+
# Middle separator - look around current position
|
1258
|
+
x0 = sep - 20
|
1259
|
+
x1 = sep + 20
|
1260
|
+
|
1261
|
+
# Find minimum crossing position in this range
|
1262
|
+
best_x = self._find_min_crossing_separator(
|
1263
|
+
max(x0, sep - 20), min(x1, sep + 20), row_elements, 50
|
1264
|
+
)
|
1265
|
+
row_positions.append(best_x)
|
1266
|
+
|
1267
|
+
# Use median of row positions if we have enough samples
|
1268
|
+
if len(row_positions) >= 3:
|
1269
|
+
stabilized.append(np.median(row_positions))
|
1270
|
+
else:
|
1271
|
+
stabilized.append(sep)
|
1272
|
+
|
1273
|
+
return stabilized
|
1274
|
+
|
944
1275
|
def from_stripes(
|
945
1276
|
self,
|
946
1277
|
stripes=None,
|
@@ -4143,6 +4474,34 @@ class Guides:
|
|
4143
4474
|
else:
|
4144
4475
|
raise ValueError(f"Target object {target_obj} is not a Page or Region")
|
4145
4476
|
|
4477
|
+
# Check if we have guides in only one dimension
|
4478
|
+
has_verticals = len(self.vertical) > 0
|
4479
|
+
has_horizontals = len(self.horizontal) > 0
|
4480
|
+
|
4481
|
+
# If we have guides in only one dimension, use direct extraction with explicit lines
|
4482
|
+
if (has_verticals and not has_horizontals) or (has_horizontals and not has_verticals):
|
4483
|
+
logger.debug(
|
4484
|
+
f"Partial guides detected - using direct extraction (v={has_verticals}, h={has_horizontals})"
|
4485
|
+
)
|
4486
|
+
|
4487
|
+
# Extract directly from the target using explicit lines
|
4488
|
+
if hasattr(target_obj, "extract_table"):
|
4489
|
+
return target_obj.extract_table(
|
4490
|
+
method=method, # Let auto-detection work when None
|
4491
|
+
table_settings=table_settings,
|
4492
|
+
use_ocr=use_ocr,
|
4493
|
+
ocr_config=ocr_config,
|
4494
|
+
text_options=text_options,
|
4495
|
+
cell_extraction_func=cell_extraction_func,
|
4496
|
+
show_progress=show_progress,
|
4497
|
+
content_filter=content_filter,
|
4498
|
+
verticals=list(self.vertical) if has_verticals else None,
|
4499
|
+
horizontals=list(self.horizontal) if has_horizontals else None,
|
4500
|
+
)
|
4501
|
+
else:
|
4502
|
+
raise ValueError(f"Target object {type(target_obj)} does not support extract_table")
|
4503
|
+
|
4504
|
+
# Both dimensions have guides - use normal grid-based extraction
|
4146
4505
|
try:
|
4147
4506
|
# Step 1: Build grid structure (creates temporary regions)
|
4148
4507
|
grid_result = self.build_grid(
|
@@ -1286,6 +1286,10 @@ class ElementManager:
|
|
1286
1286
|
|
1287
1287
|
fill_col = rc.get("non_stroking_color")
|
1288
1288
|
# We keep colour as metadata but no longer filter on it
|
1289
|
+
# Note: pdfminer.six has a bug where it may report incorrect colors
|
1290
|
+
# when no explicit color space is set. E.g., '1 1 0 sc' (RGB yellow)
|
1291
|
+
# is parsed as 0.0 (grayscale black) because pdfminer defaults to
|
1292
|
+
# DeviceGray and only reads 1 component from the stack.
|
1289
1293
|
if fill_col is None:
|
1290
1294
|
continue
|
1291
1295
|
|
natural_pdf/core/page.py
CHANGED
@@ -30,6 +30,7 @@ from tqdm.auto import tqdm # Added tqdm import
|
|
30
30
|
from natural_pdf.elements.element_collection import ElementCollection
|
31
31
|
from natural_pdf.elements.region import Region
|
32
32
|
from natural_pdf.selectors.parser import parse_selector
|
33
|
+
from natural_pdf.tables.result import TableResult
|
33
34
|
from natural_pdf.utils.locks import pdf_render_lock # Import from utils instead
|
34
35
|
from natural_pdf.utils.visualization import render_plain_page
|
35
36
|
|
@@ -866,26 +867,33 @@ class Page(
|
|
866
867
|
if debug:
|
867
868
|
print(f" - Added direct region '{label}': {exclusion_item}")
|
868
869
|
|
869
|
-
# Process direct Element objects - convert to Region
|
870
|
+
# Process direct Element objects - only convert to Region if method is "region"
|
870
871
|
elif hasattr(exclusion_item, "bbox") and hasattr(exclusion_item, "expand"):
|
871
|
-
|
872
|
-
|
873
|
-
|
874
|
-
|
875
|
-
expanded_region
|
876
|
-
|
877
|
-
|
878
|
-
|
879
|
-
|
880
|
-
|
881
|
-
|
872
|
+
if method == "region":
|
873
|
+
try:
|
874
|
+
# Convert Element to Region using expand()
|
875
|
+
expanded_region = exclusion_item.expand()
|
876
|
+
if isinstance(expanded_region, Region):
|
877
|
+
expanded_region.label = label
|
878
|
+
regions.append(expanded_region)
|
879
|
+
if debug:
|
880
|
+
print(
|
881
|
+
f" - Converted direct Element to Region '{label}': {expanded_region}"
|
882
|
+
)
|
883
|
+
else:
|
884
|
+
if debug:
|
885
|
+
print(
|
886
|
+
f" - Element.expand() did not return a Region: {type(expanded_region)}"
|
887
|
+
)
|
888
|
+
except Exception as e:
|
882
889
|
if debug:
|
883
|
-
print(
|
884
|
-
|
885
|
-
|
886
|
-
except Exception as e:
|
890
|
+
print(f" - Failed to convert Element to Region: {e}")
|
891
|
+
else:
|
892
|
+
# method == "element" - will be handled in _filter_elements_by_exclusions
|
887
893
|
if debug:
|
888
|
-
print(
|
894
|
+
print(
|
895
|
+
f" - Skipping element '{label}' (will be handled as element-based exclusion)"
|
896
|
+
)
|
889
897
|
|
890
898
|
# Process string selectors (from PDF-level exclusions)
|
891
899
|
elif isinstance(exclusion_item, str):
|
@@ -1245,15 +1253,46 @@ class Page(
|
|
1245
1253
|
Returns:
|
1246
1254
|
ElementCollection of matching elements (unfiltered by exclusions)
|
1247
1255
|
"""
|
1248
|
-
from natural_pdf.selectors.parser import selector_to_filter_func
|
1256
|
+
from natural_pdf.selectors.parser import _calculate_aggregates, selector_to_filter_func
|
1249
1257
|
|
1250
1258
|
# Handle compound OR selectors
|
1251
1259
|
if selector_obj.get("type") == "or":
|
1252
1260
|
# For OR selectors, search all elements and let the filter function decide
|
1253
1261
|
elements_to_search = self._element_mgr.get_all_elements()
|
1254
1262
|
|
1263
|
+
# Check if any sub-selector contains aggregate functions
|
1264
|
+
has_aggregates = False
|
1265
|
+
for sub_selector in selector_obj.get("selectors", []):
|
1266
|
+
for attr in sub_selector.get("attributes", []):
|
1267
|
+
value = attr.get("value")
|
1268
|
+
if isinstance(value, dict) and value.get("type") == "aggregate":
|
1269
|
+
has_aggregates = True
|
1270
|
+
break
|
1271
|
+
if has_aggregates:
|
1272
|
+
break
|
1273
|
+
|
1274
|
+
# Calculate aggregates if needed - for OR selectors we calculate on ALL elements
|
1275
|
+
aggregates = {}
|
1276
|
+
if has_aggregates:
|
1277
|
+
# Need to calculate aggregates for each sub-selector type
|
1278
|
+
for sub_selector in selector_obj.get("selectors", []):
|
1279
|
+
sub_type = sub_selector.get("type", "any").lower()
|
1280
|
+
if sub_type == "text":
|
1281
|
+
sub_elements = self._element_mgr.words
|
1282
|
+
elif sub_type == "rect":
|
1283
|
+
sub_elements = self._element_mgr.rects
|
1284
|
+
elif sub_type == "line":
|
1285
|
+
sub_elements = self._element_mgr.lines
|
1286
|
+
elif sub_type == "region":
|
1287
|
+
sub_elements = self._element_mgr.regions
|
1288
|
+
else:
|
1289
|
+
sub_elements = elements_to_search
|
1290
|
+
|
1291
|
+
sub_aggregates = _calculate_aggregates(sub_elements, sub_selector)
|
1292
|
+
aggregates.update(sub_aggregates)
|
1293
|
+
|
1255
1294
|
# Create filter function from compound selector
|
1256
|
-
filter_func = selector_to_filter_func(selector_obj, **kwargs)
|
1295
|
+
filter_func = selector_to_filter_func(selector_obj, aggregates=aggregates, **kwargs)
|
1257
1296
|
|
1258
1297
|
# Apply the filter to all elements
|
1259
1298
|
matching_elements = [element for element in elements_to_search if filter_func(element)]
|
@@ -1309,8 +1348,23 @@ class Page(
|
|
1309
1348
|
else:
|
1310
1349
|
elements_to_search = self._element_mgr.get_all_elements()
|
1311
1350
|
|
1351
|
+
# Check if selector contains aggregate functions
|
1352
|
+
has_aggregates = False
|
1353
|
+
for attr in selector_obj.get("attributes", []):
|
1354
|
+
value = attr.get("value")
|
1355
|
+
if isinstance(value, dict) and value.get("type") == "aggregate":
|
1356
|
+
has_aggregates = True
|
1357
|
+
break
|
1358
|
+
|
1359
|
+
# Calculate aggregates if needed
|
1360
|
+
aggregates = {}
|
1361
|
+
if has_aggregates:
|
1362
|
+
# For aggregates, we need to calculate based on ALL elements of the same type
|
1363
|
+
# not just the filtered subset
|
1364
|
+
aggregates = _calculate_aggregates(elements_to_search, selector_obj)
|
1365
|
+
|
1312
1366
|
# Create filter function from selector, passing any additional parameters
|
1313
|
-
filter_func = selector_to_filter_func(selector_obj, **kwargs)
|
1367
|
+
filter_func = selector_to_filter_func(selector_obj, aggregates=aggregates, **kwargs)
|
1314
1368
|
|
1315
1369
|
# Apply the filter to matching elements
|
1316
1370
|
matching_elements = [element for element in elements_to_search if filter_func(element)]
|
@@ -1857,7 +1911,9 @@ class Page(
|
|
1857
1911
|
cell_extraction_func: Optional[Callable[["Region"], Optional[str]]] = None,
|
1858
1912
|
show_progress: bool = False,
|
1859
1913
|
content_filter=None,
|
1860
|
-
|
1914
|
+
verticals: Optional[List[float]] = None,
|
1915
|
+
horizontals: Optional[List[float]] = None,
|
1916
|
+
) -> TableResult:
|
1861
1917
|
"""
|
1862
1918
|
Extract the largest table from this page using enhanced region-based extraction.
|
1863
1919
|
|
@@ -1874,9 +1930,11 @@ class Page(
|
|
1874
1930
|
- A regex pattern string (characters matching the pattern are EXCLUDED)
|
1875
1931
|
- A callable that takes text and returns True to KEEP the character
|
1876
1932
|
- A list of regex patterns (characters matching ANY pattern are EXCLUDED)
|
1933
|
+
verticals: Optional list of x-coordinates for explicit vertical table lines.
|
1934
|
+
horizontals: Optional list of y-coordinates for explicit horizontal table lines.
|
1877
1935
|
|
1878
1936
|
Returns:
|
1879
|
-
|
1937
|
+
TableResult: A sequence-like object containing table rows that also provides .to_df() for pandas conversion.
|
1880
1938
|
"""
|
1881
1939
|
# Create a full-page region and delegate to its enhanced extract_table method
|
1882
1940
|
page_region = self.create_region(0, 0, self.width, self.height)
|
@@ -1889,6 +1947,8 @@ class Page(
|
|
1889
1947
|
cell_extraction_func=cell_extraction_func,
|
1890
1948
|
show_progress=show_progress,
|
1891
1949
|
content_filter=content_filter,
|
1950
|
+
verticals=verticals,
|
1951
|
+
horizontals=horizontals,
|
1892
1952
|
)
|
1893
1953
|
|
1894
1954
|
def extract_tables(
|
@@ -2768,6 +2828,7 @@ class Page(
|
|
2768
2828
|
region.start_element = current_start_element
|
2769
2829
|
region.end_element = end_boundary_el # Mark the element that ended it
|
2770
2830
|
region.is_end_next_start = True # Mark how it ended
|
2831
|
+
region._boundary_exclusions = include_boundaries
|
2771
2832
|
regions.append(region)
|
2772
2833
|
else: # horizontal
|
2773
2834
|
sec_left = (
|
@@ -2787,6 +2848,7 @@ class Page(
|
|
2787
2848
|
region.start_element = current_start_element
|
2788
2849
|
region.end_element = end_boundary_el # Mark the element that ended it
|
2789
2850
|
region.is_end_next_start = True # Mark how it ended
|
2851
|
+
region._boundary_exclusions = include_boundaries
|
2790
2852
|
regions.append(region)
|
2791
2853
|
active_section_started = False # Reset for the new start
|
2792
2854
|
|
@@ -2815,6 +2877,7 @@ class Page(
|
|
2815
2877
|
region.start_element = current_start_element
|
2816
2878
|
region.end_element = end_boundary_el
|
2817
2879
|
region.is_end_next_start = False
|
2880
|
+
region._boundary_exclusions = include_boundaries
|
2818
2881
|
regions.append(region)
|
2819
2882
|
else: # horizontal
|
2820
2883
|
sec_left = (
|
@@ -2834,6 +2897,7 @@ class Page(
|
|
2834
2897
|
region.start_element = current_start_element
|
2835
2898
|
region.end_element = end_boundary_el
|
2836
2899
|
region.is_end_next_start = False
|
2900
|
+
region._boundary_exclusions = include_boundaries
|
2837
2901
|
regions.append(region)
|
2838
2902
|
|
2839
2903
|
# Reset: section ended explicitly
|
@@ -2854,6 +2918,7 @@ class Page(
|
|
2854
2918
|
region.start_element = current_start_element
|
2855
2919
|
region.end_element = None # Ended by page end
|
2856
2920
|
region.is_end_next_start = False
|
2921
|
+
region._boundary_exclusions = include_boundaries
|
2857
2922
|
regions.append(region)
|
2858
2923
|
else: # horizontal
|
2859
2924
|
sec_left = (
|
@@ -2867,6 +2932,7 @@ class Page(
|
|
2867
2932
|
region.start_element = current_start_element
|
2868
2933
|
region.end_element = None # Ended by page end
|
2869
2934
|
region.is_end_next_start = False
|
2935
|
+
region._boundary_exclusions = include_boundaries
|
2870
2936
|
regions.append(region)
|
2871
2937
|
|
2872
2938
|
return ElementCollection(regions)
|