natural-pdf 0.2.16__py3-none-any.whl → 0.2.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +45 -0
- natural_pdf/analyzers/guides.py +359 -0
- natural_pdf/core/element_manager.py +4 -0
- natural_pdf/core/page.py +130 -31
- natural_pdf/core/page_collection.py +75 -0
- natural_pdf/core/pdf.py +33 -0
- natural_pdf/describe/base.py +48 -7
- natural_pdf/elements/base.py +408 -43
- natural_pdf/elements/element_collection.py +83 -10
- natural_pdf/elements/region.py +217 -178
- natural_pdf/elements/text.py +5 -3
- natural_pdf/flows/element.py +1 -0
- natural_pdf/flows/flow.py +175 -480
- natural_pdf/flows/region.py +76 -0
- natural_pdf/selectors/parser.py +180 -9
- natural_pdf/utils/pdfminer_patches.py +136 -0
- natural_pdf/utils/sections.py +346 -0
- natural_pdf/utils/spatial.py +172 -0
- {natural_pdf-0.2.16.dist-info → natural_pdf-0.2.18.dist-info}/METADATA +1 -1
- {natural_pdf-0.2.16.dist-info → natural_pdf-0.2.18.dist-info}/RECORD +24 -21
- {natural_pdf-0.2.16.dist-info → natural_pdf-0.2.18.dist-info}/WHEEL +0 -0
- {natural_pdf-0.2.16.dist-info → natural_pdf-0.2.18.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.2.16.dist-info → natural_pdf-0.2.18.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.2.16.dist-info → natural_pdf-0.2.18.dist-info}/top_level.txt +0 -0
natural_pdf/__init__.py
CHANGED
@@ -62,14 +62,59 @@ class Options:
|
|
62
62
|
# Text extraction defaults (empty for now)
|
63
63
|
self.text = ConfigSection()
|
64
64
|
|
65
|
+
# Layout and navigation defaults
|
66
|
+
self.layout = ConfigSection(
|
67
|
+
directional_offset=0.01, # Offset in points when using directional methods
|
68
|
+
auto_multipage=False, # Whether directional methods span pages by default
|
69
|
+
)
|
70
|
+
|
65
71
|
|
66
72
|
# Create global options instance
|
67
73
|
options = Options()
|
68
74
|
|
69
75
|
|
76
|
+
def set_option(name: str, value):
    """
    Assign a value to an existing global Natural PDF option.

    The dotted ``name`` is walked attribute by attribute starting from the
    module-level ``options`` object; the last component is the attribute
    that receives ``value``. Only attributes that already exist can be set.

    Args:
        name: Option name in dot notation (e.g., 'layout.auto_multipage')
        value: New value for the option

    Raises:
        KeyError: If an intermediate section or the final option name is
            not an existing attribute.

    Example:
        import natural_pdf as npdf
        npdf.set_option('layout.auto_multipage', True)
        npdf.set_option('ocr.engine', 'surya')
    """
    # str.split always yields at least one item, so ``leaf`` is always bound.
    *section_path, leaf = name.split(".")

    target = options
    for section in section_path:
        if not hasattr(target, section):
            raise KeyError(f"Unknown option section: {section}")
        target = getattr(target, section)

    if not hasattr(target, leaf):
        raise KeyError(f"Unknown option: {name}")
    setattr(target, leaf, value)
|
105
|
+
|
106
|
+
|
70
107
|
# Version
|
71
108
|
__version__ = "0.1.1"
|
72
109
|
|
110
|
+
# Apply pdfminer patches for known bugs at import time; a failure is logged as a warning instead of being raised
|
111
|
+
try:
|
112
|
+
from natural_pdf.utils.pdfminer_patches import apply_patches
|
113
|
+
|
114
|
+
apply_patches()
|
115
|
+
except Exception as e:
|
116
|
+
logger.warning(f"Failed to apply pdfminer patches: {e}")
|
117
|
+
|
73
118
|
from natural_pdf.analyzers.guides import Guides
|
74
119
|
from natural_pdf.core.page import Page
|
75
120
|
from natural_pdf.core.page_collection import PageCollection
|
natural_pdf/analyzers/guides.py
CHANGED
@@ -941,6 +941,337 @@ class GuidesList(UserList):
|
|
941
941
|
self.data.clear()
|
942
942
|
return self._parent
|
943
943
|
|
944
|
+
def from_headers(
    self,
    headers: Union["ElementCollection", List["Element"]],
    obj: Optional[Union["Page", "Region"]] = None,
    method: Literal["min_crossings", "seam_carving"] = "min_crossings",
    min_width: Optional[float] = None,
    max_width: Optional[float] = None,
    margin: float = 0.5,
    row_stabilization: bool = True,
    num_samples: int = 400,
    *,
    append: bool = False,
) -> "Guides":
    """Create vertical guides for columns based on headers and whitespace valleys.

    Headers are sorted left to right; for every adjacent pair a separator is
    searched in the horizontal band between them, using the text elements
    that lie below the lowest header. The target's left and right bounds are
    added as outer separators when not already present.

    Args:
        headers: Column header elements (ElementCollection or list of Elements)
        obj: Page/Region to analyze (uses parent's context if None)
        method: Detection method:
            - 'min_crossings': Fast vector-based minimum intersection count
            - 'seam_carving': Dynamic programming for curved boundaries
        min_width: Minimum distance between consecutive separators, in the
            same units as the element coordinates
        max_width: Maximum distance between consecutive separators, in the
            same units as the element coordinates
        margin: Buffer space from header edges when searching for separators (default: 0.5)
        row_stabilization: Whether to use row-wise median for stability
        num_samples: Number of x-positions to test per gap (for min_crossings)
        append: Whether to append to existing guides

    Returns:
        Parent Guides object for chaining. The guides are left untouched
        (and a warning is logged) when fewer than two headers are given or
        the bounds of the target cannot be determined.

    Raises:
        ValueError: If called on a non-vertical guide list, if ``method`` is
            not one of the supported names, or if neither ``obj`` nor a
            parent context is available.

    Examples:
        # Create column guides from headers
        headers = page.find_all('text[size=16]')
        guides.vertical.from_headers(headers)

        # With width constraints
        guides.vertical.from_headers(headers, min_width=50, max_width=200)

        # Seam carving for complex layouts
        guides.vertical.from_headers(headers, method='seam_carving')
    """

    if self._axis != "vertical":
        raise ValueError("from_headers() only works for vertical guides (columns)")

    # Fail fast on a misspelled method instead of silently running seam carving.
    if method not in ("min_crossings", "seam_carving"):
        raise ValueError(
            f"Unknown method {method!r}; expected 'min_crossings' or 'seam_carving'"
        )

    # Identity check: an explicitly passed object must win even if it is falsy.
    target_obj = obj if obj is not None else self._parent.context
    if target_obj is None:
        raise ValueError("No object provided and no context available")

    # Convert headers to list if ElementCollection
    if hasattr(headers, "elements"):
        header_elements = list(headers.elements)
    else:
        header_elements = list(headers)

    # Sort headers by x-position
    header_elements.sort(key=lambda h: h.x0 if hasattr(h, "x0") else 0)

    # Need at least 2 headers
    if len(header_elements) < 2:
        logger.warning("Need at least 2 headers for column detection")
        return self._parent

    # Get page bounds
    if hasattr(target_obj, "bbox"):
        page_bounds = target_obj.bbox
    elif hasattr(target_obj, "width") and hasattr(target_obj, "height"):
        # Create bbox from width/height
        page_bounds = (0, 0, target_obj.width, target_obj.height)
    else:
        page_bounds = None

    if not page_bounds:
        logger.warning("Could not determine page bounds")
        return self._parent

    # Get text below headers for occupancy analysis
    header_bottom = max(h.bottom for h in header_elements)
    all_text = target_obj.find_all("text")
    body_elements = [elem for elem in all_text if elem.top > header_bottom]

    # Extract bounding boxes as (x0, top, x1, bottom)
    bboxes = [(elem.x0, elem.top, elem.x1, elem.bottom) for elem in body_elements]

    # Find separators between each header pair
    separators = []
    logger.debug(f"Processing {len(header_elements)} headers for column detection")
    for h_left, h_right in zip(header_elements, header_elements[1:]):
        # Define search band
        left_edge = h_left.x1 if hasattr(h_left, "x1") else h_left.right
        right_edge = h_right.x0 if hasattr(h_right, "x0") else h_right.left
        gap = right_edge - left_edge

        # If gap is too small (or headers overlap), place separator in the middle
        if gap <= 2 * margin:
            separators.append((left_edge + right_edge) / 2)
            continue

        # Normal case - search within the band
        x0 = left_edge + margin
        x1 = right_edge - margin

        # Band narrower than the minimum width: just center the separator
        if min_width and (x1 - x0) < min_width:
            separators.append((x0 + x1) / 2)
            continue

        if method == "min_crossings":
            separator = self._find_min_crossing_separator(x0, x1, bboxes, num_samples)
        else:  # seam_carving
            separator = self._find_seam_carving_separator(
                x0, x1, target_obj, header_bottom, page_bounds[3], bboxes
            )

        # Apply width constraints only if they don't conflict with header positions
        if separators:
            previous = separators[-1]
            if min_width and separator - previous < min_width:
                # Only enforce if it doesn't push into next header
                proposed = previous + min_width
                if proposed < right_edge:
                    separator = proposed
            if max_width and separator - previous > max_width:
                # Only enforce if it doesn't pull back into the left header
                proposed = previous + max_width
                if proposed > left_edge:
                    separator = proposed

        separators.append(separator)

    # Ensure we have page boundaries
    if separators:
        if not any(abs(sep - page_bounds[0]) < 0.1 for sep in separators):
            separators.insert(0, page_bounds[0])
        if not any(abs(sep - page_bounds[2]) < 0.1 for sep in separators):
            separators.append(page_bounds[2])

    # Apply row stabilization if requested
    if row_stabilization and separators:
        separators = self._stabilize_with_rows(separators, target_obj, bboxes, header_bottom)

    # Update guides
    if append:
        self.extend(separators)
    else:
        self.data = separators

    return self._parent
|
1099
|
+
|
1100
|
+
def _find_min_crossing_separator(
|
1101
|
+
self,
|
1102
|
+
x0: float,
|
1103
|
+
x1: float,
|
1104
|
+
bboxes: List[Tuple[float, float, float, float]],
|
1105
|
+
num_samples: int,
|
1106
|
+
) -> float:
|
1107
|
+
"""Find x-coordinate with minimum text crossings in band."""
|
1108
|
+
candidates = np.linspace(x0, x1, num_samples)
|
1109
|
+
|
1110
|
+
best_x = x0
|
1111
|
+
min_crossings = float("inf")
|
1112
|
+
best_gap = 0
|
1113
|
+
|
1114
|
+
for x in candidates:
|
1115
|
+
# Count how many bboxes this x-line crosses
|
1116
|
+
crossings = sum(1 for bbox in bboxes if bbox[0] < x < bbox[2])
|
1117
|
+
|
1118
|
+
# Calculate minimum gap to any edge (for tie-breaking)
|
1119
|
+
if crossings > 0:
|
1120
|
+
gaps = []
|
1121
|
+
for bbox in bboxes:
|
1122
|
+
if bbox[0] < x < bbox[2]:
|
1123
|
+
gaps.extend([abs(x - bbox[0]), abs(x - bbox[2])])
|
1124
|
+
min_gap = min(gaps) if gaps else float("inf")
|
1125
|
+
else:
|
1126
|
+
min_gap = float("inf")
|
1127
|
+
|
1128
|
+
# Update best if fewer crossings or same crossings but larger gap
|
1129
|
+
if crossings < min_crossings or (crossings == min_crossings and min_gap > best_gap):
|
1130
|
+
min_crossings = crossings
|
1131
|
+
best_x = x
|
1132
|
+
best_gap = min_gap
|
1133
|
+
|
1134
|
+
return best_x
|
1135
|
+
|
1136
|
+
def _find_seam_carving_separator(
|
1137
|
+
self,
|
1138
|
+
x0: float,
|
1139
|
+
x1: float,
|
1140
|
+
obj,
|
1141
|
+
header_y: float,
|
1142
|
+
page_bottom: float,
|
1143
|
+
bboxes: List[Tuple[float, float, float, float]],
|
1144
|
+
) -> float:
|
1145
|
+
"""Find optimal separator using seam carving (dynamic programming)."""
|
1146
|
+
# Create cost matrix
|
1147
|
+
band_width = int(x1 - x0)
|
1148
|
+
band_height = int(page_bottom - header_y)
|
1149
|
+
|
1150
|
+
if band_width <= 0 or band_height <= 0:
|
1151
|
+
return (x0 + x1) / 2
|
1152
|
+
|
1153
|
+
# Resolution for cost matrix (1 pixel = 1 point for now)
|
1154
|
+
cost_matrix = np.zeros((band_height, band_width))
|
1155
|
+
|
1156
|
+
# Fill cost matrix - high cost where text exists
|
1157
|
+
for bbox in bboxes:
|
1158
|
+
# Check if bbox intersects with our band
|
1159
|
+
# bbox format is (x0, top, x1, bottom)
|
1160
|
+
if bbox[2] > x0 and bbox[0] < x1 and bbox[3] > header_y:
|
1161
|
+
# Convert to band coordinates
|
1162
|
+
left = max(0, int(bbox[0] - x0))
|
1163
|
+
right = min(band_width, int(bbox[2] - x0))
|
1164
|
+
top = max(0, int(bbox[1] - header_y))
|
1165
|
+
bottom = min(band_height, int(bbox[3] - header_y))
|
1166
|
+
|
1167
|
+
# Set high cost for text regions
|
1168
|
+
cost_matrix[top:bottom, left:right] = 100
|
1169
|
+
|
1170
|
+
# Add small gradient cost to prefer straight lines
|
1171
|
+
for i in range(band_width):
|
1172
|
+
cost_matrix[:, i] += abs(i - band_width // 2) * 0.1
|
1173
|
+
|
1174
|
+
# Dynamic programming to find minimum cost path
|
1175
|
+
dp = np.full_like(cost_matrix, np.inf)
|
1176
|
+
dp[0, :] = cost_matrix[0, :]
|
1177
|
+
|
1178
|
+
# Fill DP table
|
1179
|
+
for y in range(1, band_height):
|
1180
|
+
for x in range(band_width):
|
1181
|
+
# Can come from directly above or diagonally
|
1182
|
+
dp[y, x] = cost_matrix[y, x] + dp[y - 1, x]
|
1183
|
+
if x > 0:
|
1184
|
+
dp[y, x] = min(dp[y, x], cost_matrix[y, x] + dp[y - 1, x - 1])
|
1185
|
+
if x < band_width - 1:
|
1186
|
+
dp[y, x] = min(dp[y, x], cost_matrix[y, x] + dp[y - 1, x + 1])
|
1187
|
+
|
1188
|
+
# Find minimum cost at bottom
|
1189
|
+
min_x = np.argmin(dp[-1, :])
|
1190
|
+
|
1191
|
+
# Trace back to get path
|
1192
|
+
path_x_coords = [min_x]
|
1193
|
+
for y in range(band_height - 2, -1, -1):
|
1194
|
+
x = path_x_coords[-1]
|
1195
|
+
|
1196
|
+
# Find which direction we came from
|
1197
|
+
candidates = [(x, dp[y, x])]
|
1198
|
+
if x > 0:
|
1199
|
+
candidates.append((x - 1, dp[y, x - 1]))
|
1200
|
+
if x < band_width - 1:
|
1201
|
+
candidates.append((x + 1, dp[y, x + 1]))
|
1202
|
+
|
1203
|
+
next_x = min(candidates, key=lambda c: c[1])[0]
|
1204
|
+
path_x_coords.append(next_x)
|
1205
|
+
|
1206
|
+
# Return median x-coordinate of the path
|
1207
|
+
median_x = np.median(path_x_coords)
|
1208
|
+
return x0 + median_x
|
1209
|
+
|
1210
|
+
def _stabilize_with_rows(
|
1211
|
+
self,
|
1212
|
+
separators: List[float],
|
1213
|
+
obj,
|
1214
|
+
bboxes: List[Tuple[float, float, float, float]],
|
1215
|
+
header_y: float,
|
1216
|
+
) -> List[float]:
|
1217
|
+
"""Stabilize separators using row-wise analysis."""
|
1218
|
+
if not bboxes:
|
1219
|
+
return separators
|
1220
|
+
|
1221
|
+
# Detect rows by finding horizontal gaps
|
1222
|
+
# bbox format is (x0, top, x1, bottom)
|
1223
|
+
y_coords = sorted(set([bbox[1] for bbox in bboxes] + [bbox[3] for bbox in bboxes]))
|
1224
|
+
|
1225
|
+
# Find gaps larger than typical line height
|
1226
|
+
gaps = []
|
1227
|
+
for i in range(len(y_coords) - 1):
|
1228
|
+
gap_size = y_coords[i + 1] - y_coords[i]
|
1229
|
+
if gap_size > 5: # Minimum gap to consider a row boundary
|
1230
|
+
gaps.append((y_coords[i], y_coords[i + 1]))
|
1231
|
+
|
1232
|
+
if not gaps:
|
1233
|
+
return separators
|
1234
|
+
|
1235
|
+
# For each separator, collect positions across rows
|
1236
|
+
stabilized = []
|
1237
|
+
for i, sep in enumerate(separators):
|
1238
|
+
row_positions = []
|
1239
|
+
|
1240
|
+
for gap_start, gap_end in gaps:
|
1241
|
+
# Get elements in this row
|
1242
|
+
row_elements = [
|
1243
|
+
bbox for bbox in bboxes if bbox[1] >= gap_start and bbox[3] <= gap_end
|
1244
|
+
]
|
1245
|
+
|
1246
|
+
if row_elements:
|
1247
|
+
# Find best position in this row
|
1248
|
+
if i == 0:
|
1249
|
+
# First separator - look left of content
|
1250
|
+
x0 = 0
|
1251
|
+
x1 = sep + 20
|
1252
|
+
elif i == len(separators) - 1:
|
1253
|
+
# Last separator - look right of content
|
1254
|
+
x0 = sep - 20
|
1255
|
+
x1 = float("inf")
|
1256
|
+
else:
|
1257
|
+
# Middle separator - look around current position
|
1258
|
+
x0 = sep - 20
|
1259
|
+
x1 = sep + 20
|
1260
|
+
|
1261
|
+
# Find minimum crossing position in this range
|
1262
|
+
best_x = self._find_min_crossing_separator(
|
1263
|
+
max(x0, sep - 20), min(x1, sep + 20), row_elements, 50
|
1264
|
+
)
|
1265
|
+
row_positions.append(best_x)
|
1266
|
+
|
1267
|
+
# Use median of row positions if we have enough samples
|
1268
|
+
if len(row_positions) >= 3:
|
1269
|
+
stabilized.append(np.median(row_positions))
|
1270
|
+
else:
|
1271
|
+
stabilized.append(sep)
|
1272
|
+
|
1273
|
+
return stabilized
|
1274
|
+
|
944
1275
|
def from_stripes(
|
945
1276
|
self,
|
946
1277
|
stripes=None,
|
@@ -4143,6 +4474,34 @@ class Guides:
|
|
4143
4474
|
else:
|
4144
4475
|
raise ValueError(f"Target object {target_obj} is not a Page or Region")
|
4145
4476
|
|
4477
|
+
# Check if we have guides in only one dimension
|
4478
|
+
has_verticals = len(self.vertical) > 0
|
4479
|
+
has_horizontals = len(self.horizontal) > 0
|
4480
|
+
|
4481
|
+
# If we have guides in only one dimension, use direct extraction with explicit lines
|
4482
|
+
if (has_verticals and not has_horizontals) or (has_horizontals and not has_verticals):
|
4483
|
+
logger.debug(
|
4484
|
+
f"Partial guides detected - using direct extraction (v={has_verticals}, h={has_horizontals})"
|
4485
|
+
)
|
4486
|
+
|
4487
|
+
# Extract directly from the target using explicit lines
|
4488
|
+
if hasattr(target_obj, "extract_table"):
|
4489
|
+
return target_obj.extract_table(
|
4490
|
+
method=method, # Let auto-detection work when None
|
4491
|
+
table_settings=table_settings,
|
4492
|
+
use_ocr=use_ocr,
|
4493
|
+
ocr_config=ocr_config,
|
4494
|
+
text_options=text_options,
|
4495
|
+
cell_extraction_func=cell_extraction_func,
|
4496
|
+
show_progress=show_progress,
|
4497
|
+
content_filter=content_filter,
|
4498
|
+
verticals=list(self.vertical) if has_verticals else None,
|
4499
|
+
horizontals=list(self.horizontal) if has_horizontals else None,
|
4500
|
+
)
|
4501
|
+
else:
|
4502
|
+
raise ValueError(f"Target object {type(target_obj)} does not support extract_table")
|
4503
|
+
|
4504
|
+
# Both dimensions have guides - use normal grid-based extraction
|
4146
4505
|
try:
|
4147
4506
|
# Step 1: Build grid structure (creates temporary regions)
|
4148
4507
|
grid_result = self.build_grid(
|
@@ -1286,6 +1286,10 @@ class ElementManager:
|
|
1286
1286
|
|
1287
1287
|
fill_col = rc.get("non_stroking_color")
|
1288
1288
|
# We keep colour as metadata but no longer filter on it
|
1289
|
+
# Note: pdfminer.six has a bug where it may report incorrect colors
|
1290
|
+
# when no explicit color space is set. E.g., '1 1 0 sc' (RGB yellow)
|
1291
|
+
# is parsed as 0.0 (grayscale black) because pdfminer defaults to
|
1292
|
+
# DeviceGray and only reads 1 component from the stack.
|
1289
1293
|
if fill_col is None:
|
1290
1294
|
continue
|
1291
1295
|
|