natural-pdf 0.1.13__py3-none-any.whl → 0.1.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/shape_detection_mixin.py +554 -273
- natural_pdf/core/page.py +127 -13
- natural_pdf/elements/base.py +20 -20
- natural_pdf/elements/region.py +167 -33
- natural_pdf/flows/element.py +2 -2
- {natural_pdf-0.1.13.dist-info → natural_pdf-0.1.15.dist-info}/METADATA +1 -1
- {natural_pdf-0.1.13.dist-info → natural_pdf-0.1.15.dist-info}/RECORD +10 -10
- {natural_pdf-0.1.13.dist-info → natural_pdf-0.1.15.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.13.dist-info → natural_pdf-0.1.15.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.13.dist-info → natural_pdf-0.1.15.dist-info}/top_level.txt +0 -0
natural_pdf/core/page.py
CHANGED
@@ -1138,31 +1138,145 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
|
1138
1138
|
logger.debug(f"Page {self.number}: extract_text finished, result length: {len(result)}.")
|
1139
1139
|
return result
|
1140
1140
|
|
1141
|
-
def extract_table(
|
1141
|
+
def extract_table(
    self,
    method: Optional[str] = None,
    table_settings: Optional[dict] = None,
    use_ocr: bool = False,
    ocr_config: Optional[dict] = None,
    text_options: Optional[Dict] = None,
    cell_extraction_func: Optional[Callable[["Region"], Optional[str]]] = None,
    show_progress: bool = False,
) -> List[List[Optional[str]]]:
    """
    Extract the largest table from this page using enhanced region-based extraction.

    The page itself holds no table logic: it builds a region spanning the
    whole page and forwards every argument, unchanged, to that region's
    ``extract_table``.

    Args:
        method: Method to use: 'tatr', 'pdfplumber', 'text', 'stream', 'lattice', or None (auto-detect).
        table_settings: Settings for pdfplumber table extraction.
        use_ocr: Whether to use OCR for text extraction (currently only applicable with 'tatr' method).
        ocr_config: OCR configuration parameters.
        text_options: Dictionary of options for the 'text' method.
        cell_extraction_func: Optional callable function that takes a cell Region object
                              and returns its string content. For 'text' method only.
        show_progress: If True, display a progress bar during cell text extraction for the 'text' method.

    Returns:
        Table data as a list of rows, where each row is a list of cell values (str or None).
    """
    # Create a full-page region and delegate to its enhanced extract_table method
    page_region = self.create_region(0, 0, self.width, self.height)
    return page_region.extract_table(
        method=method,
        table_settings=table_settings,
        use_ocr=use_ocr,
        ocr_config=ocr_config,
        text_options=text_options,
        cell_extraction_func=cell_extraction_func,
        show_progress=show_progress,
    )
|
1153
1178
|
|
1154
|
-
def extract_tables(
|
1179
|
+
def extract_tables(
    self,
    method: Optional[str] = None,
    table_settings: Optional[dict] = None,
    check_tatr: bool = True,
) -> List[List[List[Optional[str]]]]:
    """
    Extract all tables from this page with enhanced method support.

    Resolution order:
      1. If ``check_tatr`` is set and TATR table regions exist on the page,
         tables extracted from those regions are returned immediately;
         ``method`` is not consulted (or validated) in that case.
      2. If ``method`` is None, 'lattice' (line-based) is tried first and
         'stream' (text-based) is used when lattice yields no cell with
         non-whitespace text or raises.
      3. Otherwise the named method is used.

    Args:
        method: Method to use: 'pdfplumber', 'stream', 'lattice', or None (auto-detect).
                'stream' uses text-based strategies, 'lattice' uses line-based strategies.
                Note: 'tatr' and 'text' methods are not supported for extract_tables.
        table_settings: Settings for pdfplumber table extraction. Strategy keys
                        supplied here win over the defaults implied by ``method``.
                        The dict is never modified.
        check_tatr: If True (default), first check for TATR-detected table regions
                    and extract from those before falling back to pdfplumber methods.

    Returns:
        List of tables, where each table is a list of rows, and each row is a list of cell values.

    Raises:
        ValueError: If ``method`` is not None, 'pdfplumber', 'stream' or 'lattice'
                    (only reached when no TATR tables were returned first).
    """
    settings = {} if table_settings is None else table_settings

    # Check for TATR-detected table regions first if enabled
    if check_tatr:
        try:
            tatr_tables = self.find_all("region[type=table][model=tatr]")
            if tatr_tables:
                logger.debug(f"Page {self.number}: Found {len(tatr_tables)} TATR table regions, extracting from those...")
                extracted_tables = []
                for table_region in tatr_tables:
                    # One bad region must not discard the tables that did extract.
                    try:
                        table_data = table_region.extract_table(method="tatr")
                        if table_data:  # Only add non-empty tables
                            extracted_tables.append(table_data)
                    except Exception as e:
                        logger.warning(f"Failed to extract table from TATR region {table_region.bbox}: {e}")

                if extracted_tables:
                    logger.debug(f"Page {self.number}: Successfully extracted {len(extracted_tables)} tables from TATR regions")
                    return extracted_tables
                logger.debug(f"Page {self.number}: TATR regions found but no tables extracted, falling back to pdfplumber")
            else:
                logger.debug(f"Page {self.number}: No TATR table regions found, using pdfplumber methods")
        except Exception as e:
            # Best effort: any failure while probing TATR regions falls through to pdfplumber.
            logger.debug(f"Page {self.number}: Error checking TATR regions: {e}, falling back to pdfplumber")

    # Auto-detect method if not specified (try lattice first, then stream)
    if method is None:
        logger.debug(f"Page {self.number}: Auto-detecting tables extraction method...")

        try:
            lattice_settings = settings.copy()
            lattice_settings.setdefault("vertical_strategy", "lines")
            lattice_settings.setdefault("horizontal_strategy", "lines")

            logger.debug(f"Page {self.number}: Trying 'lattice' method first for tables...")
            lattice_result = self._page.extract_tables(lattice_settings)

            # "Meaningful" = at least one cell holding non-whitespace text.
            has_text = any(
                cell and cell.strip()
                for table in (lattice_result or ())
                for row in table
                for cell in row
            )
            if has_text:
                logger.debug(f"Page {self.number}: 'lattice' method found {len(lattice_result)} tables")
                return lattice_result
            logger.debug(f"Page {self.number}: 'lattice' method found no meaningful tables")
        except Exception as e:
            logger.debug(f"Page {self.number}: 'lattice' method failed: {e}")

        # Fall back to stream
        logger.debug(f"Page {self.number}: Falling back to 'stream' method for tables...")
        stream_settings = settings.copy()
        stream_settings.setdefault("vertical_strategy", "text")
        stream_settings.setdefault("horizontal_strategy", "text")
        return self._page.extract_tables(stream_settings)

    # Handle method aliases. The defaults go onto a copy so that a settings
    # dict reused by the caller does not carry one call's strategies into
    # the next.
    if method == "stream":
        logger.debug("Using 'stream' method alias for 'pdfplumber' with text-based strategies.")
        settings = settings.copy()
        settings.setdefault("vertical_strategy", "text")
        settings.setdefault("horizontal_strategy", "text")
    elif method == "lattice":
        logger.debug("Using 'lattice' method alias for 'pdfplumber' with line-based strategies.")
        settings = settings.copy()
        settings.setdefault("vertical_strategy", "lines")
        settings.setdefault("horizontal_strategy", "lines")
    elif method != "pdfplumber":
        raise ValueError(
            f"Unknown tables extraction method: '{method}'. Choose from 'pdfplumber', 'stream', 'lattice'."
        )

    return self._page.extract_tables(settings)
|
1166
1280
|
|
1167
1281
|
def _load_elements(self):
|
1168
1282
|
"""Load all elements from the page via ElementManager."""
|
@@ -2198,7 +2312,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
|
2198
2312
|
def viewer(
|
2199
2313
|
self,
|
2200
2314
|
# elements_to_render: Optional[List['Element']] = None, # No longer needed, from_page handles it
|
2201
|
-
#
|
2315
|
+
# include_source_types: List[str] = ['word', 'line', 'rect', 'region'] # No longer needed
|
2202
2316
|
) -> Optional["SimpleInteractiveViewerWidget"]: # Return type hint updated
|
2203
2317
|
"""
|
2204
2318
|
Creates and returns an interactive ipywidget for exploring elements on this page.
|
natural_pdf/elements/base.py
CHANGED
@@ -59,7 +59,7 @@ class DirectionalMixin:
|
|
59
59
|
direction: str,
|
60
60
|
size: Optional[float] = None,
|
61
61
|
cross_size: str = "full",
|
62
|
-
|
62
|
+
include_source: bool = False,
|
63
63
|
until: Optional[str] = None,
|
64
64
|
include_endpoint: bool = True,
|
65
65
|
**kwargs,
|
@@ -71,7 +71,7 @@ class DirectionalMixin:
|
|
71
71
|
direction: 'left', 'right', 'above', or 'below'
|
72
72
|
size: Size in the primary direction (width for horizontal, height for vertical)
|
73
73
|
cross_size: Size in the cross direction ('full' or 'element')
|
74
|
-
|
74
|
+
include_source: Whether to include this element/region's area in the result
|
75
75
|
until: Optional selector string to specify a boundary element
|
76
76
|
include_endpoint: Whether to include the boundary element found by 'until'
|
77
77
|
**kwargs: Additional parameters for the 'until' selector search
|
@@ -85,7 +85,7 @@ class DirectionalMixin:
|
|
85
85
|
is_positive = direction in ("right", "below") # right/below are positive directions
|
86
86
|
pixel_offset = 1 # Offset for excluding elements/endpoints
|
87
87
|
|
88
|
-
# 1. Determine initial boundaries based on direction and
|
88
|
+
# 1. Determine initial boundaries based on direction and include_source
|
89
89
|
if is_horizontal:
|
90
90
|
# Initial cross-boundaries (vertical)
|
91
91
|
y0 = 0 if cross_size == "full" else self.top
|
@@ -93,11 +93,11 @@ class DirectionalMixin:
|
|
93
93
|
|
94
94
|
# Initial primary boundaries (horizontal)
|
95
95
|
if is_positive: # right
|
96
|
-
x0_initial = self.x0 if
|
96
|
+
x0_initial = self.x0 if include_source else self.x1 + pixel_offset
|
97
97
|
x1_initial = self.x1 # This edge moves
|
98
98
|
else: # left
|
99
99
|
x0_initial = self.x0 # This edge moves
|
100
|
-
x1_initial = self.x1 if
|
100
|
+
x1_initial = self.x1 if include_source else self.x0 - pixel_offset
|
101
101
|
else: # Vertical
|
102
102
|
# Initial cross-boundaries (horizontal)
|
103
103
|
x0 = 0 if cross_size == "full" else self.x0
|
@@ -105,11 +105,11 @@ class DirectionalMixin:
|
|
105
105
|
|
106
106
|
# Initial primary boundaries (vertical)
|
107
107
|
if is_positive: # below
|
108
|
-
y0_initial = self.top if
|
108
|
+
y0_initial = self.top if include_source else self.bottom + pixel_offset
|
109
109
|
y1_initial = self.bottom # This edge moves
|
110
110
|
else: # above
|
111
111
|
y0_initial = self.top # This edge moves
|
112
|
-
y1_initial = self.bottom if
|
112
|
+
y1_initial = self.bottom if include_source else self.top - pixel_offset
|
113
113
|
|
114
114
|
# 2. Calculate the final primary boundary, considering 'size' or page limits
|
115
115
|
if is_horizontal:
|
@@ -195,7 +195,7 @@ class DirectionalMixin:
|
|
195
195
|
|
196
196
|
result = Region(self.page, final_bbox)
|
197
197
|
result.source_element = self
|
198
|
-
result.includes_source =
|
198
|
+
result.includes_source = include_source
|
199
199
|
# Optionally store the boundary element if found
|
200
200
|
if target:
|
201
201
|
result.boundary_element = target
|
@@ -206,7 +206,7 @@ class DirectionalMixin:
|
|
206
206
|
self,
|
207
207
|
height: Optional[float] = None,
|
208
208
|
width: str = "full",
|
209
|
-
|
209
|
+
include_source: bool = False,
|
210
210
|
until: Optional[str] = None,
|
211
211
|
include_endpoint: bool = True,
|
212
212
|
**kwargs,
|
@@ -217,7 +217,7 @@ class DirectionalMixin:
|
|
217
217
|
Args:
|
218
218
|
height: Height of the region above, in points
|
219
219
|
width: Width mode - "full" for full page width or "element" for element width
|
220
|
-
|
220
|
+
include_source: Whether to include this element/region in the result (default: False)
|
221
221
|
until: Optional selector string to specify an upper boundary element
|
222
222
|
include_endpoint: Whether to include the boundary element in the region (default: True)
|
223
223
|
**kwargs: Additional parameters
|
@@ -229,7 +229,7 @@ class DirectionalMixin:
|
|
229
229
|
direction="above",
|
230
230
|
size=height,
|
231
231
|
cross_size=width,
|
232
|
-
|
232
|
+
include_source=include_source,
|
233
233
|
until=until,
|
234
234
|
include_endpoint=include_endpoint,
|
235
235
|
**kwargs,
|
@@ -239,7 +239,7 @@ class DirectionalMixin:
|
|
239
239
|
self,
|
240
240
|
height: Optional[float] = None,
|
241
241
|
width: str = "full",
|
242
|
-
|
242
|
+
include_source: bool = False,
|
243
243
|
until: Optional[str] = None,
|
244
244
|
include_endpoint: bool = True,
|
245
245
|
**kwargs,
|
@@ -250,7 +250,7 @@ class DirectionalMixin:
|
|
250
250
|
Args:
|
251
251
|
height: Height of the region below, in points
|
252
252
|
width: Width mode - "full" for full page width or "element" for element width
|
253
|
-
|
253
|
+
include_source: Whether to include this element/region in the result (default: False)
|
254
254
|
until: Optional selector string to specify a lower boundary element
|
255
255
|
include_endpoint: Whether to include the boundary element in the region (default: True)
|
256
256
|
**kwargs: Additional parameters
|
@@ -262,7 +262,7 @@ class DirectionalMixin:
|
|
262
262
|
direction="below",
|
263
263
|
size=height,
|
264
264
|
cross_size=width,
|
265
|
-
|
265
|
+
include_source=include_source,
|
266
266
|
until=until,
|
267
267
|
include_endpoint=include_endpoint,
|
268
268
|
**kwargs,
|
@@ -272,7 +272,7 @@ class DirectionalMixin:
|
|
272
272
|
self,
|
273
273
|
width: Optional[float] = None,
|
274
274
|
height: str = "full",
|
275
|
-
|
275
|
+
include_source: bool = False,
|
276
276
|
until: Optional[str] = None,
|
277
277
|
include_endpoint: bool = True,
|
278
278
|
**kwargs,
|
@@ -283,7 +283,7 @@ class DirectionalMixin:
|
|
283
283
|
Args:
|
284
284
|
width: Width of the region to the left, in points
|
285
285
|
height: Height mode - "full" for full page height or "element" for element height
|
286
|
-
|
286
|
+
include_source: Whether to include this element/region in the result (default: False)
|
287
287
|
until: Optional selector string to specify a left boundary element
|
288
288
|
include_endpoint: Whether to include the boundary element in the region (default: True)
|
289
289
|
**kwargs: Additional parameters
|
@@ -295,7 +295,7 @@ class DirectionalMixin:
|
|
295
295
|
direction="left",
|
296
296
|
size=width,
|
297
297
|
cross_size=height,
|
298
|
-
|
298
|
+
include_source=include_source,
|
299
299
|
until=until,
|
300
300
|
include_endpoint=include_endpoint,
|
301
301
|
**kwargs,
|
@@ -305,7 +305,7 @@ class DirectionalMixin:
|
|
305
305
|
self,
|
306
306
|
width: Optional[float] = None,
|
307
307
|
height: str = "full",
|
308
|
-
|
308
|
+
include_source: bool = False,
|
309
309
|
until: Optional[str] = None,
|
310
310
|
include_endpoint: bool = True,
|
311
311
|
**kwargs,
|
@@ -316,7 +316,7 @@ class DirectionalMixin:
|
|
316
316
|
Args:
|
317
317
|
width: Width of the region to the right, in points
|
318
318
|
height: Height mode - "full" for full page height or "element" for element height
|
319
|
-
|
319
|
+
include_source: Whether to include this element/region in the result (default: False)
|
320
320
|
until: Optional selector string to specify a right boundary element
|
321
321
|
include_endpoint: Whether to include the boundary element in the region (default: True)
|
322
322
|
**kwargs: Additional parameters
|
@@ -328,7 +328,7 @@ class DirectionalMixin:
|
|
328
328
|
direction="right",
|
329
329
|
size=width,
|
330
330
|
cross_size=height,
|
331
|
-
|
331
|
+
include_source=include_source,
|
332
332
|
until=until,
|
333
333
|
include_endpoint=include_endpoint,
|
334
334
|
**kwargs,
|