natural-pdf 0.1.14__py3-none-any.whl → 0.1.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
natural_pdf/core/page.py CHANGED
@@ -1138,31 +1138,145 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
1138
1138
  logger.debug(f"Page {self.number}: extract_text finished, result length: {len(result)}.")
1139
1139
  return result
1140
1140
 
1141
- def extract_table(self, table_settings={}) -> List[Any]:
1141
+ def extract_table(
1142
+ self,
1143
+ method: Optional[str] = None,
1144
+ table_settings: Optional[dict] = None,
1145
+ use_ocr: bool = False,
1146
+ ocr_config: Optional[dict] = None,
1147
+ text_options: Optional[Dict] = None,
1148
+ cell_extraction_func: Optional[Callable[["Region"], Optional[str]]] = None,
1149
+ show_progress: bool = False,
1150
+ ) -> List[List[Optional[str]]]:
1142
1151
  """
1143
- Extract the largest table from this page.
1152
+ Extract the largest table from this page using enhanced region-based extraction.
1144
1153
 
1145
1154
  Args:
1146
- table_settings: Additional extraction parameters
1155
+ method: Method to use: 'tatr', 'pdfplumber', 'text', 'stream', 'lattice', or None (auto-detect).
1156
+ table_settings: Settings for pdfplumber table extraction.
1157
+ use_ocr: Whether to use OCR for text extraction (currently only applicable with 'tatr' method).
1158
+ ocr_config: OCR configuration parameters.
1159
+ text_options: Dictionary of options for the 'text' method.
1160
+ cell_extraction_func: Optional callable function that takes a cell Region object
1161
+ and returns its string content. For 'text' method only.
1162
+ show_progress: If True, display a progress bar during cell text extraction for the 'text' method.
1147
1163
 
1148
1164
  Returns:
1149
- List of extracted tables (or None if no table found)
1165
+ Table data as a list of rows, where each row is a list of cell values (str or None).
1150
1166
  """
1151
- # pdfplumber returns None if no table found
1152
- return self._page.extract_table(table_settings)
1167
+ # Create a full-page region and delegate to its enhanced extract_table method
1168
+ page_region = self.create_region(0, 0, self.width, self.height)
1169
+ return page_region.extract_table(
1170
+ method=method,
1171
+ table_settings=table_settings,
1172
+ use_ocr=use_ocr,
1173
+ ocr_config=ocr_config,
1174
+ text_options=text_options,
1175
+ cell_extraction_func=cell_extraction_func,
1176
+ show_progress=show_progress,
1177
+ )
1153
1178
 
1154
- def extract_tables(self, table_settings={}) -> List[Any]:
1179
+ def extract_tables(
1180
+ self,
1181
+ method: Optional[str] = None,
1182
+ table_settings: Optional[dict] = None,
1183
+ check_tatr: bool = True,
1184
+ ) -> List[List[List[str]]]:
1155
1185
  """
1156
- Extract tables from this page.
1186
+ Extract all tables from this page with enhanced method support.
1157
1187
 
1158
1188
  Args:
1159
- table_settings: Additional extraction parameters
1189
+ method: Method to use: 'pdfplumber', 'stream', 'lattice', or None (auto-detect).
1190
+ 'stream' uses text-based strategies, 'lattice' uses line-based strategies.
1191
+ Note: 'tatr' and 'text' methods are not supported for extract_tables.
1192
+ table_settings: Settings for pdfplumber table extraction.
1193
+ check_tatr: If True (default), first check for TATR-detected table regions
1194
+ and extract from those before falling back to pdfplumber methods.
1160
1195
 
1161
1196
  Returns:
1162
- List of extracted tables
1197
+ List of tables, where each table is a list of rows, and each row is a list of cell values.
1163
1198
  """
1164
- # pdfplumber returns list of tables
1165
- return self._page.extract_tables(table_settings)
1199
+ if table_settings is None:
1200
+ table_settings = {}
1201
+
1202
+ # Check for TATR-detected table regions first if enabled
1203
+ if check_tatr:
1204
+ try:
1205
+ tatr_tables = self.find_all("region[type=table][model=tatr]")
1206
+ if tatr_tables:
1207
+ logger.debug(f"Page {self.number}: Found {len(tatr_tables)} TATR table regions, extracting from those...")
1208
+ extracted_tables = []
1209
+ for table_region in tatr_tables:
1210
+ try:
1211
+ table_data = table_region.extract_table(method="tatr")
1212
+ if table_data: # Only add non-empty tables
1213
+ extracted_tables.append(table_data)
1214
+ except Exception as e:
1215
+ logger.warning(f"Failed to extract table from TATR region {table_region.bbox}: {e}")
1216
+
1217
+ if extracted_tables:
1218
+ logger.debug(f"Page {self.number}: Successfully extracted {len(extracted_tables)} tables from TATR regions")
1219
+ return extracted_tables
1220
+ else:
1221
+ logger.debug(f"Page {self.number}: TATR regions found but no tables extracted, falling back to pdfplumber")
1222
+ else:
1223
+ logger.debug(f"Page {self.number}: No TATR table regions found, using pdfplumber methods")
1224
+ except Exception as e:
1225
+ logger.debug(f"Page {self.number}: Error checking TATR regions: {e}, falling back to pdfplumber")
1226
+
1227
+ # Auto-detect method if not specified (try lattice first, then stream)
1228
+ if method is None:
1229
+ logger.debug(f"Page {self.number}: Auto-detecting tables extraction method...")
1230
+
1231
+ # Try lattice first
1232
+ try:
1233
+ lattice_settings = table_settings.copy()
1234
+ lattice_settings.setdefault("vertical_strategy", "lines")
1235
+ lattice_settings.setdefault("horizontal_strategy", "lines")
1236
+
1237
+ logger.debug(f"Page {self.number}: Trying 'lattice' method first for tables...")
1238
+ lattice_result = self._page.extract_tables(lattice_settings)
1239
+
1240
+ # Check if lattice found meaningful tables
1241
+ if (lattice_result and len(lattice_result) > 0 and
1242
+ any(any(any(cell and cell.strip() for cell in row if cell) for row in table if table) for table in lattice_result)):
1243
+ logger.debug(f"Page {self.number}: 'lattice' method found {len(lattice_result)} tables")
1244
+ return lattice_result
1245
+ else:
1246
+ logger.debug(f"Page {self.number}: 'lattice' method found no meaningful tables")
1247
+
1248
+ except Exception as e:
1249
+ logger.debug(f"Page {self.number}: 'lattice' method failed: {e}")
1250
+
1251
+ # Fall back to stream
1252
+ logger.debug(f"Page {self.number}: Falling back to 'stream' method for tables...")
1253
+ stream_settings = table_settings.copy()
1254
+ stream_settings.setdefault("vertical_strategy", "text")
1255
+ stream_settings.setdefault("horizontal_strategy", "text")
1256
+
1257
+ return self._page.extract_tables(stream_settings)
1258
+
1259
+ effective_method = method
1260
+
1261
+ # Handle method aliases
1262
+ if effective_method == "stream":
1263
+ logger.debug("Using 'stream' method alias for 'pdfplumber' with text-based strategies.")
1264
+ effective_method = "pdfplumber"
1265
+ table_settings.setdefault("vertical_strategy", "text")
1266
+ table_settings.setdefault("horizontal_strategy", "text")
1267
+ elif effective_method == "lattice":
1268
+ logger.debug("Using 'lattice' method alias for 'pdfplumber' with line-based strategies.")
1269
+ effective_method = "pdfplumber"
1270
+ table_settings.setdefault("vertical_strategy", "lines")
1271
+ table_settings.setdefault("horizontal_strategy", "lines")
1272
+
1273
+ # Use the selected method
1274
+ if effective_method == "pdfplumber":
1275
+ return self._page.extract_tables(table_settings)
1276
+ else:
1277
+ raise ValueError(
1278
+ f"Unknown tables extraction method: '{method}'. Choose from 'pdfplumber', 'stream', 'lattice'."
1279
+ )
1166
1280
 
1167
1281
  def _load_elements(self):
1168
1282
  """Load all elements from the page via ElementManager."""
@@ -1247,8 +1247,12 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1247
1247
  Extract a table from this region.
1248
1248
 
1249
1249
  Args:
1250
- method: Method to use: 'tatr', 'plumber', 'text', or None (auto-detect).
1251
- table_settings: Settings for pdfplumber table extraction (used only with 'plumber' method).
1250
+ method: Method to use: 'tatr', 'pdfplumber', 'text', 'stream', 'lattice', or None (auto-detect).
1251
+ 'stream' is an alias for 'pdfplumber' with text-based strategies (equivalent to
1252
+ setting `vertical_strategy` and `horizontal_strategy` to 'text').
1253
+ 'lattice' is an alias for 'pdfplumber' with line-based strategies (equivalent to
1254
+ setting `vertical_strategy` and `horizontal_strategy` to 'lines').
1255
+ table_settings: Settings for pdfplumber table extraction (used with 'pdfplumber', 'stream', or 'lattice' methods).
1252
1256
  use_ocr: Whether to use OCR for text extraction (currently only applicable with 'tatr' method).
1253
1257
  ocr_config: OCR configuration parameters.
1254
1258
  text_options: Dictionary of options for the 'text' method, corresponding to arguments
@@ -1268,13 +1272,47 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1268
1272
  text_options = {} # Initialize empty dict
1269
1273
 
1270
1274
  # Auto-detect method if not specified
1271
- effective_method = method
1272
- if effective_method is None:
1275
+ if method is None:
1273
1276
  # If this is a TATR-detected region, use TATR method
1274
1277
  if hasattr(self, "model") and self.model == "tatr" and self.region_type == "table":
1275
1278
  effective_method = "tatr"
1276
1279
  else:
1277
- effective_method = "plumber"
1280
+ # Try lattice first, then fall back to stream if no meaningful results
1281
+ logger.debug(f"Region {self.bbox}: Auto-detecting table extraction method...")
1282
+
1283
+ try:
1284
+ logger.debug(f"Region {self.bbox}: Trying 'lattice' method first...")
1285
+ lattice_result = self.extract_table('lattice', table_settings=table_settings.copy())
1286
+
1287
+ # Check if lattice found meaningful content
1288
+ if (lattice_result and len(lattice_result) > 0 and
1289
+ any(any(cell and cell.strip() for cell in row if cell) for row in lattice_result)):
1290
+ logger.debug(f"Region {self.bbox}: 'lattice' method found table with {len(lattice_result)} rows")
1291
+ return lattice_result
1292
+ else:
1293
+ logger.debug(f"Region {self.bbox}: 'lattice' method found no meaningful content")
1294
+ except Exception as e:
1295
+ logger.debug(f"Region {self.bbox}: 'lattice' method failed: {e}")
1296
+
1297
+ # Fall back to stream
1298
+ logger.debug(f"Region {self.bbox}: Falling back to 'stream' method...")
1299
+ return self.extract_table('stream', table_settings=table_settings.copy())
1300
+ else:
1301
+ effective_method = method
1302
+
1303
+ # Handle method aliases for pdfplumber
1304
+ if effective_method == "stream":
1305
+ logger.debug("Using 'stream' method alias for 'pdfplumber' with text-based strategies.")
1306
+ effective_method = "pdfplumber"
1307
+ # Set default text strategies if not already provided by the user
1308
+ table_settings.setdefault("vertical_strategy", "text")
1309
+ table_settings.setdefault("horizontal_strategy", "text")
1310
+ elif effective_method == "lattice":
1311
+ logger.debug("Using 'lattice' method alias for 'pdfplumber' with line-based strategies.")
1312
+ effective_method = "pdfplumber"
1313
+ # Set default line strategies if not already provided by the user
1314
+ table_settings.setdefault("vertical_strategy", "lines")
1315
+ table_settings.setdefault("horizontal_strategy", "lines")
1278
1316
 
1279
1317
  logger.debug(f"Region {self.bbox}: Extracting table using method '{effective_method}'")
1280
1318
 
@@ -1284,16 +1322,111 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1284
1322
  elif effective_method == "text":
1285
1323
  current_text_options = text_options.copy()
1286
1324
  current_text_options["cell_extraction_func"] = cell_extraction_func
1287
- # --- Pass show_progress to the helper --- #
1288
1325
  current_text_options["show_progress"] = show_progress
1289
1326
  return self._extract_table_text(**current_text_options)
1290
- elif effective_method == "plumber":
1327
+ elif effective_method == "pdfplumber":
1291
1328
  return self._extract_table_plumber(table_settings)
1292
1329
  else:
1293
1330
  raise ValueError(
1294
- f"Unknown table extraction method: '{effective_method}'. Choose from 'tatr', 'plumber', 'text'."
1331
+ f"Unknown table extraction method: '{method}'. Choose from 'tatr', 'pdfplumber', 'text', 'stream', 'lattice'."
1295
1332
  )
1296
1333
 
1334
+
1335
+ def extract_tables(
1336
+ self,
1337
+ method: Optional[str] = None,
1338
+ table_settings: Optional[dict] = None,
1339
+ ) -> List[List[List[str]]]:
1340
+ """
1341
+ Extract all tables from this region using pdfplumber-based methods.
1342
+
1343
+ Note: Only 'pdfplumber', 'stream', and 'lattice' methods are supported for extract_tables.
1344
+ 'tatr' and 'text' methods are designed for single table extraction only.
1345
+
1346
+ Args:
1347
+ method: Method to use: 'pdfplumber', 'stream', 'lattice', or None (auto-detect).
1348
+ 'stream' uses text-based strategies, 'lattice' uses line-based strategies.
1349
+ table_settings: Settings for pdfplumber table extraction.
1350
+
1351
+ Returns:
1352
+ List of tables, where each table is a list of rows, and each row is a list of cell values.
1353
+ """
1354
+ if table_settings is None:
1355
+ table_settings = {}
1356
+
1357
+ # Auto-detect method if not specified (try lattice first, then stream)
1358
+ if method is None:
1359
+ logger.debug(f"Region {self.bbox}: Auto-detecting tables extraction method...")
1360
+
1361
+ # Try lattice first
1362
+ try:
1363
+ lattice_settings = table_settings.copy()
1364
+ lattice_settings.setdefault("vertical_strategy", "lines")
1365
+ lattice_settings.setdefault("horizontal_strategy", "lines")
1366
+
1367
+ logger.debug(f"Region {self.bbox}: Trying 'lattice' method first for tables...")
1368
+ lattice_result = self._extract_tables_plumber(lattice_settings)
1369
+
1370
+ # Check if lattice found meaningful tables
1371
+ if (lattice_result and len(lattice_result) > 0 and
1372
+ any(any(any(cell and cell.strip() for cell in row if cell) for row in table if table) for table in lattice_result)):
1373
+ logger.debug(f"Region {self.bbox}: 'lattice' method found {len(lattice_result)} tables")
1374
+ return lattice_result
1375
+ else:
1376
+ logger.debug(f"Region {self.bbox}: 'lattice' method found no meaningful tables")
1377
+
1378
+ except Exception as e:
1379
+ logger.debug(f"Region {self.bbox}: 'lattice' method failed: {e}")
1380
+
1381
+ # Fall back to stream
1382
+ logger.debug(f"Region {self.bbox}: Falling back to 'stream' method for tables...")
1383
+ stream_settings = table_settings.copy()
1384
+ stream_settings.setdefault("vertical_strategy", "text")
1385
+ stream_settings.setdefault("horizontal_strategy", "text")
1386
+
1387
+ return self._extract_tables_plumber(stream_settings)
1388
+
1389
+ effective_method = method
1390
+
1391
+ # Handle method aliases
1392
+ if effective_method == "stream":
1393
+ logger.debug("Using 'stream' method alias for 'pdfplumber' with text-based strategies.")
1394
+ effective_method = "pdfplumber"
1395
+ table_settings.setdefault("vertical_strategy", "text")
1396
+ table_settings.setdefault("horizontal_strategy", "text")
1397
+ elif effective_method == "lattice":
1398
+ logger.debug("Using 'lattice' method alias for 'pdfplumber' with line-based strategies.")
1399
+ effective_method = "pdfplumber"
1400
+ table_settings.setdefault("vertical_strategy", "lines")
1401
+ table_settings.setdefault("horizontal_strategy", "lines")
1402
+
1403
+ # Use the selected method
1404
+ if effective_method == "pdfplumber":
1405
+ return self._extract_tables_plumber(table_settings)
1406
+ else:
1407
+ raise ValueError(
1408
+ f"Unknown tables extraction method: '{method}'. Choose from 'pdfplumber', 'stream', 'lattice'."
1409
+ )
1410
+
1411
+ def _extract_tables_plumber(self, table_settings: dict) -> List[List[List[str]]]:
1412
+ """
1413
+ Extract all tables using pdfplumber's table extraction.
1414
+
1415
+ Args:
1416
+ table_settings: Settings for pdfplumber table extraction
1417
+
1418
+ Returns:
1419
+ List of tables, where each table is a list of rows, and each row is a list of cell values
1420
+ """
1421
+ # Create a crop of the page for this region
1422
+ cropped = self.page._page.crop(self.bbox)
1423
+
1424
+ # Extract all tables from the cropped area
1425
+ tables = cropped.extract_tables(table_settings)
1426
+
1427
+ # Return the tables or an empty list if none found
1428
+ return tables if tables else []
1429
+
1297
1430
  def _extract_table_plumber(self, table_settings: dict) -> List[List[str]]:
1298
1431
  """
1299
1432
  Extract table using pdfplumber's table extraction.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.1.14
3
+ Version: 0.1.15
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
@@ -24,14 +24,14 @@ natural_pdf/collections/pdf_collection.py,sha256=nsbrzcsXAD2qVLLXhDYpljAb_WnjMNa
24
24
  natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
25
25
  natural_pdf/core/element_manager.py,sha256=_UdXu51sLi6STzc8Pj4k8R721G3yJixXDLuRHn3hmr8,25731
26
26
  natural_pdf/core/highlighting_service.py,sha256=tjMJpdJj2oaMGpdqiNHPcTJqID4nd-uBZ5v7KtPmoc0,36762
27
- natural_pdf/core/page.py,sha256=hg7EoYMbvgo9dXivBl6xb6dENobhSHt0Wuu36O5J900,111119
27
+ natural_pdf/core/page.py,sha256=M-KgTxceFebw0n1BehFAeQ0sxnCpIr9dZX10k2OJzUY,117518
28
28
  natural_pdf/core/pdf.py,sha256=395aBTg4Le4vABvQWgBhPm669nGJ8JdMToTs1UtQ2Vg,69575
29
29
  natural_pdf/elements/__init__.py,sha256=S8XeiNWJ1WcgnyYKdYV1yxQlAxCCO3FfITT8MQwNbyk,41
30
30
  natural_pdf/elements/base.py,sha256=NNF-iUzkip0UgfKTuqLE1jVJsq2yD7LUTvOQWMi_Jpc,39631
31
31
  natural_pdf/elements/collections.py,sha256=qd58tD3f-eojz90ICytlqu4Ej0OQoWgsxV4umQDhUvA,120809
32
32
  natural_pdf/elements/line.py,sha256=300kSFBDUBIudfeQtH_tzW9gTYRgRKUDPiTABw6J-BE,4782
33
33
  natural_pdf/elements/rect.py,sha256=kiVa3e377ZnqIOXc89d9ZSY4EcmDxtccdtUw-HOQzpw,3796
34
- natural_pdf/elements/region.py,sha256=wBBAcuudRqL1b9ojLdrXiwUIcQbTWEWTky_RbBuCgnU,115798
34
+ natural_pdf/elements/region.py,sha256=l9J6E7bAkxZoA603cfPKG1LuU7uRUPl4PArUBkuk7VI,122719
35
35
  natural_pdf/elements/text.py,sha256=13HvVZGinj2Vm_fFCAnqi7hohtoKvnpCp3VCfkpeAbc,11146
36
36
  natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
37
37
  natural_pdf/exporters/__init__.py,sha256=7MnvRLLQdwtg-ULu-8uK8C84GsKiJamyhRw_GgWhw7k,151
@@ -84,8 +84,8 @@ natural_pdf/utils/text_extraction.py,sha256=z6Jhy11pakYCsEpkvh8ldw6DkUFsYF1hCL9Y
84
84
  natural_pdf/utils/visualization.py,sha256=30pRWQdsRJh2pSObh-brKVsFgC1n8tHmSrta_UDnVPw,8989
85
85
  natural_pdf/widgets/__init__.py,sha256=O2fSDo604wDAP6UwUkmBq3eT91RSqHwBpAOQXq92S8s,214
86
86
  natural_pdf/widgets/viewer.py,sha256=ekgXTEfA48GrR-JjpCpgyBCXdf4IubV0pAXDJozcU7A,39196
87
- natural_pdf-0.1.14.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
88
- natural_pdf-0.1.14.dist-info/METADATA,sha256=NzaR_hcSyFH22knKZ-NMCct_XOo2nPUk83XHspTncyE,7674
89
- natural_pdf-0.1.14.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
90
- natural_pdf-0.1.14.dist-info/top_level.txt,sha256=Cyw1zmNDlUZfb5moU-WUWGprrwH7ln_8LDGdmMHF1xI,17
91
- natural_pdf-0.1.14.dist-info/RECORD,,
87
+ natural_pdf-0.1.15.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
88
+ natural_pdf-0.1.15.dist-info/METADATA,sha256=O8RUOiFgln7unuRhKey0Z6l90K71ktMY7WwpaiEyZdc,7674
89
+ natural_pdf-0.1.15.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
90
+ natural_pdf-0.1.15.dist-info/top_level.txt,sha256=Cyw1zmNDlUZfb5moU-WUWGprrwH7ln_8LDGdmMHF1xI,17
91
+ natural_pdf-0.1.15.dist-info/RECORD,,