natural-pdf 0.1.13__py3-none-any.whl → 0.1.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
natural_pdf/core/page.py CHANGED
@@ -1138,31 +1138,145 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
1138
1138
  logger.debug(f"Page {self.number}: extract_text finished, result length: {len(result)}.")
1139
1139
  return result
1140
1140
 
1141
- def extract_table(self, table_settings={}) -> List[Any]:
1141
+ def extract_table(
1142
+ self,
1143
+ method: Optional[str] = None,
1144
+ table_settings: Optional[dict] = None,
1145
+ use_ocr: bool = False,
1146
+ ocr_config: Optional[dict] = None,
1147
+ text_options: Optional[Dict] = None,
1148
+ cell_extraction_func: Optional[Callable[["Region"], Optional[str]]] = None,
1149
+ show_progress: bool = False,
1150
+ ) -> List[List[Optional[str]]]:
1142
1151
  """
1143
- Extract the largest table from this page.
1152
+ Extract the largest table from this page using enhanced region-based extraction.
1144
1153
 
1145
1154
  Args:
1146
- table_settings: Additional extraction parameters
1155
+ method: Method to use: 'tatr', 'pdfplumber', 'text', 'stream', 'lattice', or None (auto-detect).
1156
+ table_settings: Settings for pdfplumber table extraction.
1157
+ use_ocr: Whether to use OCR for text extraction (currently only applicable with 'tatr' method).
1158
+ ocr_config: OCR configuration parameters.
1159
+ text_options: Dictionary of options for the 'text' method.
1160
+ cell_extraction_func: Optional callable function that takes a cell Region object
1161
+ and returns its string content. For 'text' method only.
1162
+ show_progress: If True, display a progress bar during cell text extraction for the 'text' method.
1147
1163
 
1148
1164
  Returns:
1149
- List of extracted tables (or None if no table found)
1165
+ Table data as a list of rows, where each row is a list of cell values (str or None).
1150
1166
  """
1151
- # pdfplumber returns None if no table found
1152
- return self._page.extract_table(table_settings)
1167
+ # Create a full-page region and delegate to its enhanced extract_table method
1168
+ page_region = self.create_region(0, 0, self.width, self.height)
1169
+ return page_region.extract_table(
1170
+ method=method,
1171
+ table_settings=table_settings,
1172
+ use_ocr=use_ocr,
1173
+ ocr_config=ocr_config,
1174
+ text_options=text_options,
1175
+ cell_extraction_func=cell_extraction_func,
1176
+ show_progress=show_progress,
1177
+ )
1153
1178
 
1154
- def extract_tables(self, table_settings={}) -> List[Any]:
1179
+ def extract_tables(
1180
+ self,
1181
+ method: Optional[str] = None,
1182
+ table_settings: Optional[dict] = None,
1183
+ check_tatr: bool = True,
1184
+ ) -> List[List[List[str]]]:
1155
1185
  """
1156
- Extract tables from this page.
1186
+ Extract all tables from this page with enhanced method support.
1157
1187
 
1158
1188
  Args:
1159
- table_settings: Additional extraction parameters
1189
+ method: Method to use: 'pdfplumber', 'stream', 'lattice', or None (auto-detect).
1190
+ 'stream' uses text-based strategies, 'lattice' uses line-based strategies.
1191
+ Note: 'tatr' and 'text' methods are not supported for extract_tables.
1192
+ table_settings: Settings for pdfplumber table extraction.
1193
+ check_tatr: If True (default), first check for TATR-detected table regions
1194
+ and extract from those before falling back to pdfplumber methods.
1160
1195
 
1161
1196
  Returns:
1162
- List of extracted tables
1197
+ List of tables, where each table is a list of rows, and each row is a list of cell values.
1163
1198
  """
1164
- # pdfplumber returns list of tables
1165
- return self._page.extract_tables(table_settings)
1199
+ if table_settings is None:
1200
+ table_settings = {}
1201
+
1202
+ # Check for TATR-detected table regions first if enabled
1203
+ if check_tatr:
1204
+ try:
1205
+ tatr_tables = self.find_all("region[type=table][model=tatr]")
1206
+ if tatr_tables:
1207
+ logger.debug(f"Page {self.number}: Found {len(tatr_tables)} TATR table regions, extracting from those...")
1208
+ extracted_tables = []
1209
+ for table_region in tatr_tables:
1210
+ try:
1211
+ table_data = table_region.extract_table(method="tatr")
1212
+ if table_data: # Only add non-empty tables
1213
+ extracted_tables.append(table_data)
1214
+ except Exception as e:
1215
+ logger.warning(f"Failed to extract table from TATR region {table_region.bbox}: {e}")
1216
+
1217
+ if extracted_tables:
1218
+ logger.debug(f"Page {self.number}: Successfully extracted {len(extracted_tables)} tables from TATR regions")
1219
+ return extracted_tables
1220
+ else:
1221
+ logger.debug(f"Page {self.number}: TATR regions found but no tables extracted, falling back to pdfplumber")
1222
+ else:
1223
+ logger.debug(f"Page {self.number}: No TATR table regions found, using pdfplumber methods")
1224
+ except Exception as e:
1225
+ logger.debug(f"Page {self.number}: Error checking TATR regions: {e}, falling back to pdfplumber")
1226
+
1227
+ # Auto-detect method if not specified (try lattice first, then stream)
1228
+ if method is None:
1229
+ logger.debug(f"Page {self.number}: Auto-detecting tables extraction method...")
1230
+
1231
+ # Try lattice first
1232
+ try:
1233
+ lattice_settings = table_settings.copy()
1234
+ lattice_settings.setdefault("vertical_strategy", "lines")
1235
+ lattice_settings.setdefault("horizontal_strategy", "lines")
1236
+
1237
+ logger.debug(f"Page {self.number}: Trying 'lattice' method first for tables...")
1238
+ lattice_result = self._page.extract_tables(lattice_settings)
1239
+
1240
+ # Check if lattice found meaningful tables
1241
+ if (lattice_result and len(lattice_result) > 0 and
1242
+ any(any(any(cell and cell.strip() for cell in row if cell) for row in table if table) for table in lattice_result)):
1243
+ logger.debug(f"Page {self.number}: 'lattice' method found {len(lattice_result)} tables")
1244
+ return lattice_result
1245
+ else:
1246
+ logger.debug(f"Page {self.number}: 'lattice' method found no meaningful tables")
1247
+
1248
+ except Exception as e:
1249
+ logger.debug(f"Page {self.number}: 'lattice' method failed: {e}")
1250
+
1251
+ # Fall back to stream
1252
+ logger.debug(f"Page {self.number}: Falling back to 'stream' method for tables...")
1253
+ stream_settings = table_settings.copy()
1254
+ stream_settings.setdefault("vertical_strategy", "text")
1255
+ stream_settings.setdefault("horizontal_strategy", "text")
1256
+
1257
+ return self._page.extract_tables(stream_settings)
1258
+
1259
+ effective_method = method
1260
+
1261
+ # Handle method aliases
1262
+ if effective_method == "stream":
1263
+ logger.debug("Using 'stream' method alias for 'pdfplumber' with text-based strategies.")
1264
+ effective_method = "pdfplumber"
1265
+ table_settings.setdefault("vertical_strategy", "text")
1266
+ table_settings.setdefault("horizontal_strategy", "text")
1267
+ elif effective_method == "lattice":
1268
+ logger.debug("Using 'lattice' method alias for 'pdfplumber' with line-based strategies.")
1269
+ effective_method = "pdfplumber"
1270
+ table_settings.setdefault("vertical_strategy", "lines")
1271
+ table_settings.setdefault("horizontal_strategy", "lines")
1272
+
1273
+ # Use the selected method
1274
+ if effective_method == "pdfplumber":
1275
+ return self._page.extract_tables(table_settings)
1276
+ else:
1277
+ raise ValueError(
1278
+ f"Unknown tables extraction method: '{method}'. Choose from 'pdfplumber', 'stream', 'lattice'."
1279
+ )
1166
1280
 
1167
1281
  def _load_elements(self):
1168
1282
  """Load all elements from the page via ElementManager."""
@@ -2198,7 +2312,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
2198
2312
  def viewer(
2199
2313
  self,
2200
2314
  # elements_to_render: Optional[List['Element']] = None, # No longer needed, from_page handles it
2201
- # include_element_types: List[str] = ['word', 'line', 'rect', 'region'] # No longer needed
2315
+ # include_source_types: List[str] = ['word', 'line', 'rect', 'region'] # No longer needed
2202
2316
  ) -> Optional["SimpleInteractiveViewerWidget"]: # Return type hint updated
2203
2317
  """
2204
2318
  Creates and returns an interactive ipywidget for exploring elements on this page.
@@ -59,7 +59,7 @@ class DirectionalMixin:
59
59
  direction: str,
60
60
  size: Optional[float] = None,
61
61
  cross_size: str = "full",
62
- include_element: bool = False,
62
+ include_source: bool = False,
63
63
  until: Optional[str] = None,
64
64
  include_endpoint: bool = True,
65
65
  **kwargs,
@@ -71,7 +71,7 @@ class DirectionalMixin:
71
71
  direction: 'left', 'right', 'above', or 'below'
72
72
  size: Size in the primary direction (width for horizontal, height for vertical)
73
73
  cross_size: Size in the cross direction ('full' or 'element')
74
- include_element: Whether to include this element/region's area in the result
74
+ include_source: Whether to include this element/region's area in the result
75
75
  until: Optional selector string to specify a boundary element
76
76
  include_endpoint: Whether to include the boundary element found by 'until'
77
77
  **kwargs: Additional parameters for the 'until' selector search
@@ -85,7 +85,7 @@ class DirectionalMixin:
85
85
  is_positive = direction in ("right", "below") # right/below are positive directions
86
86
  pixel_offset = 1 # Offset for excluding elements/endpoints
87
87
 
88
- # 1. Determine initial boundaries based on direction and include_element
88
+ # 1. Determine initial boundaries based on direction and include_source
89
89
  if is_horizontal:
90
90
  # Initial cross-boundaries (vertical)
91
91
  y0 = 0 if cross_size == "full" else self.top
@@ -93,11 +93,11 @@ class DirectionalMixin:
93
93
 
94
94
  # Initial primary boundaries (horizontal)
95
95
  if is_positive: # right
96
- x0_initial = self.x0 if include_element else self.x1 + pixel_offset
96
+ x0_initial = self.x0 if include_source else self.x1 + pixel_offset
97
97
  x1_initial = self.x1 # This edge moves
98
98
  else: # left
99
99
  x0_initial = self.x0 # This edge moves
100
- x1_initial = self.x1 if include_element else self.x0 - pixel_offset
100
+ x1_initial = self.x1 if include_source else self.x0 - pixel_offset
101
101
  else: # Vertical
102
102
  # Initial cross-boundaries (horizontal)
103
103
  x0 = 0 if cross_size == "full" else self.x0
@@ -105,11 +105,11 @@ class DirectionalMixin:
105
105
 
106
106
  # Initial primary boundaries (vertical)
107
107
  if is_positive: # below
108
- y0_initial = self.top if include_element else self.bottom + pixel_offset
108
+ y0_initial = self.top if include_source else self.bottom + pixel_offset
109
109
  y1_initial = self.bottom # This edge moves
110
110
  else: # above
111
111
  y0_initial = self.top # This edge moves
112
- y1_initial = self.bottom if include_element else self.top - pixel_offset
112
+ y1_initial = self.bottom if include_source else self.top - pixel_offset
113
113
 
114
114
  # 2. Calculate the final primary boundary, considering 'size' or page limits
115
115
  if is_horizontal:
@@ -195,7 +195,7 @@ class DirectionalMixin:
195
195
 
196
196
  result = Region(self.page, final_bbox)
197
197
  result.source_element = self
198
- result.includes_source = include_element
198
+ result.includes_source = include_source
199
199
  # Optionally store the boundary element if found
200
200
  if target:
201
201
  result.boundary_element = target
@@ -206,7 +206,7 @@ class DirectionalMixin:
206
206
  self,
207
207
  height: Optional[float] = None,
208
208
  width: str = "full",
209
- include_element: bool = False,
209
+ include_source: bool = False,
210
210
  until: Optional[str] = None,
211
211
  include_endpoint: bool = True,
212
212
  **kwargs,
@@ -217,7 +217,7 @@ class DirectionalMixin:
217
217
  Args:
218
218
  height: Height of the region above, in points
219
219
  width: Width mode - "full" for full page width or "element" for element width
220
- include_element: Whether to include this element/region in the result (default: False)
220
+ include_source: Whether to include this element/region in the result (default: False)
221
221
  until: Optional selector string to specify an upper boundary element
222
222
  include_endpoint: Whether to include the boundary element in the region (default: True)
223
223
  **kwargs: Additional parameters
@@ -229,7 +229,7 @@ class DirectionalMixin:
229
229
  direction="above",
230
230
  size=height,
231
231
  cross_size=width,
232
- include_element=include_element,
232
+ include_source=include_source,
233
233
  until=until,
234
234
  include_endpoint=include_endpoint,
235
235
  **kwargs,
@@ -239,7 +239,7 @@ class DirectionalMixin:
239
239
  self,
240
240
  height: Optional[float] = None,
241
241
  width: str = "full",
242
- include_element: bool = False,
242
+ include_source: bool = False,
243
243
  until: Optional[str] = None,
244
244
  include_endpoint: bool = True,
245
245
  **kwargs,
@@ -250,7 +250,7 @@ class DirectionalMixin:
250
250
  Args:
251
251
  height: Height of the region below, in points
252
252
  width: Width mode - "full" for full page width or "element" for element width
253
- include_element: Whether to include this element/region in the result (default: False)
253
+ include_source: Whether to include this element/region in the result (default: False)
254
254
  until: Optional selector string to specify a lower boundary element
255
255
  include_endpoint: Whether to include the boundary element in the region (default: True)
256
256
  **kwargs: Additional parameters
@@ -262,7 +262,7 @@ class DirectionalMixin:
262
262
  direction="below",
263
263
  size=height,
264
264
  cross_size=width,
265
- include_element=include_element,
265
+ include_source=include_source,
266
266
  until=until,
267
267
  include_endpoint=include_endpoint,
268
268
  **kwargs,
@@ -272,7 +272,7 @@ class DirectionalMixin:
272
272
  self,
273
273
  width: Optional[float] = None,
274
274
  height: str = "full",
275
- include_element: bool = False,
275
+ include_source: bool = False,
276
276
  until: Optional[str] = None,
277
277
  include_endpoint: bool = True,
278
278
  **kwargs,
@@ -283,7 +283,7 @@ class DirectionalMixin:
283
283
  Args:
284
284
  width: Width of the region to the left, in points
285
285
  height: Height mode - "full" for full page height or "element" for element height
286
- include_element: Whether to include this element/region in the result (default: False)
286
+ include_source: Whether to include this element/region in the result (default: False)
287
287
  until: Optional selector string to specify a left boundary element
288
288
  include_endpoint: Whether to include the boundary element in the region (default: True)
289
289
  **kwargs: Additional parameters
@@ -295,7 +295,7 @@ class DirectionalMixin:
295
295
  direction="left",
296
296
  size=width,
297
297
  cross_size=height,
298
- include_element=include_element,
298
+ include_source=include_source,
299
299
  until=until,
300
300
  include_endpoint=include_endpoint,
301
301
  **kwargs,
@@ -305,7 +305,7 @@ class DirectionalMixin:
305
305
  self,
306
306
  width: Optional[float] = None,
307
307
  height: str = "full",
308
- include_element: bool = False,
308
+ include_source: bool = False,
309
309
  until: Optional[str] = None,
310
310
  include_endpoint: bool = True,
311
311
  **kwargs,
@@ -316,7 +316,7 @@ class DirectionalMixin:
316
316
  Args:
317
317
  width: Width of the region to the right, in points
318
318
  height: Height mode - "full" for full page height or "element" for element height
319
- include_element: Whether to include this element/region in the result (default: False)
319
+ include_source: Whether to include this element/region in the result (default: False)
320
320
  until: Optional selector string to specify a right boundary element
321
321
  include_endpoint: Whether to include the boundary element in the region (default: True)
322
322
  **kwargs: Additional parameters
@@ -328,7 +328,7 @@ class DirectionalMixin:
328
328
  direction="right",
329
329
  size=width,
330
330
  cross_size=height,
331
- include_element=include_element,
331
+ include_source=include_source,
332
332
  until=until,
333
333
  include_endpoint=include_endpoint,
334
334
  **kwargs,