natural_pdf-0.1.14-py3-none-any.whl → natural_pdf-0.1.16-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. natural_pdf/__init__.py +31 -0
  2. natural_pdf/analyzers/layout/gemini.py +137 -162
  3. natural_pdf/analyzers/layout/layout_manager.py +9 -5
  4. natural_pdf/analyzers/layout/layout_options.py +77 -7
  5. natural_pdf/analyzers/layout/paddle.py +318 -165
  6. natural_pdf/analyzers/layout/table_structure_utils.py +78 -0
  7. natural_pdf/analyzers/shape_detection_mixin.py +770 -405
  8. natural_pdf/classification/mixin.py +2 -8
  9. natural_pdf/collections/pdf_collection.py +25 -30
  10. natural_pdf/core/highlighting_service.py +47 -32
  11. natural_pdf/core/page.py +226 -70
  12. natural_pdf/core/pdf.py +19 -22
  13. natural_pdf/elements/base.py +9 -9
  14. natural_pdf/elements/collections.py +105 -50
  15. natural_pdf/elements/region.py +320 -113
  16. natural_pdf/exporters/paddleocr.py +38 -13
  17. natural_pdf/flows/__init__.py +3 -3
  18. natural_pdf/flows/collections.py +303 -132
  19. natural_pdf/flows/element.py +277 -132
  20. natural_pdf/flows/flow.py +33 -16
  21. natural_pdf/flows/region.py +142 -79
  22. natural_pdf/ocr/engine_doctr.py +37 -4
  23. natural_pdf/ocr/engine_easyocr.py +23 -3
  24. natural_pdf/ocr/engine_paddle.py +281 -30
  25. natural_pdf/ocr/engine_surya.py +8 -3
  26. natural_pdf/ocr/ocr_manager.py +75 -76
  27. natural_pdf/ocr/ocr_options.py +52 -87
  28. natural_pdf/search/__init__.py +25 -12
  29. natural_pdf/search/lancedb_search_service.py +91 -54
  30. natural_pdf/search/numpy_search_service.py +86 -65
  31. natural_pdf/search/searchable_mixin.py +2 -2
  32. natural_pdf/selectors/parser.py +125 -81
  33. natural_pdf/widgets/__init__.py +1 -1
  34. natural_pdf/widgets/viewer.py +205 -449
  35. {natural_pdf-0.1.14.dist-info → natural_pdf-0.1.16.dist-info}/METADATA +27 -45
  36. {natural_pdf-0.1.14.dist-info → natural_pdf-0.1.16.dist-info}/RECORD +39 -38
  37. {natural_pdf-0.1.14.dist-info → natural_pdf-0.1.16.dist-info}/WHEEL +0 -0
  38. {natural_pdf-0.1.14.dist-info → natural_pdf-0.1.16.dist-info}/licenses/LICENSE +0 -0
  39. {natural_pdf-0.1.14.dist-info → natural_pdf-0.1.16.dist-info}/top_level.txt +0 -0
natural_pdf/core/page.py CHANGED
@@ -51,6 +51,9 @@ from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_t
51
51
  from natural_pdf.analyzers.layout.layout_analyzer import LayoutAnalyzer
52
52
  from natural_pdf.analyzers.layout.layout_manager import LayoutManager
53
53
  from natural_pdf.analyzers.layout.layout_options import LayoutOptions
54
+
55
+ # --- Shape Detection Mixin --- #
56
+ from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
54
57
  from natural_pdf.analyzers.text_options import TextStyleOptions
55
58
  from natural_pdf.analyzers.text_structure import TextStyleAnalyzer
56
59
  from natural_pdf.classification.manager import ClassificationManager # For type hint
@@ -68,14 +71,12 @@ from natural_pdf.utils.locks import pdf_render_lock # Import the lock
68
71
 
69
72
  # # Import new utils
70
73
  from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
71
- from natural_pdf.widgets import InteractiveViewerWidget
72
- from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, SimpleInteractiveViewerWidget
74
+ from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, InteractiveViewerWidget
73
75
 
74
76
  # --- End Classification Imports --- #
75
77
 
76
78
 
77
- # --- Shape Detection Mixin --- #
78
- from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
79
+
79
80
  # --- End Shape Detection Mixin --- #
80
81
 
81
82
 
@@ -667,13 +668,13 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
667
668
  if selector_obj.get("type") == "or":
668
669
  # For OR selectors, search all elements and let the filter function decide
669
670
  elements_to_search = self._element_mgr.get_all_elements()
670
-
671
+
671
672
  # Create filter function from compound selector
672
673
  filter_func = selector_to_filter_func(selector_obj, **kwargs)
673
-
674
+
674
675
  # Apply the filter to all elements
675
676
  matching_elements = [element for element in elements_to_search if filter_func(element)]
676
-
677
+
677
678
  # Sort elements in reading order if requested
678
679
  if kwargs.get("reading_order", True):
679
680
  if all(hasattr(el, "top") and hasattr(el, "x0") for el in matching_elements):
@@ -682,7 +683,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
682
683
  logger.warning(
683
684
  "Cannot sort elements in reading order: Missing required attributes (top, x0)."
684
685
  )
685
-
686
+
686
687
  # Return result collection
687
688
  return ElementCollection(matching_elements)
688
689
 
@@ -1138,31 +1139,171 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
1138
1139
  logger.debug(f"Page {self.number}: extract_text finished, result length: {len(result)}.")
1139
1140
  return result
1140
1141
 
1141
- def extract_table(self, table_settings={}) -> List[Any]:
1142
+ def extract_table(
1143
+ self,
1144
+ method: Optional[str] = None,
1145
+ table_settings: Optional[dict] = None,
1146
+ use_ocr: bool = False,
1147
+ ocr_config: Optional[dict] = None,
1148
+ text_options: Optional[Dict] = None,
1149
+ cell_extraction_func: Optional[Callable[["Region"], Optional[str]]] = None,
1150
+ show_progress: bool = False,
1151
+ ) -> List[List[Optional[str]]]:
1142
1152
  """
1143
- Extract the largest table from this page.
1153
+ Extract the largest table from this page using enhanced region-based extraction.
1144
1154
 
1145
1155
  Args:
1146
- table_settings: Additional extraction parameters
1156
+ method: Method to use: 'tatr', 'pdfplumber', 'text', 'stream', 'lattice', or None (auto-detect).
1157
+ table_settings: Settings for pdfplumber table extraction.
1158
+ use_ocr: Whether to use OCR for text extraction (currently only applicable with 'tatr' method).
1159
+ ocr_config: OCR configuration parameters.
1160
+ text_options: Dictionary of options for the 'text' method.
1161
+ cell_extraction_func: Optional callable function that takes a cell Region object
1162
+ and returns its string content. For 'text' method only.
1163
+ show_progress: If True, display a progress bar during cell text extraction for the 'text' method.
1147
1164
 
1148
1165
  Returns:
1149
- List of extracted tables (or None if no table found)
1166
+ Table data as a list of rows, where each row is a list of cell values (str or None).
1150
1167
  """
1151
- # pdfplumber returns None if no table found
1152
- return self._page.extract_table(table_settings)
1168
+ # Create a full-page region and delegate to its enhanced extract_table method
1169
+ page_region = self.create_region(0, 0, self.width, self.height)
1170
+ return page_region.extract_table(
1171
+ method=method,
1172
+ table_settings=table_settings,
1173
+ use_ocr=use_ocr,
1174
+ ocr_config=ocr_config,
1175
+ text_options=text_options,
1176
+ cell_extraction_func=cell_extraction_func,
1177
+ show_progress=show_progress,
1178
+ )
1153
1179
 
1154
- def extract_tables(self, table_settings={}) -> List[Any]:
1180
+ def extract_tables(
1181
+ self,
1182
+ method: Optional[str] = None,
1183
+ table_settings: Optional[dict] = None,
1184
+ check_tatr: bool = True,
1185
+ ) -> List[List[List[str]]]:
1155
1186
  """
1156
- Extract tables from this page.
1187
+ Extract all tables from this page with enhanced method support.
1157
1188
 
1158
1189
  Args:
1159
- table_settings: Additional extraction parameters
1190
+ method: Method to use: 'pdfplumber', 'stream', 'lattice', or None (auto-detect).
1191
+ 'stream' uses text-based strategies, 'lattice' uses line-based strategies.
1192
+ Note: 'tatr' and 'text' methods are not supported for extract_tables.
1193
+ table_settings: Settings for pdfplumber table extraction.
1194
+ check_tatr: If True (default), first check for TATR-detected table regions
1195
+ and extract from those before falling back to pdfplumber methods.
1160
1196
 
1161
1197
  Returns:
1162
- List of extracted tables
1198
+ List of tables, where each table is a list of rows, and each row is a list of cell values.
1163
1199
  """
1164
- # pdfplumber returns list of tables
1165
- return self._page.extract_tables(table_settings)
1200
+ if table_settings is None:
1201
+ table_settings = {}
1202
+
1203
+ # Check for TATR-detected table regions first if enabled
1204
+ if check_tatr:
1205
+ try:
1206
+ tatr_tables = self.find_all("region[type=table][model=tatr]")
1207
+ if tatr_tables:
1208
+ logger.debug(
1209
+ f"Page {self.number}: Found {len(tatr_tables)} TATR table regions, extracting from those..."
1210
+ )
1211
+ extracted_tables = []
1212
+ for table_region in tatr_tables:
1213
+ try:
1214
+ table_data = table_region.extract_table(method="tatr")
1215
+ if table_data: # Only add non-empty tables
1216
+ extracted_tables.append(table_data)
1217
+ except Exception as e:
1218
+ logger.warning(
1219
+ f"Failed to extract table from TATR region {table_region.bbox}: {e}"
1220
+ )
1221
+
1222
+ if extracted_tables:
1223
+ logger.debug(
1224
+ f"Page {self.number}: Successfully extracted {len(extracted_tables)} tables from TATR regions"
1225
+ )
1226
+ return extracted_tables
1227
+ else:
1228
+ logger.debug(
1229
+ f"Page {self.number}: TATR regions found but no tables extracted, falling back to pdfplumber"
1230
+ )
1231
+ else:
1232
+ logger.debug(
1233
+ f"Page {self.number}: No TATR table regions found, using pdfplumber methods"
1234
+ )
1235
+ except Exception as e:
1236
+ logger.debug(
1237
+ f"Page {self.number}: Error checking TATR regions: {e}, falling back to pdfplumber"
1238
+ )
1239
+
1240
+ # Auto-detect method if not specified (try lattice first, then stream)
1241
+ if method is None:
1242
+ logger.debug(f"Page {self.number}: Auto-detecting tables extraction method...")
1243
+
1244
+ # Try lattice first
1245
+ try:
1246
+ lattice_settings = table_settings.copy()
1247
+ lattice_settings.setdefault("vertical_strategy", "lines")
1248
+ lattice_settings.setdefault("horizontal_strategy", "lines")
1249
+
1250
+ logger.debug(f"Page {self.number}: Trying 'lattice' method first for tables...")
1251
+ lattice_result = self._page.extract_tables(lattice_settings)
1252
+
1253
+ # Check if lattice found meaningful tables
1254
+ if (
1255
+ lattice_result
1256
+ and len(lattice_result) > 0
1257
+ and any(
1258
+ any(
1259
+ any(cell and cell.strip() for cell in row if cell)
1260
+ for row in table
1261
+ if table
1262
+ )
1263
+ for table in lattice_result
1264
+ )
1265
+ ):
1266
+ logger.debug(
1267
+ f"Page {self.number}: 'lattice' method found {len(lattice_result)} tables"
1268
+ )
1269
+ return lattice_result
1270
+ else:
1271
+ logger.debug(f"Page {self.number}: 'lattice' method found no meaningful tables")
1272
+
1273
+ except Exception as e:
1274
+ logger.debug(f"Page {self.number}: 'lattice' method failed: {e}")
1275
+
1276
+ # Fall back to stream
1277
+ logger.debug(f"Page {self.number}: Falling back to 'stream' method for tables...")
1278
+ stream_settings = table_settings.copy()
1279
+ stream_settings.setdefault("vertical_strategy", "text")
1280
+ stream_settings.setdefault("horizontal_strategy", "text")
1281
+
1282
+ return self._page.extract_tables(stream_settings)
1283
+
1284
+ effective_method = method
1285
+
1286
+ # Handle method aliases
1287
+ if effective_method == "stream":
1288
+ logger.debug("Using 'stream' method alias for 'pdfplumber' with text-based strategies.")
1289
+ effective_method = "pdfplumber"
1290
+ table_settings.setdefault("vertical_strategy", "text")
1291
+ table_settings.setdefault("horizontal_strategy", "text")
1292
+ elif effective_method == "lattice":
1293
+ logger.debug(
1294
+ "Using 'lattice' method alias for 'pdfplumber' with line-based strategies."
1295
+ )
1296
+ effective_method = "pdfplumber"
1297
+ table_settings.setdefault("vertical_strategy", "lines")
1298
+ table_settings.setdefault("horizontal_strategy", "lines")
1299
+
1300
+ # Use the selected method
1301
+ if effective_method == "pdfplumber":
1302
+ return self._page.extract_tables(table_settings)
1303
+ else:
1304
+ raise ValueError(
1305
+ f"Unknown tables extraction method: '{method}'. Choose from 'pdfplumber', 'stream', 'lattice'."
1306
+ )
1166
1307
 
1167
1308
  def _load_elements(self):
1168
1309
  """Load all elements from the page via ElementManager."""
@@ -1441,6 +1582,14 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
1441
1582
  Returns:
1442
1583
  PIL Image of the page, or None if rendering fails.
1443
1584
  """
1585
+ # Apply global options as defaults, but allow explicit parameters to override
1586
+ import natural_pdf
1587
+
1588
+ # Use global options if parameters are not explicitly set
1589
+ if width is None:
1590
+ width = natural_pdf.options.image.width
1591
+ if resolution is None and natural_pdf.options.image.resolution is not None:
1592
+ resolution = natural_pdf.options.image.resolution
1444
1593
  # 1. Create cache key (excluding path)
1445
1594
  cache_key_parts = [
1446
1595
  scale,
@@ -1458,19 +1607,23 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
1458
1607
  if isinstance(v, list):
1459
1608
  try:
1460
1609
  v = tuple(v) # Convert lists to tuples
1461
- except TypeError: # pragma: no cover
1610
+ except TypeError: # pragma: no cover
1462
1611
  # If list contains unhashable items, fall back to repr or skip
1463
1612
  # For simplicity, we'll try to proceed; hashing will fail if v remains unhashable
1464
- logger.warning(f"Cache key generation: List item in kwargs['{k}'] could not be converted to tuple due to unhashable elements.")
1613
+ logger.warning(
1614
+ f"Cache key generation: List item in kwargs['{k}'] could not be converted to tuple due to unhashable elements."
1615
+ )
1465
1616
  sorted_kwargs_list.append((k, v))
1466
-
1617
+
1467
1618
  cache_key_parts.append(tuple(sorted_kwargs_list))
1468
-
1619
+
1469
1620
  try:
1470
1621
  cache_key = tuple(cache_key_parts)
1471
- except TypeError as e: # pragma: no cover
1472
- logger.warning(f"Page {self.index}: Could not create cache key for to_image due to unhashable item: {e}. Proceeding without cache for this call.")
1473
- cache_key = None # Fallback to not using cache for this call
1622
+ except TypeError as e: # pragma: no cover
1623
+ logger.warning(
1624
+ f"Page {self.index}: Could not create cache key for to_image due to unhashable item: {e}. Proceeding without cache for this call."
1625
+ )
1626
+ cache_key = None # Fallback to not using cache for this call
1474
1627
 
1475
1628
  image_to_return: Optional[Image.Image] = None
1476
1629
 
@@ -1480,7 +1633,9 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
1480
1633
  logger.debug(f"Page {self.index}: Returning cached image for key: {cache_key}")
1481
1634
  else:
1482
1635
  # --- This is the original logic to generate the image ---
1483
- rendered_image_component: Optional[Image.Image] = None # Renamed from 'image' in original
1636
+ rendered_image_component: Optional[Image.Image] = (
1637
+ None # Renamed from 'image' in original
1638
+ )
1484
1639
  render_resolution = resolution if resolution is not None else scale * 72
1485
1640
  thread_id = threading.current_thread().name
1486
1641
  logger.debug(
@@ -1518,29 +1673,31 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
1518
1673
 
1519
1674
  if rendered_image_component is None:
1520
1675
  if cache_key is not None:
1521
- self._to_image_cache[cache_key] = None # Cache the failure
1676
+ self._to_image_cache[cache_key] = None # Cache the failure
1522
1677
  # Save the image if path is provided (will try to save None, handled by PIL/OS)
1523
1678
  if path:
1524
1679
  try:
1525
1680
  if os.path.dirname(path):
1526
1681
  os.makedirs(os.path.dirname(path), exist_ok=True)
1527
- if rendered_image_component is not None: # Should be None here
1528
- rendered_image_component.save(path) # This line won't be hit if None
1682
+ if rendered_image_component is not None: # Should be None here
1683
+ rendered_image_component.save(path) # This line won't be hit if None
1529
1684
  # else: logger.debug("Not saving None image") # Not strictly needed
1530
- except Exception as save_error: # pragma: no cover
1685
+ except Exception as save_error: # pragma: no cover
1531
1686
  logger.error(f"Failed to save image to {path}: {save_error}")
1532
1687
  return None
1533
1688
 
1534
1689
  # --- Apply exclusion masking if requested ---
1535
1690
  # This modifies 'rendered_image_component'
1536
- image_after_masking = rendered_image_component # Start with the rendered image
1691
+ image_after_masking = rendered_image_component # Start with the rendered image
1537
1692
  if exclusions == "mask" and self._exclusions:
1538
1693
  try:
1539
1694
  # Ensure image is mutable (RGB or RGBA)
1540
1695
  if image_after_masking.mode not in ("RGB", "RGBA"):
1541
1696
  image_after_masking = image_after_masking.convert("RGB")
1542
1697
 
1543
- exclusion_regions = self._get_exclusion_regions(include_callable=True, debug=False)
1698
+ exclusion_regions = self._get_exclusion_regions(
1699
+ include_callable=True, debug=False
1700
+ )
1544
1701
  if exclusion_regions:
1545
1702
  draw = ImageDraw.Draw(image_after_masking)
1546
1703
  # Calculate the scaling factor used for the image
@@ -1562,12 +1719,12 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
1562
1719
  )
1563
1720
  if img_coords[0] < img_coords[2] and img_coords[1] < img_coords[3]:
1564
1721
  draw.rectangle(img_coords, fill="white")
1565
- else: # pragma: no cover
1722
+ else: # pragma: no cover
1566
1723
  logger.warning(
1567
1724
  f"Skipping invalid exclusion rect for masking: {img_coords}"
1568
1725
  )
1569
1726
  del draw # Release drawing context
1570
- except Exception as mask_error: # pragma: no cover
1727
+ except Exception as mask_error: # pragma: no cover
1571
1728
  logger.error(
1572
1729
  f"Error applying exclusion mask to page {self.index}: {mask_error}",
1573
1730
  exc_info=True,
@@ -1575,7 +1732,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
1575
1732
  # Continue with potentially unmasked or partially masked image
1576
1733
 
1577
1734
  # --- Resize the final image if width is provided ---
1578
- image_final_content = image_after_masking # Start with image after masking
1735
+ image_final_content = image_after_masking # Start with image after masking
1579
1736
  if width is not None and width > 0 and image_final_content.width > 0:
1580
1737
  aspect_ratio = image_final_content.height / image_final_content.width
1581
1738
  height = int(width * aspect_ratio)
@@ -1583,7 +1740,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
1583
1740
  image_final_content = image_final_content.resize(
1584
1741
  (width, height), Image.Resampling.LANCZOS
1585
1742
  )
1586
- except Exception as resize_error: # pragma: no cover
1743
+ except Exception as resize_error: # pragma: no cover
1587
1744
  logger.warning(f"Could not resize image: {resize_error}")
1588
1745
  # image_final_content remains the un-resized version if resize fails
1589
1746
 
@@ -1598,11 +1755,11 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
1598
1755
  if path and image_to_return:
1599
1756
  try:
1600
1757
  # Ensure directory exists
1601
- if os.path.dirname(path): # Only call makedirs if there's a directory part
1758
+ if os.path.dirname(path): # Only call makedirs if there's a directory part
1602
1759
  os.makedirs(os.path.dirname(path), exist_ok=True)
1603
1760
  image_to_return.save(path)
1604
1761
  logger.debug(f"Saved page image to: {path}")
1605
- except Exception as save_error: # pragma: no cover
1762
+ except Exception as save_error: # pragma: no cover
1606
1763
  logger.error(f"Failed to save image to {path}: {save_error}")
1607
1764
 
1608
1765
  return image_to_return
@@ -1661,24 +1818,20 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
1661
1818
  self._element_mgr.remove_ocr_elements()
1662
1819
 
1663
1820
  logger.info(f"Page {self.number}: Delegating apply_ocr to PDF.apply_ocr.")
1664
- try:
1665
- # Delegate to parent PDF, targeting only this page's index
1666
- # Pass all relevant parameters through, including apply_exclusions
1667
- self._parent.apply_ocr(
1668
- pages=[self.index],
1669
- engine=engine,
1670
- options=options,
1671
- languages=languages,
1672
- min_confidence=min_confidence,
1673
- device=device,
1674
- resolution=resolution,
1675
- detect_only=detect_only,
1676
- apply_exclusions=apply_exclusions,
1677
- replace=replace, # Pass the replace parameter to PDF.apply_ocr
1678
- )
1679
- except Exception as e:
1680
- logger.error(f"Page {self.number}: Error during delegated OCR call: {e}", exc_info=True)
1681
- return self # Return self for chaining
1821
+ # Delegate to parent PDF, targeting only this page's index
1822
+ # Pass all relevant parameters through, including apply_exclusions
1823
+ self._parent.apply_ocr(
1824
+ pages=[self.index],
1825
+ engine=engine,
1826
+ options=options,
1827
+ languages=languages,
1828
+ min_confidence=min_confidence,
1829
+ device=device,
1830
+ resolution=resolution,
1831
+ detect_only=detect_only,
1832
+ apply_exclusions=apply_exclusions,
1833
+ replace=replace, # Pass the replace parameter to PDF.apply_ocr
1834
+ )
1682
1835
 
1683
1836
  # Return self for chaining
1684
1837
  return self
@@ -2199,14 +2352,14 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
2199
2352
  self,
2200
2353
  # elements_to_render: Optional[List['Element']] = None, # No longer needed, from_page handles it
2201
2354
  # include_source_types: List[str] = ['word', 'line', 'rect', 'region'] # No longer needed
2202
- ) -> Optional["SimpleInteractiveViewerWidget"]: # Return type hint updated
2355
+ ) -> Optional["InteractiveViewerWidget"]: # Return type hint updated
2203
2356
  """
2204
2357
  Creates and returns an interactive ipywidget for exploring elements on this page.
2205
2358
 
2206
- Uses SimpleInteractiveViewerWidget.from_page() to create the viewer.
2359
+ Uses InteractiveViewerWidget.from_page() to create the viewer.
2207
2360
 
2208
2361
  Returns:
2209
- A SimpleInteractiveViewerWidget instance ready for display in Jupyter,
2362
+ A InteractiveViewerWidget instance ready for display in Jupyter,
2210
2363
  or None if ipywidgets is not installed or widget creation fails.
2211
2364
 
2212
2365
  Raises:
@@ -2215,18 +2368,18 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
2215
2368
  ValueError: If image rendering or data preparation fails within from_page.
2216
2369
  """
2217
2370
  # Check for availability using the imported flag and class variable
2218
- if not _IPYWIDGETS_AVAILABLE or SimpleInteractiveViewerWidget is None:
2371
+ if not _IPYWIDGETS_AVAILABLE or InteractiveViewerWidget is None:
2219
2372
  logger.error(
2220
- "Interactive viewer requires optional dependencies ('ipywidgets'). "
2221
- "Install with `pip install natural-pdf[viewer]`"
2373
+ "Interactive viewer requires 'ipywidgets'. "
2374
+ 'Please install with: pip install "ipywidgets>=7.0.0,<10.0.0"'
2222
2375
  )
2223
2376
  # raise ImportError("ipywidgets not found.") # Option 1: Raise error
2224
2377
  return None # Option 2: Return None gracefully
2225
2378
 
2226
- # If we reach here, SimpleInteractiveViewerWidget should be the actual class
2379
+ # If we reach here, InteractiveViewerWidget should be the actual class
2227
2380
  try:
2228
2381
  # Pass self (the Page object) to the factory method
2229
- return SimpleInteractiveViewerWidget.from_page(self)
2382
+ return InteractiveViewerWidget.from_page(self)
2230
2383
  except Exception as e:
2231
2384
  # Catch potential errors during widget creation (e.g., image rendering)
2232
2385
  logger.error(
@@ -2326,9 +2479,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
2326
2479
  f"Page {self.number}: Starting OCR correction with callback '{correction_callback.__name__}' (max_workers={max_workers})"
2327
2480
  )
2328
2481
 
2329
- target_elements_collection = self.find_all(
2330
- selector=selector, apply_exclusions=False
2331
- )
2482
+ target_elements_collection = self.find_all(selector=selector, apply_exclusions=False)
2332
2483
  target_elements = target_elements_collection.elements # Get the list
2333
2484
 
2334
2485
  if not target_elements:
@@ -2337,7 +2488,12 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
2337
2488
 
2338
2489
  element_pbar = None
2339
2490
  try:
2340
- element_pbar = tqdm(total=len(target_elements), desc=f"Correcting OCR Page {self.number}", unit="element", leave=False)
2491
+ element_pbar = tqdm(
2492
+ total=len(target_elements),
2493
+ desc=f"Correcting OCR Page {self.number}",
2494
+ unit="element",
2495
+ leave=False,
2496
+ )
2341
2497
 
2342
2498
  processed_count = 0
2343
2499
  updated_count = 0
natural_pdf/core/pdf.py CHANGED
@@ -24,6 +24,7 @@ from typing import (
24
24
 
25
25
  import pdfplumber
26
26
  from PIL import Image
27
+ from tqdm.auto import tqdm
27
28
 
28
29
  from natural_pdf.analyzers.layout.layout_manager import LayoutManager
29
30
  from natural_pdf.classification.manager import ClassificationError, ClassificationManager
@@ -38,7 +39,6 @@ from natural_pdf.extraction.mixin import ExtractionMixin
38
39
  from natural_pdf.ocr import OCRManager, OCROptions
39
40
  from natural_pdf.selectors.parser import parse_selector
40
41
  from natural_pdf.utils.locks import pdf_render_lock
41
- from tqdm.auto import tqdm
42
42
 
43
43
  try:
44
44
  from typing import Any as TypingAny
@@ -307,7 +307,6 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
307
307
  ) -> "PDF":
308
308
  """
309
309
  Applies OCR to specified pages of the PDF using batch processing.
310
- Applies OCR to specified pages of the PDF using batch processing.
311
310
 
312
311
  Args:
313
312
  engine: Name of the OCR engine
@@ -320,25 +319,27 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
320
319
  replace: Whether to replace existing OCR elements
321
320
  options: Engine-specific options
322
321
  pages: Page indices to process or None for all pages
323
- engine: Name of the OCR engine
324
- languages: List of language codes
325
- min_confidence: Minimum confidence threshold
326
- device: Device to run OCR on
327
- resolution: DPI resolution for page images
328
- apply_exclusions: Whether to mask excluded areas
329
- detect_only: If True, only detect text boxes
330
- replace: Whether to replace existing OCR elements
331
- options: Engine-specific options
332
- pages: Page indices to process or None for all pages
333
322
 
334
323
  Returns:
335
324
  Self for method chaining
336
- Self for method chaining
337
325
  """
338
326
  if not self._ocr_manager:
339
327
  logger.error("OCRManager not available. Cannot apply OCR.")
340
328
  return self
341
329
 
330
+ # Apply global options as defaults, but allow explicit parameters to override
331
+ import natural_pdf
332
+
333
+ # Use global OCR options if parameters are not explicitly set
334
+ if engine is None:
335
+ engine = natural_pdf.options.ocr.engine
336
+ if languages is None:
337
+ languages = natural_pdf.options.ocr.languages
338
+ if min_confidence is None:
339
+ min_confidence = natural_pdf.options.ocr.min_confidence
340
+ if device is None:
341
+ pass # No default device in options.ocr anymore
342
+
342
343
  thread_id = threading.current_thread().name
343
344
  logger.debug(f"[{thread_id}] PDF.apply_ocr starting for {self.path}")
344
345
 
@@ -425,18 +426,14 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
425
426
  logger.info(f"[{thread_id}] Calling OCR Manager with args: {ocr_call_args}...")
426
427
  ocr_start_time = time.monotonic()
427
428
 
428
- try:
429
- batch_results = self._ocr_manager.apply_ocr(**manager_args)
430
-
431
- if not isinstance(batch_results, list) or len(batch_results) != len(images_pil):
432
- logger.error(f"OCR Manager returned unexpected result format or length.")
433
- return self
429
+ batch_results = self._ocr_manager.apply_ocr(**manager_args)
434
430
 
435
- logger.info("OCR Manager batch processing complete.")
436
- except Exception as e:
437
- logger.error(f"Batch OCR processing failed: {e}")
431
+ if not isinstance(batch_results, list) or len(batch_results) != len(images_pil):
432
+ logger.error(f"OCR Manager returned unexpected result format or length.")
438
433
  return self
439
434
 
435
+ logger.info("OCR Manager batch processing complete.")
436
+
440
437
  ocr_end_time = time.monotonic()
441
438
  logger.debug(
442
439
  f"[{thread_id}] OCR processing finished (Duration: {ocr_end_time - ocr_start_time:.2f}s)"
natural_pdf/elements/base.py CHANGED
@@ -18,34 +18,34 @@ if TYPE_CHECKING:
18
18
  def extract_bbox(obj: Any) -> Optional[Tuple[float, float, float, float]]:
19
19
  """
20
20
  Extract bounding box coordinates from any object that has bbox properties.
21
-
21
+
22
22
  Args:
23
23
  obj: Object that might have bbox coordinates (Element, Region, etc.)
24
-
24
+
25
25
  Returns:
26
26
  Tuple of (x0, top, x1, bottom) or None if object doesn't have bbox properties
27
27
  """
28
28
  # Try bbox property first (most common)
29
- if hasattr(obj, 'bbox') and obj.bbox is not None:
29
+ if hasattr(obj, "bbox") and obj.bbox is not None:
30
30
  bbox = obj.bbox
31
31
  if isinstance(bbox, (tuple, list)) and len(bbox) == 4:
32
32
  return tuple(float(coord) for coord in bbox)
33
-
33
+
34
34
  # Try individual coordinate properties
35
- if all(hasattr(obj, attr) for attr in ['x0', 'top', 'x1', 'bottom']):
35
+ if all(hasattr(obj, attr) for attr in ["x0", "top", "x1", "bottom"]):
36
36
  try:
37
37
  return (float(obj.x0), float(obj.top), float(obj.x1), float(obj.bottom))
38
38
  except (ValueError, TypeError):
39
39
  pass
40
-
40
+
41
41
  # If object is a dict with bbox keys
42
42
  if isinstance(obj, dict):
43
- if all(key in obj for key in ['x0', 'top', 'x1', 'bottom']):
43
+ if all(key in obj for key in ["x0", "top", "x1", "bottom"]):
44
44
  try:
45
- return (float(obj['x0']), float(obj['top']), float(obj['x1']), float(obj['bottom']))
45
+ return (float(obj["x0"]), float(obj["top"]), float(obj["x1"]), float(obj["bottom"]))
46
46
  except (ValueError, TypeError):
47
47
  pass
48
-
48
+
49
49
  return None
50
50
 
51
51