natural-pdf 0.1.15__py3-none-any.whl → 0.1.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. natural_pdf/__init__.py +31 -0
  2. natural_pdf/analyzers/layout/gemini.py +137 -162
  3. natural_pdf/analyzers/layout/layout_manager.py +9 -5
  4. natural_pdf/analyzers/layout/layout_options.py +77 -7
  5. natural_pdf/analyzers/layout/paddle.py +318 -165
  6. natural_pdf/analyzers/layout/table_structure_utils.py +78 -0
  7. natural_pdf/analyzers/shape_detection_mixin.py +770 -405
  8. natural_pdf/classification/mixin.py +2 -8
  9. natural_pdf/collections/pdf_collection.py +25 -30
  10. natural_pdf/core/highlighting_service.py +47 -32
  11. natural_pdf/core/page.py +119 -76
  12. natural_pdf/core/pdf.py +19 -22
  13. natural_pdf/describe/__init__.py +21 -0
  14. natural_pdf/describe/base.py +457 -0
  15. natural_pdf/describe/elements.py +411 -0
  16. natural_pdf/describe/mixin.py +84 -0
  17. natural_pdf/describe/summary.py +186 -0
  18. natural_pdf/elements/base.py +11 -10
  19. natural_pdf/elements/collections.py +116 -51
  20. natural_pdf/elements/region.py +204 -127
  21. natural_pdf/exporters/paddleocr.py +38 -13
  22. natural_pdf/flows/__init__.py +3 -3
  23. natural_pdf/flows/collections.py +303 -132
  24. natural_pdf/flows/element.py +277 -132
  25. natural_pdf/flows/flow.py +33 -16
  26. natural_pdf/flows/region.py +142 -79
  27. natural_pdf/ocr/engine_doctr.py +37 -4
  28. natural_pdf/ocr/engine_easyocr.py +23 -3
  29. natural_pdf/ocr/engine_paddle.py +281 -30
  30. natural_pdf/ocr/engine_surya.py +8 -3
  31. natural_pdf/ocr/ocr_manager.py +75 -76
  32. natural_pdf/ocr/ocr_options.py +52 -87
  33. natural_pdf/search/__init__.py +25 -12
  34. natural_pdf/search/lancedb_search_service.py +91 -54
  35. natural_pdf/search/numpy_search_service.py +86 -65
  36. natural_pdf/search/searchable_mixin.py +2 -2
  37. natural_pdf/selectors/parser.py +125 -81
  38. natural_pdf/widgets/__init__.py +1 -1
  39. natural_pdf/widgets/viewer.py +205 -449
  40. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/METADATA +27 -45
  41. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/RECORD +44 -38
  42. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/WHEEL +0 -0
  43. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/licenses/LICENSE +0 -0
  44. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/top_level.txt +0 -0
natural_pdf/core/page.py CHANGED
@@ -51,6 +51,9 @@ from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_t
51
51
  from natural_pdf.analyzers.layout.layout_analyzer import LayoutAnalyzer
52
52
  from natural_pdf.analyzers.layout.layout_manager import LayoutManager
53
53
  from natural_pdf.analyzers.layout.layout_options import LayoutOptions
54
+
55
+ # --- Shape Detection Mixin --- #
56
+ from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
54
57
  from natural_pdf.analyzers.text_options import TextStyleOptions
55
58
  from natural_pdf.analyzers.text_structure import TextStyleAnalyzer
56
59
  from natural_pdf.classification.manager import ClassificationManager # For type hint
@@ -58,6 +61,7 @@ from natural_pdf.classification.manager import ClassificationManager # For type
58
61
  # # --- Classification Imports --- #
59
62
  from natural_pdf.classification.mixin import ClassificationMixin # Import classification mixin
60
63
  from natural_pdf.core.element_manager import ElementManager
64
+ from natural_pdf.describe.mixin import DescribeMixin # Import describe mixin
61
65
  from natural_pdf.elements.base import Element # Import base element
62
66
  from natural_pdf.elements.text import TextElement
63
67
  from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
@@ -68,14 +72,12 @@ from natural_pdf.utils.locks import pdf_render_lock # Import the lock
68
72
 
69
73
  # # Import new utils
70
74
  from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
71
- from natural_pdf.widgets import InteractiveViewerWidget
72
- from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, SimpleInteractiveViewerWidget
75
+ from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, InteractiveViewerWidget
73
76
 
74
77
  # --- End Classification Imports --- #
75
78
 
76
79
 
77
- # --- Shape Detection Mixin --- #
78
- from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
80
+
79
81
  # --- End Shape Detection Mixin --- #
80
82
 
81
83
 
@@ -91,7 +93,7 @@ except ImportError:
91
93
  logger = logging.getLogger(__name__)
92
94
 
93
95
 
94
- class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
96
+ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMixin):
95
97
  """
96
98
  Enhanced Page wrapper built on top of pdfplumber.Page.
97
99
 
@@ -667,13 +669,13 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
667
669
  if selector_obj.get("type") == "or":
668
670
  # For OR selectors, search all elements and let the filter function decide
669
671
  elements_to_search = self._element_mgr.get_all_elements()
670
-
672
+
671
673
  # Create filter function from compound selector
672
674
  filter_func = selector_to_filter_func(selector_obj, **kwargs)
673
-
675
+
674
676
  # Apply the filter to all elements
675
677
  matching_elements = [element for element in elements_to_search if filter_func(element)]
676
-
678
+
677
679
  # Sort elements in reading order if requested
678
680
  if kwargs.get("reading_order", True):
679
681
  if all(hasattr(el, "top") and hasattr(el, "x0") for el in matching_elements):
@@ -682,7 +684,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
682
684
  logger.warning(
683
685
  "Cannot sort elements in reading order: Missing required attributes (top, x0)."
684
686
  )
685
-
687
+
686
688
  # Return result collection
687
689
  return ElementCollection(matching_elements)
688
690
 
@@ -1204,7 +1206,9 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
1204
1206
  try:
1205
1207
  tatr_tables = self.find_all("region[type=table][model=tatr]")
1206
1208
  if tatr_tables:
1207
- logger.debug(f"Page {self.number}: Found {len(tatr_tables)} TATR table regions, extracting from those...")
1209
+ logger.debug(
1210
+ f"Page {self.number}: Found {len(tatr_tables)} TATR table regions, extracting from those..."
1211
+ )
1208
1212
  extracted_tables = []
1209
1213
  for table_region in tatr_tables:
1210
1214
  try:
@@ -1212,48 +1216,70 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
1212
1216
  if table_data: # Only add non-empty tables
1213
1217
  extracted_tables.append(table_data)
1214
1218
  except Exception as e:
1215
- logger.warning(f"Failed to extract table from TATR region {table_region.bbox}: {e}")
1216
-
1219
+ logger.warning(
1220
+ f"Failed to extract table from TATR region {table_region.bbox}: {e}"
1221
+ )
1222
+
1217
1223
  if extracted_tables:
1218
- logger.debug(f"Page {self.number}: Successfully extracted {len(extracted_tables)} tables from TATR regions")
1224
+ logger.debug(
1225
+ f"Page {self.number}: Successfully extracted {len(extracted_tables)} tables from TATR regions"
1226
+ )
1219
1227
  return extracted_tables
1220
1228
  else:
1221
- logger.debug(f"Page {self.number}: TATR regions found but no tables extracted, falling back to pdfplumber")
1229
+ logger.debug(
1230
+ f"Page {self.number}: TATR regions found but no tables extracted, falling back to pdfplumber"
1231
+ )
1222
1232
  else:
1223
- logger.debug(f"Page {self.number}: No TATR table regions found, using pdfplumber methods")
1233
+ logger.debug(
1234
+ f"Page {self.number}: No TATR table regions found, using pdfplumber methods"
1235
+ )
1224
1236
  except Exception as e:
1225
- logger.debug(f"Page {self.number}: Error checking TATR regions: {e}, falling back to pdfplumber")
1237
+ logger.debug(
1238
+ f"Page {self.number}: Error checking TATR regions: {e}, falling back to pdfplumber"
1239
+ )
1226
1240
 
1227
1241
  # Auto-detect method if not specified (try lattice first, then stream)
1228
1242
  if method is None:
1229
1243
  logger.debug(f"Page {self.number}: Auto-detecting tables extraction method...")
1230
-
1244
+
1231
1245
  # Try lattice first
1232
1246
  try:
1233
1247
  lattice_settings = table_settings.copy()
1234
1248
  lattice_settings.setdefault("vertical_strategy", "lines")
1235
1249
  lattice_settings.setdefault("horizontal_strategy", "lines")
1236
-
1250
+
1237
1251
  logger.debug(f"Page {self.number}: Trying 'lattice' method first for tables...")
1238
1252
  lattice_result = self._page.extract_tables(lattice_settings)
1239
-
1253
+
1240
1254
  # Check if lattice found meaningful tables
1241
- if (lattice_result and len(lattice_result) > 0 and
1242
- any(any(any(cell and cell.strip() for cell in row if cell) for row in table if table) for table in lattice_result)):
1243
- logger.debug(f"Page {self.number}: 'lattice' method found {len(lattice_result)} tables")
1255
+ if (
1256
+ lattice_result
1257
+ and len(lattice_result) > 0
1258
+ and any(
1259
+ any(
1260
+ any(cell and cell.strip() for cell in row if cell)
1261
+ for row in table
1262
+ if table
1263
+ )
1264
+ for table in lattice_result
1265
+ )
1266
+ ):
1267
+ logger.debug(
1268
+ f"Page {self.number}: 'lattice' method found {len(lattice_result)} tables"
1269
+ )
1244
1270
  return lattice_result
1245
1271
  else:
1246
1272
  logger.debug(f"Page {self.number}: 'lattice' method found no meaningful tables")
1247
-
1273
+
1248
1274
  except Exception as e:
1249
1275
  logger.debug(f"Page {self.number}: 'lattice' method failed: {e}")
1250
-
1276
+
1251
1277
  # Fall back to stream
1252
1278
  logger.debug(f"Page {self.number}: Falling back to 'stream' method for tables...")
1253
1279
  stream_settings = table_settings.copy()
1254
1280
  stream_settings.setdefault("vertical_strategy", "text")
1255
1281
  stream_settings.setdefault("horizontal_strategy", "text")
1256
-
1282
+
1257
1283
  return self._page.extract_tables(stream_settings)
1258
1284
 
1259
1285
  effective_method = method
@@ -1265,7 +1291,9 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
1265
1291
  table_settings.setdefault("vertical_strategy", "text")
1266
1292
  table_settings.setdefault("horizontal_strategy", "text")
1267
1293
  elif effective_method == "lattice":
1268
- logger.debug("Using 'lattice' method alias for 'pdfplumber' with line-based strategies.")
1294
+ logger.debug(
1295
+ "Using 'lattice' method alias for 'pdfplumber' with line-based strategies."
1296
+ )
1269
1297
  effective_method = "pdfplumber"
1270
1298
  table_settings.setdefault("vertical_strategy", "lines")
1271
1299
  table_settings.setdefault("horizontal_strategy", "lines")
@@ -1555,6 +1583,14 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
1555
1583
  Returns:
1556
1584
  PIL Image of the page, or None if rendering fails.
1557
1585
  """
1586
+ # Apply global options as defaults, but allow explicit parameters to override
1587
+ import natural_pdf
1588
+
1589
+ # Use global options if parameters are not explicitly set
1590
+ if width is None:
1591
+ width = natural_pdf.options.image.width
1592
+ if resolution is None and natural_pdf.options.image.resolution is not None:
1593
+ resolution = natural_pdf.options.image.resolution
1558
1594
  # 1. Create cache key (excluding path)
1559
1595
  cache_key_parts = [
1560
1596
  scale,
@@ -1572,19 +1608,23 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
1572
1608
  if isinstance(v, list):
1573
1609
  try:
1574
1610
  v = tuple(v) # Convert lists to tuples
1575
- except TypeError: # pragma: no cover
1611
+ except TypeError: # pragma: no cover
1576
1612
  # If list contains unhashable items, fall back to repr or skip
1577
1613
  # For simplicity, we'll try to proceed; hashing will fail if v remains unhashable
1578
- logger.warning(f"Cache key generation: List item in kwargs['{k}'] could not be converted to tuple due to unhashable elements.")
1614
+ logger.warning(
1615
+ f"Cache key generation: List item in kwargs['{k}'] could not be converted to tuple due to unhashable elements."
1616
+ )
1579
1617
  sorted_kwargs_list.append((k, v))
1580
-
1618
+
1581
1619
  cache_key_parts.append(tuple(sorted_kwargs_list))
1582
-
1620
+
1583
1621
  try:
1584
1622
  cache_key = tuple(cache_key_parts)
1585
- except TypeError as e: # pragma: no cover
1586
- logger.warning(f"Page {self.index}: Could not create cache key for to_image due to unhashable item: {e}. Proceeding without cache for this call.")
1587
- cache_key = None # Fallback to not using cache for this call
1623
+ except TypeError as e: # pragma: no cover
1624
+ logger.warning(
1625
+ f"Page {self.index}: Could not create cache key for to_image due to unhashable item: {e}. Proceeding without cache for this call."
1626
+ )
1627
+ cache_key = None # Fallback to not using cache for this call
1588
1628
 
1589
1629
  image_to_return: Optional[Image.Image] = None
1590
1630
 
@@ -1594,7 +1634,9 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
1594
1634
  logger.debug(f"Page {self.index}: Returning cached image for key: {cache_key}")
1595
1635
  else:
1596
1636
  # --- This is the original logic to generate the image ---
1597
- rendered_image_component: Optional[Image.Image] = None # Renamed from 'image' in original
1637
+ rendered_image_component: Optional[Image.Image] = (
1638
+ None # Renamed from 'image' in original
1639
+ )
1598
1640
  render_resolution = resolution if resolution is not None else scale * 72
1599
1641
  thread_id = threading.current_thread().name
1600
1642
  logger.debug(
@@ -1632,29 +1674,31 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
1632
1674
 
1633
1675
  if rendered_image_component is None:
1634
1676
  if cache_key is not None:
1635
- self._to_image_cache[cache_key] = None # Cache the failure
1677
+ self._to_image_cache[cache_key] = None # Cache the failure
1636
1678
  # Save the image if path is provided (will try to save None, handled by PIL/OS)
1637
1679
  if path:
1638
1680
  try:
1639
1681
  if os.path.dirname(path):
1640
1682
  os.makedirs(os.path.dirname(path), exist_ok=True)
1641
- if rendered_image_component is not None: # Should be None here
1642
- rendered_image_component.save(path) # This line won't be hit if None
1683
+ if rendered_image_component is not None: # Should be None here
1684
+ rendered_image_component.save(path) # This line won't be hit if None
1643
1685
  # else: logger.debug("Not saving None image") # Not strictly needed
1644
- except Exception as save_error: # pragma: no cover
1686
+ except Exception as save_error: # pragma: no cover
1645
1687
  logger.error(f"Failed to save image to {path}: {save_error}")
1646
1688
  return None
1647
1689
 
1648
1690
  # --- Apply exclusion masking if requested ---
1649
1691
  # This modifies 'rendered_image_component'
1650
- image_after_masking = rendered_image_component # Start with the rendered image
1692
+ image_after_masking = rendered_image_component # Start with the rendered image
1651
1693
  if exclusions == "mask" and self._exclusions:
1652
1694
  try:
1653
1695
  # Ensure image is mutable (RGB or RGBA)
1654
1696
  if image_after_masking.mode not in ("RGB", "RGBA"):
1655
1697
  image_after_masking = image_after_masking.convert("RGB")
1656
1698
 
1657
- exclusion_regions = self._get_exclusion_regions(include_callable=True, debug=False)
1699
+ exclusion_regions = self._get_exclusion_regions(
1700
+ include_callable=True, debug=False
1701
+ )
1658
1702
  if exclusion_regions:
1659
1703
  draw = ImageDraw.Draw(image_after_masking)
1660
1704
  # Calculate the scaling factor used for the image
@@ -1676,12 +1720,12 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
1676
1720
  )
1677
1721
  if img_coords[0] < img_coords[2] and img_coords[1] < img_coords[3]:
1678
1722
  draw.rectangle(img_coords, fill="white")
1679
- else: # pragma: no cover
1723
+ else: # pragma: no cover
1680
1724
  logger.warning(
1681
1725
  f"Skipping invalid exclusion rect for masking: {img_coords}"
1682
1726
  )
1683
1727
  del draw # Release drawing context
1684
- except Exception as mask_error: # pragma: no cover
1728
+ except Exception as mask_error: # pragma: no cover
1685
1729
  logger.error(
1686
1730
  f"Error applying exclusion mask to page {self.index}: {mask_error}",
1687
1731
  exc_info=True,
@@ -1689,7 +1733,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
1689
1733
  # Continue with potentially unmasked or partially masked image
1690
1734
 
1691
1735
  # --- Resize the final image if width is provided ---
1692
- image_final_content = image_after_masking # Start with image after masking
1736
+ image_final_content = image_after_masking # Start with image after masking
1693
1737
  if width is not None and width > 0 and image_final_content.width > 0:
1694
1738
  aspect_ratio = image_final_content.height / image_final_content.width
1695
1739
  height = int(width * aspect_ratio)
@@ -1697,7 +1741,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
1697
1741
  image_final_content = image_final_content.resize(
1698
1742
  (width, height), Image.Resampling.LANCZOS
1699
1743
  )
1700
- except Exception as resize_error: # pragma: no cover
1744
+ except Exception as resize_error: # pragma: no cover
1701
1745
  logger.warning(f"Could not resize image: {resize_error}")
1702
1746
  # image_final_content remains the un-resized version if resize fails
1703
1747
 
@@ -1712,11 +1756,11 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
1712
1756
  if path and image_to_return:
1713
1757
  try:
1714
1758
  # Ensure directory exists
1715
- if os.path.dirname(path): # Only call makedirs if there's a directory part
1759
+ if os.path.dirname(path): # Only call makedirs if there's a directory part
1716
1760
  os.makedirs(os.path.dirname(path), exist_ok=True)
1717
1761
  image_to_return.save(path)
1718
1762
  logger.debug(f"Saved page image to: {path}")
1719
- except Exception as save_error: # pragma: no cover
1763
+ except Exception as save_error: # pragma: no cover
1720
1764
  logger.error(f"Failed to save image to {path}: {save_error}")
1721
1765
 
1722
1766
  return image_to_return
@@ -1775,24 +1819,20 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
1775
1819
  self._element_mgr.remove_ocr_elements()
1776
1820
 
1777
1821
  logger.info(f"Page {self.number}: Delegating apply_ocr to PDF.apply_ocr.")
1778
- try:
1779
- # Delegate to parent PDF, targeting only this page's index
1780
- # Pass all relevant parameters through, including apply_exclusions
1781
- self._parent.apply_ocr(
1782
- pages=[self.index],
1783
- engine=engine,
1784
- options=options,
1785
- languages=languages,
1786
- min_confidence=min_confidence,
1787
- device=device,
1788
- resolution=resolution,
1789
- detect_only=detect_only,
1790
- apply_exclusions=apply_exclusions,
1791
- replace=replace, # Pass the replace parameter to PDF.apply_ocr
1792
- )
1793
- except Exception as e:
1794
- logger.error(f"Page {self.number}: Error during delegated OCR call: {e}", exc_info=True)
1795
- return self # Return self for chaining
1822
+ # Delegate to parent PDF, targeting only this page's index
1823
+ # Pass all relevant parameters through, including apply_exclusions
1824
+ self._parent.apply_ocr(
1825
+ pages=[self.index],
1826
+ engine=engine,
1827
+ options=options,
1828
+ languages=languages,
1829
+ min_confidence=min_confidence,
1830
+ device=device,
1831
+ resolution=resolution,
1832
+ detect_only=detect_only,
1833
+ apply_exclusions=apply_exclusions,
1834
+ replace=replace, # Pass the replace parameter to PDF.apply_ocr
1835
+ )
1796
1836
 
1797
1837
  # Return self for chaining
1798
1838
  return self
@@ -2313,14 +2353,14 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
2313
2353
  self,
2314
2354
  # elements_to_render: Optional[List['Element']] = None, # No longer needed, from_page handles it
2315
2355
  # include_source_types: List[str] = ['word', 'line', 'rect', 'region'] # No longer needed
2316
- ) -> Optional["SimpleInteractiveViewerWidget"]: # Return type hint updated
2356
+ ) -> Optional["InteractiveViewerWidget"]: # Return type hint updated
2317
2357
  """
2318
2358
  Creates and returns an interactive ipywidget for exploring elements on this page.
2319
2359
 
2320
- Uses SimpleInteractiveViewerWidget.from_page() to create the viewer.
2360
+ Uses InteractiveViewerWidget.from_page() to create the viewer.
2321
2361
 
2322
2362
  Returns:
2323
- A SimpleInteractiveViewerWidget instance ready for display in Jupyter,
2363
+ A InteractiveViewerWidget instance ready for display in Jupyter,
2324
2364
  or None if ipywidgets is not installed or widget creation fails.
2325
2365
 
2326
2366
  Raises:
@@ -2329,18 +2369,18 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
2329
2369
  ValueError: If image rendering or data preparation fails within from_page.
2330
2370
  """
2331
2371
  # Check for availability using the imported flag and class variable
2332
- if not _IPYWIDGETS_AVAILABLE or SimpleInteractiveViewerWidget is None:
2372
+ if not _IPYWIDGETS_AVAILABLE or InteractiveViewerWidget is None:
2333
2373
  logger.error(
2334
- "Interactive viewer requires optional dependencies ('ipywidgets'). "
2335
- "Install with `pip install natural-pdf[viewer]`"
2374
+ "Interactive viewer requires 'ipywidgets'. "
2375
+ 'Please install with: pip install "ipywidgets>=7.0.0,<10.0.0"'
2336
2376
  )
2337
2377
  # raise ImportError("ipywidgets not found.") # Option 1: Raise error
2338
2378
  return None # Option 2: Return None gracefully
2339
2379
 
2340
- # If we reach here, SimpleInteractiveViewerWidget should be the actual class
2380
+ # If we reach here, InteractiveViewerWidget should be the actual class
2341
2381
  try:
2342
2382
  # Pass self (the Page object) to the factory method
2343
- return SimpleInteractiveViewerWidget.from_page(self)
2383
+ return InteractiveViewerWidget.from_page(self)
2344
2384
  except Exception as e:
2345
2385
  # Catch potential errors during widget creation (e.g., image rendering)
2346
2386
  logger.error(
@@ -2440,9 +2480,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
2440
2480
  f"Page {self.number}: Starting OCR correction with callback '{correction_callback.__name__}' (max_workers={max_workers})"
2441
2481
  )
2442
2482
 
2443
- target_elements_collection = self.find_all(
2444
- selector=selector, apply_exclusions=False
2445
- )
2483
+ target_elements_collection = self.find_all(selector=selector, apply_exclusions=False)
2446
2484
  target_elements = target_elements_collection.elements # Get the list
2447
2485
 
2448
2486
  if not target_elements:
@@ -2451,7 +2489,12 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
2451
2489
 
2452
2490
  element_pbar = None
2453
2491
  try:
2454
- element_pbar = tqdm(total=len(target_elements), desc=f"Correcting OCR Page {self.number}", unit="element", leave=False)
2492
+ element_pbar = tqdm(
2493
+ total=len(target_elements),
2494
+ desc=f"Correcting OCR Page {self.number}",
2495
+ unit="element",
2496
+ leave=False,
2497
+ )
2455
2498
 
2456
2499
  processed_count = 0
2457
2500
  updated_count = 0
natural_pdf/core/pdf.py CHANGED
@@ -24,6 +24,7 @@ from typing import (
24
24
 
25
25
  import pdfplumber
26
26
  from PIL import Image
27
+ from tqdm.auto import tqdm
27
28
 
28
29
  from natural_pdf.analyzers.layout.layout_manager import LayoutManager
29
30
  from natural_pdf.classification.manager import ClassificationError, ClassificationManager
@@ -38,7 +39,6 @@ from natural_pdf.extraction.mixin import ExtractionMixin
38
39
  from natural_pdf.ocr import OCRManager, OCROptions
39
40
  from natural_pdf.selectors.parser import parse_selector
40
41
  from natural_pdf.utils.locks import pdf_render_lock
41
- from tqdm.auto import tqdm
42
42
 
43
43
  try:
44
44
  from typing import Any as TypingAny
@@ -307,7 +307,6 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
307
307
  ) -> "PDF":
308
308
  """
309
309
  Applies OCR to specified pages of the PDF using batch processing.
310
- Applies OCR to specified pages of the PDF using batch processing.
311
310
 
312
311
  Args:
313
312
  engine: Name of the OCR engine
@@ -320,25 +319,27 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
320
319
  replace: Whether to replace existing OCR elements
321
320
  options: Engine-specific options
322
321
  pages: Page indices to process or None for all pages
323
- engine: Name of the OCR engine
324
- languages: List of language codes
325
- min_confidence: Minimum confidence threshold
326
- device: Device to run OCR on
327
- resolution: DPI resolution for page images
328
- apply_exclusions: Whether to mask excluded areas
329
- detect_only: If True, only detect text boxes
330
- replace: Whether to replace existing OCR elements
331
- options: Engine-specific options
332
- pages: Page indices to process or None for all pages
333
322
 
334
323
  Returns:
335
324
  Self for method chaining
336
- Self for method chaining
337
325
  """
338
326
  if not self._ocr_manager:
339
327
  logger.error("OCRManager not available. Cannot apply OCR.")
340
328
  return self
341
329
 
330
+ # Apply global options as defaults, but allow explicit parameters to override
331
+ import natural_pdf
332
+
333
+ # Use global OCR options if parameters are not explicitly set
334
+ if engine is None:
335
+ engine = natural_pdf.options.ocr.engine
336
+ if languages is None:
337
+ languages = natural_pdf.options.ocr.languages
338
+ if min_confidence is None:
339
+ min_confidence = natural_pdf.options.ocr.min_confidence
340
+ if device is None:
341
+ pass # No default device in options.ocr anymore
342
+
342
343
  thread_id = threading.current_thread().name
343
344
  logger.debug(f"[{thread_id}] PDF.apply_ocr starting for {self.path}")
344
345
 
@@ -425,18 +426,14 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
425
426
  logger.info(f"[{thread_id}] Calling OCR Manager with args: {ocr_call_args}...")
426
427
  ocr_start_time = time.monotonic()
427
428
 
428
- try:
429
- batch_results = self._ocr_manager.apply_ocr(**manager_args)
430
-
431
- if not isinstance(batch_results, list) or len(batch_results) != len(images_pil):
432
- logger.error(f"OCR Manager returned unexpected result format or length.")
433
- return self
429
+ batch_results = self._ocr_manager.apply_ocr(**manager_args)
434
430
 
435
- logger.info("OCR Manager batch processing complete.")
436
- except Exception as e:
437
- logger.error(f"Batch OCR processing failed: {e}")
431
+ if not isinstance(batch_results, list) or len(batch_results) != len(images_pil):
432
+ logger.error(f"OCR Manager returned unexpected result format or length.")
438
433
  return self
439
434
 
435
+ logger.info("OCR Manager batch processing complete.")
436
+
440
437
  ocr_end_time = time.monotonic()
441
438
  logger.debug(
442
439
  f"[{thread_id}] OCR processing finished (Duration: {ocr_end_time - ocr_start_time:.2f}s)"
@@ -0,0 +1,21 @@
1
+ """
2
+ Describe functionality for natural-pdf.
3
+
4
+ Provides summary and inspection methods for pages, collections, and regions.
5
+ """
6
+
7
+ from .base import describe_page, describe_collection, inspect_collection, describe_region, describe_element
8
+ from .summary import ElementSummary, InspectionSummary
9
+ from .mixin import DescribeMixin, InspectMixin
10
+
11
+ __all__ = [
12
+ 'describe_page',
13
+ 'describe_collection',
14
+ 'inspect_collection',
15
+ 'describe_region',
16
+ 'describe_element',
17
+ 'ElementSummary',
18
+ 'InspectionSummary',
19
+ 'DescribeMixin',
20
+ 'InspectMixin'
21
+ ]