natural-pdf 0.1.15__py3-none-any.whl → 0.1.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. natural_pdf/__init__.py +31 -0
  2. natural_pdf/analyzers/layout/gemini.py +137 -162
  3. natural_pdf/analyzers/layout/layout_manager.py +9 -5
  4. natural_pdf/analyzers/layout/layout_options.py +77 -7
  5. natural_pdf/analyzers/layout/paddle.py +318 -165
  6. natural_pdf/analyzers/layout/table_structure_utils.py +78 -0
  7. natural_pdf/analyzers/shape_detection_mixin.py +770 -405
  8. natural_pdf/classification/mixin.py +2 -8
  9. natural_pdf/collections/pdf_collection.py +25 -30
  10. natural_pdf/core/highlighting_service.py +47 -32
  11. natural_pdf/core/page.py +117 -75
  12. natural_pdf/core/pdf.py +19 -22
  13. natural_pdf/elements/base.py +9 -9
  14. natural_pdf/elements/collections.py +105 -50
  15. natural_pdf/elements/region.py +200 -126
  16. natural_pdf/exporters/paddleocr.py +38 -13
  17. natural_pdf/flows/__init__.py +3 -3
  18. natural_pdf/flows/collections.py +303 -132
  19. natural_pdf/flows/element.py +277 -132
  20. natural_pdf/flows/flow.py +33 -16
  21. natural_pdf/flows/region.py +142 -79
  22. natural_pdf/ocr/engine_doctr.py +37 -4
  23. natural_pdf/ocr/engine_easyocr.py +23 -3
  24. natural_pdf/ocr/engine_paddle.py +281 -30
  25. natural_pdf/ocr/engine_surya.py +8 -3
  26. natural_pdf/ocr/ocr_manager.py +75 -76
  27. natural_pdf/ocr/ocr_options.py +52 -87
  28. natural_pdf/search/__init__.py +25 -12
  29. natural_pdf/search/lancedb_search_service.py +91 -54
  30. natural_pdf/search/numpy_search_service.py +86 -65
  31. natural_pdf/search/searchable_mixin.py +2 -2
  32. natural_pdf/selectors/parser.py +125 -81
  33. natural_pdf/widgets/__init__.py +1 -1
  34. natural_pdf/widgets/viewer.py +205 -449
  35. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/METADATA +27 -45
  36. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/RECORD +39 -38
  37. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/WHEEL +0 -0
  38. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/licenses/LICENSE +0 -0
  39. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/top_level.txt +0 -0
natural_pdf/core/page.py CHANGED
@@ -51,6 +51,9 @@ from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_t
51
51
  from natural_pdf.analyzers.layout.layout_analyzer import LayoutAnalyzer
52
52
  from natural_pdf.analyzers.layout.layout_manager import LayoutManager
53
53
  from natural_pdf.analyzers.layout.layout_options import LayoutOptions
54
+
55
+ # --- Shape Detection Mixin --- #
56
+ from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
54
57
  from natural_pdf.analyzers.text_options import TextStyleOptions
55
58
  from natural_pdf.analyzers.text_structure import TextStyleAnalyzer
56
59
  from natural_pdf.classification.manager import ClassificationManager # For type hint
@@ -68,14 +71,12 @@ from natural_pdf.utils.locks import pdf_render_lock # Import the lock
68
71
 
69
72
  # # Import new utils
70
73
  from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
71
- from natural_pdf.widgets import InteractiveViewerWidget
72
- from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, SimpleInteractiveViewerWidget
74
+ from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, InteractiveViewerWidget
73
75
 
74
76
  # --- End Classification Imports --- #
75
77
 
76
78
 
77
- # --- Shape Detection Mixin --- #
78
- from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
79
+
79
80
  # --- End Shape Detection Mixin --- #
80
81
 
81
82
 
@@ -667,13 +668,13 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
667
668
  if selector_obj.get("type") == "or":
668
669
  # For OR selectors, search all elements and let the filter function decide
669
670
  elements_to_search = self._element_mgr.get_all_elements()
670
-
671
+
671
672
  # Create filter function from compound selector
672
673
  filter_func = selector_to_filter_func(selector_obj, **kwargs)
673
-
674
+
674
675
  # Apply the filter to all elements
675
676
  matching_elements = [element for element in elements_to_search if filter_func(element)]
676
-
677
+
677
678
  # Sort elements in reading order if requested
678
679
  if kwargs.get("reading_order", True):
679
680
  if all(hasattr(el, "top") and hasattr(el, "x0") for el in matching_elements):
@@ -682,7 +683,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
682
683
  logger.warning(
683
684
  "Cannot sort elements in reading order: Missing required attributes (top, x0)."
684
685
  )
685
-
686
+
686
687
  # Return result collection
687
688
  return ElementCollection(matching_elements)
688
689
 
@@ -1204,7 +1205,9 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
1204
1205
  try:
1205
1206
  tatr_tables = self.find_all("region[type=table][model=tatr]")
1206
1207
  if tatr_tables:
1207
- logger.debug(f"Page {self.number}: Found {len(tatr_tables)} TATR table regions, extracting from those...")
1208
+ logger.debug(
1209
+ f"Page {self.number}: Found {len(tatr_tables)} TATR table regions, extracting from those..."
1210
+ )
1208
1211
  extracted_tables = []
1209
1212
  for table_region in tatr_tables:
1210
1213
  try:
@@ -1212,48 +1215,70 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
1212
1215
  if table_data: # Only add non-empty tables
1213
1216
  extracted_tables.append(table_data)
1214
1217
  except Exception as e:
1215
- logger.warning(f"Failed to extract table from TATR region {table_region.bbox}: {e}")
1216
-
1218
+ logger.warning(
1219
+ f"Failed to extract table from TATR region {table_region.bbox}: {e}"
1220
+ )
1221
+
1217
1222
  if extracted_tables:
1218
- logger.debug(f"Page {self.number}: Successfully extracted {len(extracted_tables)} tables from TATR regions")
1223
+ logger.debug(
1224
+ f"Page {self.number}: Successfully extracted {len(extracted_tables)} tables from TATR regions"
1225
+ )
1219
1226
  return extracted_tables
1220
1227
  else:
1221
- logger.debug(f"Page {self.number}: TATR regions found but no tables extracted, falling back to pdfplumber")
1228
+ logger.debug(
1229
+ f"Page {self.number}: TATR regions found but no tables extracted, falling back to pdfplumber"
1230
+ )
1222
1231
  else:
1223
- logger.debug(f"Page {self.number}: No TATR table regions found, using pdfplumber methods")
1232
+ logger.debug(
1233
+ f"Page {self.number}: No TATR table regions found, using pdfplumber methods"
1234
+ )
1224
1235
  except Exception as e:
1225
- logger.debug(f"Page {self.number}: Error checking TATR regions: {e}, falling back to pdfplumber")
1236
+ logger.debug(
1237
+ f"Page {self.number}: Error checking TATR regions: {e}, falling back to pdfplumber"
1238
+ )
1226
1239
 
1227
1240
  # Auto-detect method if not specified (try lattice first, then stream)
1228
1241
  if method is None:
1229
1242
  logger.debug(f"Page {self.number}: Auto-detecting tables extraction method...")
1230
-
1243
+
1231
1244
  # Try lattice first
1232
1245
  try:
1233
1246
  lattice_settings = table_settings.copy()
1234
1247
  lattice_settings.setdefault("vertical_strategy", "lines")
1235
1248
  lattice_settings.setdefault("horizontal_strategy", "lines")
1236
-
1249
+
1237
1250
  logger.debug(f"Page {self.number}: Trying 'lattice' method first for tables...")
1238
1251
  lattice_result = self._page.extract_tables(lattice_settings)
1239
-
1252
+
1240
1253
  # Check if lattice found meaningful tables
1241
- if (lattice_result and len(lattice_result) > 0 and
1242
- any(any(any(cell and cell.strip() for cell in row if cell) for row in table if table) for table in lattice_result)):
1243
- logger.debug(f"Page {self.number}: 'lattice' method found {len(lattice_result)} tables")
1254
+ if (
1255
+ lattice_result
1256
+ and len(lattice_result) > 0
1257
+ and any(
1258
+ any(
1259
+ any(cell and cell.strip() for cell in row if cell)
1260
+ for row in table
1261
+ if table
1262
+ )
1263
+ for table in lattice_result
1264
+ )
1265
+ ):
1266
+ logger.debug(
1267
+ f"Page {self.number}: 'lattice' method found {len(lattice_result)} tables"
1268
+ )
1244
1269
  return lattice_result
1245
1270
  else:
1246
1271
  logger.debug(f"Page {self.number}: 'lattice' method found no meaningful tables")
1247
-
1272
+
1248
1273
  except Exception as e:
1249
1274
  logger.debug(f"Page {self.number}: 'lattice' method failed: {e}")
1250
-
1275
+
1251
1276
  # Fall back to stream
1252
1277
  logger.debug(f"Page {self.number}: Falling back to 'stream' method for tables...")
1253
1278
  stream_settings = table_settings.copy()
1254
1279
  stream_settings.setdefault("vertical_strategy", "text")
1255
1280
  stream_settings.setdefault("horizontal_strategy", "text")
1256
-
1281
+
1257
1282
  return self._page.extract_tables(stream_settings)
1258
1283
 
1259
1284
  effective_method = method
@@ -1265,7 +1290,9 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
1265
1290
  table_settings.setdefault("vertical_strategy", "text")
1266
1291
  table_settings.setdefault("horizontal_strategy", "text")
1267
1292
  elif effective_method == "lattice":
1268
- logger.debug("Using 'lattice' method alias for 'pdfplumber' with line-based strategies.")
1293
+ logger.debug(
1294
+ "Using 'lattice' method alias for 'pdfplumber' with line-based strategies."
1295
+ )
1269
1296
  effective_method = "pdfplumber"
1270
1297
  table_settings.setdefault("vertical_strategy", "lines")
1271
1298
  table_settings.setdefault("horizontal_strategy", "lines")
@@ -1555,6 +1582,14 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
1555
1582
  Returns:
1556
1583
  PIL Image of the page, or None if rendering fails.
1557
1584
  """
1585
+ # Apply global options as defaults, but allow explicit parameters to override
1586
+ import natural_pdf
1587
+
1588
+ # Use global options if parameters are not explicitly set
1589
+ if width is None:
1590
+ width = natural_pdf.options.image.width
1591
+ if resolution is None and natural_pdf.options.image.resolution is not None:
1592
+ resolution = natural_pdf.options.image.resolution
1558
1593
  # 1. Create cache key (excluding path)
1559
1594
  cache_key_parts = [
1560
1595
  scale,
@@ -1572,19 +1607,23 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
1572
1607
  if isinstance(v, list):
1573
1608
  try:
1574
1609
  v = tuple(v) # Convert lists to tuples
1575
- except TypeError: # pragma: no cover
1610
+ except TypeError: # pragma: no cover
1576
1611
  # If list contains unhashable items, fall back to repr or skip
1577
1612
  # For simplicity, we'll try to proceed; hashing will fail if v remains unhashable
1578
- logger.warning(f"Cache key generation: List item in kwargs['{k}'] could not be converted to tuple due to unhashable elements.")
1613
+ logger.warning(
1614
+ f"Cache key generation: List item in kwargs['{k}'] could not be converted to tuple due to unhashable elements."
1615
+ )
1579
1616
  sorted_kwargs_list.append((k, v))
1580
-
1617
+
1581
1618
  cache_key_parts.append(tuple(sorted_kwargs_list))
1582
-
1619
+
1583
1620
  try:
1584
1621
  cache_key = tuple(cache_key_parts)
1585
- except TypeError as e: # pragma: no cover
1586
- logger.warning(f"Page {self.index}: Could not create cache key for to_image due to unhashable item: {e}. Proceeding without cache for this call.")
1587
- cache_key = None # Fallback to not using cache for this call
1622
+ except TypeError as e: # pragma: no cover
1623
+ logger.warning(
1624
+ f"Page {self.index}: Could not create cache key for to_image due to unhashable item: {e}. Proceeding without cache for this call."
1625
+ )
1626
+ cache_key = None # Fallback to not using cache for this call
1588
1627
 
1589
1628
  image_to_return: Optional[Image.Image] = None
1590
1629
 
@@ -1594,7 +1633,9 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
1594
1633
  logger.debug(f"Page {self.index}: Returning cached image for key: {cache_key}")
1595
1634
  else:
1596
1635
  # --- This is the original logic to generate the image ---
1597
- rendered_image_component: Optional[Image.Image] = None # Renamed from 'image' in original
1636
+ rendered_image_component: Optional[Image.Image] = (
1637
+ None # Renamed from 'image' in original
1638
+ )
1598
1639
  render_resolution = resolution if resolution is not None else scale * 72
1599
1640
  thread_id = threading.current_thread().name
1600
1641
  logger.debug(
@@ -1632,29 +1673,31 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
1632
1673
 
1633
1674
  if rendered_image_component is None:
1634
1675
  if cache_key is not None:
1635
- self._to_image_cache[cache_key] = None # Cache the failure
1676
+ self._to_image_cache[cache_key] = None # Cache the failure
1636
1677
  # Save the image if path is provided (will try to save None, handled by PIL/OS)
1637
1678
  if path:
1638
1679
  try:
1639
1680
  if os.path.dirname(path):
1640
1681
  os.makedirs(os.path.dirname(path), exist_ok=True)
1641
- if rendered_image_component is not None: # Should be None here
1642
- rendered_image_component.save(path) # This line won't be hit if None
1682
+ if rendered_image_component is not None: # Should be None here
1683
+ rendered_image_component.save(path) # This line won't be hit if None
1643
1684
  # else: logger.debug("Not saving None image") # Not strictly needed
1644
- except Exception as save_error: # pragma: no cover
1685
+ except Exception as save_error: # pragma: no cover
1645
1686
  logger.error(f"Failed to save image to {path}: {save_error}")
1646
1687
  return None
1647
1688
 
1648
1689
  # --- Apply exclusion masking if requested ---
1649
1690
  # This modifies 'rendered_image_component'
1650
- image_after_masking = rendered_image_component # Start with the rendered image
1691
+ image_after_masking = rendered_image_component # Start with the rendered image
1651
1692
  if exclusions == "mask" and self._exclusions:
1652
1693
  try:
1653
1694
  # Ensure image is mutable (RGB or RGBA)
1654
1695
  if image_after_masking.mode not in ("RGB", "RGBA"):
1655
1696
  image_after_masking = image_after_masking.convert("RGB")
1656
1697
 
1657
- exclusion_regions = self._get_exclusion_regions(include_callable=True, debug=False)
1698
+ exclusion_regions = self._get_exclusion_regions(
1699
+ include_callable=True, debug=False
1700
+ )
1658
1701
  if exclusion_regions:
1659
1702
  draw = ImageDraw.Draw(image_after_masking)
1660
1703
  # Calculate the scaling factor used for the image
@@ -1676,12 +1719,12 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
1676
1719
  )
1677
1720
  if img_coords[0] < img_coords[2] and img_coords[1] < img_coords[3]:
1678
1721
  draw.rectangle(img_coords, fill="white")
1679
- else: # pragma: no cover
1722
+ else: # pragma: no cover
1680
1723
  logger.warning(
1681
1724
  f"Skipping invalid exclusion rect for masking: {img_coords}"
1682
1725
  )
1683
1726
  del draw # Release drawing context
1684
- except Exception as mask_error: # pragma: no cover
1727
+ except Exception as mask_error: # pragma: no cover
1685
1728
  logger.error(
1686
1729
  f"Error applying exclusion mask to page {self.index}: {mask_error}",
1687
1730
  exc_info=True,
@@ -1689,7 +1732,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
1689
1732
  # Continue with potentially unmasked or partially masked image
1690
1733
 
1691
1734
  # --- Resize the final image if width is provided ---
1692
- image_final_content = image_after_masking # Start with image after masking
1735
+ image_final_content = image_after_masking # Start with image after masking
1693
1736
  if width is not None and width > 0 and image_final_content.width > 0:
1694
1737
  aspect_ratio = image_final_content.height / image_final_content.width
1695
1738
  height = int(width * aspect_ratio)
@@ -1697,7 +1740,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
1697
1740
  image_final_content = image_final_content.resize(
1698
1741
  (width, height), Image.Resampling.LANCZOS
1699
1742
  )
1700
- except Exception as resize_error: # pragma: no cover
1743
+ except Exception as resize_error: # pragma: no cover
1701
1744
  logger.warning(f"Could not resize image: {resize_error}")
1702
1745
  # image_final_content remains the un-resized version if resize fails
1703
1746
 
@@ -1712,11 +1755,11 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
1712
1755
  if path and image_to_return:
1713
1756
  try:
1714
1757
  # Ensure directory exists
1715
- if os.path.dirname(path): # Only call makedirs if there's a directory part
1758
+ if os.path.dirname(path): # Only call makedirs if there's a directory part
1716
1759
  os.makedirs(os.path.dirname(path), exist_ok=True)
1717
1760
  image_to_return.save(path)
1718
1761
  logger.debug(f"Saved page image to: {path}")
1719
- except Exception as save_error: # pragma: no cover
1762
+ except Exception as save_error: # pragma: no cover
1720
1763
  logger.error(f"Failed to save image to {path}: {save_error}")
1721
1764
 
1722
1765
  return image_to_return
@@ -1775,24 +1818,20 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
1775
1818
  self._element_mgr.remove_ocr_elements()
1776
1819
 
1777
1820
  logger.info(f"Page {self.number}: Delegating apply_ocr to PDF.apply_ocr.")
1778
- try:
1779
- # Delegate to parent PDF, targeting only this page's index
1780
- # Pass all relevant parameters through, including apply_exclusions
1781
- self._parent.apply_ocr(
1782
- pages=[self.index],
1783
- engine=engine,
1784
- options=options,
1785
- languages=languages,
1786
- min_confidence=min_confidence,
1787
- device=device,
1788
- resolution=resolution,
1789
- detect_only=detect_only,
1790
- apply_exclusions=apply_exclusions,
1791
- replace=replace, # Pass the replace parameter to PDF.apply_ocr
1792
- )
1793
- except Exception as e:
1794
- logger.error(f"Page {self.number}: Error during delegated OCR call: {e}", exc_info=True)
1795
- return self # Return self for chaining
1821
+ # Delegate to parent PDF, targeting only this page's index
1822
+ # Pass all relevant parameters through, including apply_exclusions
1823
+ self._parent.apply_ocr(
1824
+ pages=[self.index],
1825
+ engine=engine,
1826
+ options=options,
1827
+ languages=languages,
1828
+ min_confidence=min_confidence,
1829
+ device=device,
1830
+ resolution=resolution,
1831
+ detect_only=detect_only,
1832
+ apply_exclusions=apply_exclusions,
1833
+ replace=replace, # Pass the replace parameter to PDF.apply_ocr
1834
+ )
1796
1835
 
1797
1836
  # Return self for chaining
1798
1837
  return self
@@ -2313,14 +2352,14 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
2313
2352
  self,
2314
2353
  # elements_to_render: Optional[List['Element']] = None, # No longer needed, from_page handles it
2315
2354
  # include_source_types: List[str] = ['word', 'line', 'rect', 'region'] # No longer needed
2316
- ) -> Optional["SimpleInteractiveViewerWidget"]: # Return type hint updated
2355
+ ) -> Optional["InteractiveViewerWidget"]: # Return type hint updated
2317
2356
  """
2318
2357
  Creates and returns an interactive ipywidget for exploring elements on this page.
2319
2358
 
2320
- Uses SimpleInteractiveViewerWidget.from_page() to create the viewer.
2359
+ Uses InteractiveViewerWidget.from_page() to create the viewer.
2321
2360
 
2322
2361
  Returns:
2323
- A SimpleInteractiveViewerWidget instance ready for display in Jupyter,
2362
+ A InteractiveViewerWidget instance ready for display in Jupyter,
2324
2363
  or None if ipywidgets is not installed or widget creation fails.
2325
2364
 
2326
2365
  Raises:
@@ -2329,18 +2368,18 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
2329
2368
  ValueError: If image rendering or data preparation fails within from_page.
2330
2369
  """
2331
2370
  # Check for availability using the imported flag and class variable
2332
- if not _IPYWIDGETS_AVAILABLE or SimpleInteractiveViewerWidget is None:
2371
+ if not _IPYWIDGETS_AVAILABLE or InteractiveViewerWidget is None:
2333
2372
  logger.error(
2334
- "Interactive viewer requires optional dependencies ('ipywidgets'). "
2335
- "Install with `pip install natural-pdf[viewer]`"
2373
+ "Interactive viewer requires 'ipywidgets'. "
2374
+ 'Please install with: pip install "ipywidgets>=7.0.0,<10.0.0"'
2336
2375
  )
2337
2376
  # raise ImportError("ipywidgets not found.") # Option 1: Raise error
2338
2377
  return None # Option 2: Return None gracefully
2339
2378
 
2340
- # If we reach here, SimpleInteractiveViewerWidget should be the actual class
2379
+ # If we reach here, InteractiveViewerWidget should be the actual class
2341
2380
  try:
2342
2381
  # Pass self (the Page object) to the factory method
2343
- return SimpleInteractiveViewerWidget.from_page(self)
2382
+ return InteractiveViewerWidget.from_page(self)
2344
2383
  except Exception as e:
2345
2384
  # Catch potential errors during widget creation (e.g., image rendering)
2346
2385
  logger.error(
@@ -2440,9 +2479,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
2440
2479
  f"Page {self.number}: Starting OCR correction with callback '{correction_callback.__name__}' (max_workers={max_workers})"
2441
2480
  )
2442
2481
 
2443
- target_elements_collection = self.find_all(
2444
- selector=selector, apply_exclusions=False
2445
- )
2482
+ target_elements_collection = self.find_all(selector=selector, apply_exclusions=False)
2446
2483
  target_elements = target_elements_collection.elements # Get the list
2447
2484
 
2448
2485
  if not target_elements:
@@ -2451,7 +2488,12 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
2451
2488
 
2452
2489
  element_pbar = None
2453
2490
  try:
2454
- element_pbar = tqdm(total=len(target_elements), desc=f"Correcting OCR Page {self.number}", unit="element", leave=False)
2491
+ element_pbar = tqdm(
2492
+ total=len(target_elements),
2493
+ desc=f"Correcting OCR Page {self.number}",
2494
+ unit="element",
2495
+ leave=False,
2496
+ )
2455
2497
 
2456
2498
  processed_count = 0
2457
2499
  updated_count = 0
natural_pdf/core/pdf.py CHANGED
@@ -24,6 +24,7 @@ from typing import (
24
24
 
25
25
  import pdfplumber
26
26
  from PIL import Image
27
+ from tqdm.auto import tqdm
27
28
 
28
29
  from natural_pdf.analyzers.layout.layout_manager import LayoutManager
29
30
  from natural_pdf.classification.manager import ClassificationError, ClassificationManager
@@ -38,7 +39,6 @@ from natural_pdf.extraction.mixin import ExtractionMixin
38
39
  from natural_pdf.ocr import OCRManager, OCROptions
39
40
  from natural_pdf.selectors.parser import parse_selector
40
41
  from natural_pdf.utils.locks import pdf_render_lock
41
- from tqdm.auto import tqdm
42
42
 
43
43
  try:
44
44
  from typing import Any as TypingAny
@@ -307,7 +307,6 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
307
307
  ) -> "PDF":
308
308
  """
309
309
  Applies OCR to specified pages of the PDF using batch processing.
310
- Applies OCR to specified pages of the PDF using batch processing.
311
310
 
312
311
  Args:
313
312
  engine: Name of the OCR engine
@@ -320,25 +319,27 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
320
319
  replace: Whether to replace existing OCR elements
321
320
  options: Engine-specific options
322
321
  pages: Page indices to process or None for all pages
323
- engine: Name of the OCR engine
324
- languages: List of language codes
325
- min_confidence: Minimum confidence threshold
326
- device: Device to run OCR on
327
- resolution: DPI resolution for page images
328
- apply_exclusions: Whether to mask excluded areas
329
- detect_only: If True, only detect text boxes
330
- replace: Whether to replace existing OCR elements
331
- options: Engine-specific options
332
- pages: Page indices to process or None for all pages
333
322
 
334
323
  Returns:
335
324
  Self for method chaining
336
- Self for method chaining
337
325
  """
338
326
  if not self._ocr_manager:
339
327
  logger.error("OCRManager not available. Cannot apply OCR.")
340
328
  return self
341
329
 
330
+ # Apply global options as defaults, but allow explicit parameters to override
331
+ import natural_pdf
332
+
333
+ # Use global OCR options if parameters are not explicitly set
334
+ if engine is None:
335
+ engine = natural_pdf.options.ocr.engine
336
+ if languages is None:
337
+ languages = natural_pdf.options.ocr.languages
338
+ if min_confidence is None:
339
+ min_confidence = natural_pdf.options.ocr.min_confidence
340
+ if device is None:
341
+ pass # No default device in options.ocr anymore
342
+
342
343
  thread_id = threading.current_thread().name
343
344
  logger.debug(f"[{thread_id}] PDF.apply_ocr starting for {self.path}")
344
345
 
@@ -425,18 +426,14 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
425
426
  logger.info(f"[{thread_id}] Calling OCR Manager with args: {ocr_call_args}...")
426
427
  ocr_start_time = time.monotonic()
427
428
 
428
- try:
429
- batch_results = self._ocr_manager.apply_ocr(**manager_args)
430
-
431
- if not isinstance(batch_results, list) or len(batch_results) != len(images_pil):
432
- logger.error(f"OCR Manager returned unexpected result format or length.")
433
- return self
429
+ batch_results = self._ocr_manager.apply_ocr(**manager_args)
434
430
 
435
- logger.info("OCR Manager batch processing complete.")
436
- except Exception as e:
437
- logger.error(f"Batch OCR processing failed: {e}")
431
+ if not isinstance(batch_results, list) or len(batch_results) != len(images_pil):
432
+ logger.error(f"OCR Manager returned unexpected result format or length.")
438
433
  return self
439
434
 
435
+ logger.info("OCR Manager batch processing complete.")
436
+
440
437
  ocr_end_time = time.monotonic()
441
438
  logger.debug(
442
439
  f"[{thread_id}] OCR processing finished (Duration: {ocr_end_time - ocr_start_time:.2f}s)"
natural_pdf/elements/base.py CHANGED
@@ -18,34 +18,34 @@ if TYPE_CHECKING:
18
18
  def extract_bbox(obj: Any) -> Optional[Tuple[float, float, float, float]]:
19
19
  """
20
20
  Extract bounding box coordinates from any object that has bbox properties.
21
-
21
+
22
22
  Args:
23
23
  obj: Object that might have bbox coordinates (Element, Region, etc.)
24
-
24
+
25
25
  Returns:
26
26
  Tuple of (x0, top, x1, bottom) or None if object doesn't have bbox properties
27
27
  """
28
28
  # Try bbox property first (most common)
29
- if hasattr(obj, 'bbox') and obj.bbox is not None:
29
+ if hasattr(obj, "bbox") and obj.bbox is not None:
30
30
  bbox = obj.bbox
31
31
  if isinstance(bbox, (tuple, list)) and len(bbox) == 4:
32
32
  return tuple(float(coord) for coord in bbox)
33
-
33
+
34
34
  # Try individual coordinate properties
35
- if all(hasattr(obj, attr) for attr in ['x0', 'top', 'x1', 'bottom']):
35
+ if all(hasattr(obj, attr) for attr in ["x0", "top", "x1", "bottom"]):
36
36
  try:
37
37
  return (float(obj.x0), float(obj.top), float(obj.x1), float(obj.bottom))
38
38
  except (ValueError, TypeError):
39
39
  pass
40
-
40
+
41
41
  # If object is a dict with bbox keys
42
42
  if isinstance(obj, dict):
43
- if all(key in obj for key in ['x0', 'top', 'x1', 'bottom']):
43
+ if all(key in obj for key in ["x0", "top", "x1", "bottom"]):
44
44
  try:
45
- return (float(obj['x0']), float(obj['top']), float(obj['x1']), float(obj['bottom']))
45
+ return (float(obj["x0"]), float(obj["top"]), float(obj["x1"]), float(obj["bottom"]))
46
46
  except (ValueError, TypeError):
47
47
  pass
48
-
48
+
49
49
  return None
50
50
 
51
51