natural-pdf 0.1.15__py3-none-any.whl → 0.1.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +31 -0
- natural_pdf/analyzers/layout/gemini.py +137 -162
- natural_pdf/analyzers/layout/layout_manager.py +9 -5
- natural_pdf/analyzers/layout/layout_options.py +77 -7
- natural_pdf/analyzers/layout/paddle.py +318 -165
- natural_pdf/analyzers/layout/table_structure_utils.py +78 -0
- natural_pdf/analyzers/shape_detection_mixin.py +770 -405
- natural_pdf/classification/mixin.py +2 -8
- natural_pdf/collections/pdf_collection.py +25 -30
- natural_pdf/core/highlighting_service.py +47 -32
- natural_pdf/core/page.py +117 -75
- natural_pdf/core/pdf.py +19 -22
- natural_pdf/elements/base.py +9 -9
- natural_pdf/elements/collections.py +105 -50
- natural_pdf/elements/region.py +200 -126
- natural_pdf/exporters/paddleocr.py +38 -13
- natural_pdf/flows/__init__.py +3 -3
- natural_pdf/flows/collections.py +303 -132
- natural_pdf/flows/element.py +277 -132
- natural_pdf/flows/flow.py +33 -16
- natural_pdf/flows/region.py +142 -79
- natural_pdf/ocr/engine_doctr.py +37 -4
- natural_pdf/ocr/engine_easyocr.py +23 -3
- natural_pdf/ocr/engine_paddle.py +281 -30
- natural_pdf/ocr/engine_surya.py +8 -3
- natural_pdf/ocr/ocr_manager.py +75 -76
- natural_pdf/ocr/ocr_options.py +52 -87
- natural_pdf/search/__init__.py +25 -12
- natural_pdf/search/lancedb_search_service.py +91 -54
- natural_pdf/search/numpy_search_service.py +86 -65
- natural_pdf/search/searchable_mixin.py +2 -2
- natural_pdf/selectors/parser.py +125 -81
- natural_pdf/widgets/__init__.py +1 -1
- natural_pdf/widgets/viewer.py +205 -449
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/METADATA +27 -45
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/RECORD +39 -38
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/top_level.txt +0 -0
natural_pdf/core/page.py
CHANGED
@@ -51,6 +51,9 @@ from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_t
|
|
51
51
|
from natural_pdf.analyzers.layout.layout_analyzer import LayoutAnalyzer
|
52
52
|
from natural_pdf.analyzers.layout.layout_manager import LayoutManager
|
53
53
|
from natural_pdf.analyzers.layout.layout_options import LayoutOptions
|
54
|
+
|
55
|
+
# --- Shape Detection Mixin --- #
|
56
|
+
from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
|
54
57
|
from natural_pdf.analyzers.text_options import TextStyleOptions
|
55
58
|
from natural_pdf.analyzers.text_structure import TextStyleAnalyzer
|
56
59
|
from natural_pdf.classification.manager import ClassificationManager # For type hint
|
@@ -68,14 +71,12 @@ from natural_pdf.utils.locks import pdf_render_lock # Import the lock
|
|
68
71
|
|
69
72
|
# # Import new utils
|
70
73
|
from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
|
71
|
-
from natural_pdf.widgets import InteractiveViewerWidget
|
72
|
-
from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, SimpleInteractiveViewerWidget
|
74
|
+
from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, InteractiveViewerWidget
|
73
75
|
|
74
76
|
# --- End Classification Imports --- #
|
75
77
|
|
76
78
|
|
77
|
-
|
78
|
-
from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
|
79
|
+
|
79
80
|
# --- End Shape Detection Mixin --- #
|
80
81
|
|
81
82
|
|
@@ -667,13 +668,13 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
|
667
668
|
if selector_obj.get("type") == "or":
|
668
669
|
# For OR selectors, search all elements and let the filter function decide
|
669
670
|
elements_to_search = self._element_mgr.get_all_elements()
|
670
|
-
|
671
|
+
|
671
672
|
# Create filter function from compound selector
|
672
673
|
filter_func = selector_to_filter_func(selector_obj, **kwargs)
|
673
|
-
|
674
|
+
|
674
675
|
# Apply the filter to all elements
|
675
676
|
matching_elements = [element for element in elements_to_search if filter_func(element)]
|
676
|
-
|
677
|
+
|
677
678
|
# Sort elements in reading order if requested
|
678
679
|
if kwargs.get("reading_order", True):
|
679
680
|
if all(hasattr(el, "top") and hasattr(el, "x0") for el in matching_elements):
|
@@ -682,7 +683,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
|
682
683
|
logger.warning(
|
683
684
|
"Cannot sort elements in reading order: Missing required attributes (top, x0)."
|
684
685
|
)
|
685
|
-
|
686
|
+
|
686
687
|
# Return result collection
|
687
688
|
return ElementCollection(matching_elements)
|
688
689
|
|
@@ -1204,7 +1205,9 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
|
1204
1205
|
try:
|
1205
1206
|
tatr_tables = self.find_all("region[type=table][model=tatr]")
|
1206
1207
|
if tatr_tables:
|
1207
|
-
logger.debug(
|
1208
|
+
logger.debug(
|
1209
|
+
f"Page {self.number}: Found {len(tatr_tables)} TATR table regions, extracting from those..."
|
1210
|
+
)
|
1208
1211
|
extracted_tables = []
|
1209
1212
|
for table_region in tatr_tables:
|
1210
1213
|
try:
|
@@ -1212,48 +1215,70 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
|
1212
1215
|
if table_data: # Only add non-empty tables
|
1213
1216
|
extracted_tables.append(table_data)
|
1214
1217
|
except Exception as e:
|
1215
|
-
logger.warning(
|
1216
|
-
|
1218
|
+
logger.warning(
|
1219
|
+
f"Failed to extract table from TATR region {table_region.bbox}: {e}"
|
1220
|
+
)
|
1221
|
+
|
1217
1222
|
if extracted_tables:
|
1218
|
-
logger.debug(
|
1223
|
+
logger.debug(
|
1224
|
+
f"Page {self.number}: Successfully extracted {len(extracted_tables)} tables from TATR regions"
|
1225
|
+
)
|
1219
1226
|
return extracted_tables
|
1220
1227
|
else:
|
1221
|
-
logger.debug(
|
1228
|
+
logger.debug(
|
1229
|
+
f"Page {self.number}: TATR regions found but no tables extracted, falling back to pdfplumber"
|
1230
|
+
)
|
1222
1231
|
else:
|
1223
|
-
logger.debug(
|
1232
|
+
logger.debug(
|
1233
|
+
f"Page {self.number}: No TATR table regions found, using pdfplumber methods"
|
1234
|
+
)
|
1224
1235
|
except Exception as e:
|
1225
|
-
logger.debug(
|
1236
|
+
logger.debug(
|
1237
|
+
f"Page {self.number}: Error checking TATR regions: {e}, falling back to pdfplumber"
|
1238
|
+
)
|
1226
1239
|
|
1227
1240
|
# Auto-detect method if not specified (try lattice first, then stream)
|
1228
1241
|
if method is None:
|
1229
1242
|
logger.debug(f"Page {self.number}: Auto-detecting tables extraction method...")
|
1230
|
-
|
1243
|
+
|
1231
1244
|
# Try lattice first
|
1232
1245
|
try:
|
1233
1246
|
lattice_settings = table_settings.copy()
|
1234
1247
|
lattice_settings.setdefault("vertical_strategy", "lines")
|
1235
1248
|
lattice_settings.setdefault("horizontal_strategy", "lines")
|
1236
|
-
|
1249
|
+
|
1237
1250
|
logger.debug(f"Page {self.number}: Trying 'lattice' method first for tables...")
|
1238
1251
|
lattice_result = self._page.extract_tables(lattice_settings)
|
1239
|
-
|
1252
|
+
|
1240
1253
|
# Check if lattice found meaningful tables
|
1241
|
-
if (
|
1242
|
-
|
1243
|
-
|
1254
|
+
if (
|
1255
|
+
lattice_result
|
1256
|
+
and len(lattice_result) > 0
|
1257
|
+
and any(
|
1258
|
+
any(
|
1259
|
+
any(cell and cell.strip() for cell in row if cell)
|
1260
|
+
for row in table
|
1261
|
+
if table
|
1262
|
+
)
|
1263
|
+
for table in lattice_result
|
1264
|
+
)
|
1265
|
+
):
|
1266
|
+
logger.debug(
|
1267
|
+
f"Page {self.number}: 'lattice' method found {len(lattice_result)} tables"
|
1268
|
+
)
|
1244
1269
|
return lattice_result
|
1245
1270
|
else:
|
1246
1271
|
logger.debug(f"Page {self.number}: 'lattice' method found no meaningful tables")
|
1247
|
-
|
1272
|
+
|
1248
1273
|
except Exception as e:
|
1249
1274
|
logger.debug(f"Page {self.number}: 'lattice' method failed: {e}")
|
1250
|
-
|
1275
|
+
|
1251
1276
|
# Fall back to stream
|
1252
1277
|
logger.debug(f"Page {self.number}: Falling back to 'stream' method for tables...")
|
1253
1278
|
stream_settings = table_settings.copy()
|
1254
1279
|
stream_settings.setdefault("vertical_strategy", "text")
|
1255
1280
|
stream_settings.setdefault("horizontal_strategy", "text")
|
1256
|
-
|
1281
|
+
|
1257
1282
|
return self._page.extract_tables(stream_settings)
|
1258
1283
|
|
1259
1284
|
effective_method = method
|
@@ -1265,7 +1290,9 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
|
1265
1290
|
table_settings.setdefault("vertical_strategy", "text")
|
1266
1291
|
table_settings.setdefault("horizontal_strategy", "text")
|
1267
1292
|
elif effective_method == "lattice":
|
1268
|
-
logger.debug(
|
1293
|
+
logger.debug(
|
1294
|
+
"Using 'lattice' method alias for 'pdfplumber' with line-based strategies."
|
1295
|
+
)
|
1269
1296
|
effective_method = "pdfplumber"
|
1270
1297
|
table_settings.setdefault("vertical_strategy", "lines")
|
1271
1298
|
table_settings.setdefault("horizontal_strategy", "lines")
|
@@ -1555,6 +1582,14 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
|
1555
1582
|
Returns:
|
1556
1583
|
PIL Image of the page, or None if rendering fails.
|
1557
1584
|
"""
|
1585
|
+
# Apply global options as defaults, but allow explicit parameters to override
|
1586
|
+
import natural_pdf
|
1587
|
+
|
1588
|
+
# Use global options if parameters are not explicitly set
|
1589
|
+
if width is None:
|
1590
|
+
width = natural_pdf.options.image.width
|
1591
|
+
if resolution is None and natural_pdf.options.image.resolution is not None:
|
1592
|
+
resolution = natural_pdf.options.image.resolution
|
1558
1593
|
# 1. Create cache key (excluding path)
|
1559
1594
|
cache_key_parts = [
|
1560
1595
|
scale,
|
@@ -1572,19 +1607,23 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
|
1572
1607
|
if isinstance(v, list):
|
1573
1608
|
try:
|
1574
1609
|
v = tuple(v) # Convert lists to tuples
|
1575
|
-
except TypeError:
|
1610
|
+
except TypeError: # pragma: no cover
|
1576
1611
|
# If list contains unhashable items, fall back to repr or skip
|
1577
1612
|
# For simplicity, we'll try to proceed; hashing will fail if v remains unhashable
|
1578
|
-
logger.warning(
|
1613
|
+
logger.warning(
|
1614
|
+
f"Cache key generation: List item in kwargs['{k}'] could not be converted to tuple due to unhashable elements."
|
1615
|
+
)
|
1579
1616
|
sorted_kwargs_list.append((k, v))
|
1580
|
-
|
1617
|
+
|
1581
1618
|
cache_key_parts.append(tuple(sorted_kwargs_list))
|
1582
|
-
|
1619
|
+
|
1583
1620
|
try:
|
1584
1621
|
cache_key = tuple(cache_key_parts)
|
1585
|
-
except TypeError as e:
|
1586
|
-
logger.warning(
|
1587
|
-
|
1622
|
+
except TypeError as e: # pragma: no cover
|
1623
|
+
logger.warning(
|
1624
|
+
f"Page {self.index}: Could not create cache key for to_image due to unhashable item: {e}. Proceeding without cache for this call."
|
1625
|
+
)
|
1626
|
+
cache_key = None # Fallback to not using cache for this call
|
1588
1627
|
|
1589
1628
|
image_to_return: Optional[Image.Image] = None
|
1590
1629
|
|
@@ -1594,7 +1633,9 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
|
1594
1633
|
logger.debug(f"Page {self.index}: Returning cached image for key: {cache_key}")
|
1595
1634
|
else:
|
1596
1635
|
# --- This is the original logic to generate the image ---
|
1597
|
-
rendered_image_component: Optional[Image.Image] =
|
1636
|
+
rendered_image_component: Optional[Image.Image] = (
|
1637
|
+
None # Renamed from 'image' in original
|
1638
|
+
)
|
1598
1639
|
render_resolution = resolution if resolution is not None else scale * 72
|
1599
1640
|
thread_id = threading.current_thread().name
|
1600
1641
|
logger.debug(
|
@@ -1632,29 +1673,31 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
|
1632
1673
|
|
1633
1674
|
if rendered_image_component is None:
|
1634
1675
|
if cache_key is not None:
|
1635
|
-
self._to_image_cache[cache_key] = None
|
1676
|
+
self._to_image_cache[cache_key] = None # Cache the failure
|
1636
1677
|
# Save the image if path is provided (will try to save None, handled by PIL/OS)
|
1637
1678
|
if path:
|
1638
1679
|
try:
|
1639
1680
|
if os.path.dirname(path):
|
1640
1681
|
os.makedirs(os.path.dirname(path), exist_ok=True)
|
1641
|
-
if rendered_image_component is not None:
|
1642
|
-
|
1682
|
+
if rendered_image_component is not None: # Should be None here
|
1683
|
+
rendered_image_component.save(path) # This line won't be hit if None
|
1643
1684
|
# else: logger.debug("Not saving None image") # Not strictly needed
|
1644
|
-
except Exception as save_error:
|
1685
|
+
except Exception as save_error: # pragma: no cover
|
1645
1686
|
logger.error(f"Failed to save image to {path}: {save_error}")
|
1646
1687
|
return None
|
1647
1688
|
|
1648
1689
|
# --- Apply exclusion masking if requested ---
|
1649
1690
|
# This modifies 'rendered_image_component'
|
1650
|
-
image_after_masking = rendered_image_component
|
1691
|
+
image_after_masking = rendered_image_component # Start with the rendered image
|
1651
1692
|
if exclusions == "mask" and self._exclusions:
|
1652
1693
|
try:
|
1653
1694
|
# Ensure image is mutable (RGB or RGBA)
|
1654
1695
|
if image_after_masking.mode not in ("RGB", "RGBA"):
|
1655
1696
|
image_after_masking = image_after_masking.convert("RGB")
|
1656
1697
|
|
1657
|
-
exclusion_regions = self._get_exclusion_regions(
|
1698
|
+
exclusion_regions = self._get_exclusion_regions(
|
1699
|
+
include_callable=True, debug=False
|
1700
|
+
)
|
1658
1701
|
if exclusion_regions:
|
1659
1702
|
draw = ImageDraw.Draw(image_after_masking)
|
1660
1703
|
# Calculate the scaling factor used for the image
|
@@ -1676,12 +1719,12 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
|
1676
1719
|
)
|
1677
1720
|
if img_coords[0] < img_coords[2] and img_coords[1] < img_coords[3]:
|
1678
1721
|
draw.rectangle(img_coords, fill="white")
|
1679
|
-
else:
|
1722
|
+
else: # pragma: no cover
|
1680
1723
|
logger.warning(
|
1681
1724
|
f"Skipping invalid exclusion rect for masking: {img_coords}"
|
1682
1725
|
)
|
1683
1726
|
del draw # Release drawing context
|
1684
|
-
except Exception as mask_error:
|
1727
|
+
except Exception as mask_error: # pragma: no cover
|
1685
1728
|
logger.error(
|
1686
1729
|
f"Error applying exclusion mask to page {self.index}: {mask_error}",
|
1687
1730
|
exc_info=True,
|
@@ -1689,7 +1732,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
|
1689
1732
|
# Continue with potentially unmasked or partially masked image
|
1690
1733
|
|
1691
1734
|
# --- Resize the final image if width is provided ---
|
1692
|
-
image_final_content = image_after_masking
|
1735
|
+
image_final_content = image_after_masking # Start with image after masking
|
1693
1736
|
if width is not None and width > 0 and image_final_content.width > 0:
|
1694
1737
|
aspect_ratio = image_final_content.height / image_final_content.width
|
1695
1738
|
height = int(width * aspect_ratio)
|
@@ -1697,7 +1740,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
|
1697
1740
|
image_final_content = image_final_content.resize(
|
1698
1741
|
(width, height), Image.Resampling.LANCZOS
|
1699
1742
|
)
|
1700
|
-
except Exception as resize_error:
|
1743
|
+
except Exception as resize_error: # pragma: no cover
|
1701
1744
|
logger.warning(f"Could not resize image: {resize_error}")
|
1702
1745
|
# image_final_content remains the un-resized version if resize fails
|
1703
1746
|
|
@@ -1712,11 +1755,11 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
|
1712
1755
|
if path and image_to_return:
|
1713
1756
|
try:
|
1714
1757
|
# Ensure directory exists
|
1715
|
-
if os.path.dirname(path):
|
1758
|
+
if os.path.dirname(path): # Only call makedirs if there's a directory part
|
1716
1759
|
os.makedirs(os.path.dirname(path), exist_ok=True)
|
1717
1760
|
image_to_return.save(path)
|
1718
1761
|
logger.debug(f"Saved page image to: {path}")
|
1719
|
-
except Exception as save_error:
|
1762
|
+
except Exception as save_error: # pragma: no cover
|
1720
1763
|
logger.error(f"Failed to save image to {path}: {save_error}")
|
1721
1764
|
|
1722
1765
|
return image_to_return
|
@@ -1775,24 +1818,20 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
|
1775
1818
|
self._element_mgr.remove_ocr_elements()
|
1776
1819
|
|
1777
1820
|
logger.info(f"Page {self.number}: Delegating apply_ocr to PDF.apply_ocr.")
|
1778
|
-
|
1779
|
-
|
1780
|
-
|
1781
|
-
self.
|
1782
|
-
|
1783
|
-
|
1784
|
-
|
1785
|
-
|
1786
|
-
|
1787
|
-
|
1788
|
-
|
1789
|
-
|
1790
|
-
|
1791
|
-
|
1792
|
-
)
|
1793
|
-
except Exception as e:
|
1794
|
-
logger.error(f"Page {self.number}: Error during delegated OCR call: {e}", exc_info=True)
|
1795
|
-
return self # Return self for chaining
|
1821
|
+
# Delegate to parent PDF, targeting only this page's index
|
1822
|
+
# Pass all relevant parameters through, including apply_exclusions
|
1823
|
+
self._parent.apply_ocr(
|
1824
|
+
pages=[self.index],
|
1825
|
+
engine=engine,
|
1826
|
+
options=options,
|
1827
|
+
languages=languages,
|
1828
|
+
min_confidence=min_confidence,
|
1829
|
+
device=device,
|
1830
|
+
resolution=resolution,
|
1831
|
+
detect_only=detect_only,
|
1832
|
+
apply_exclusions=apply_exclusions,
|
1833
|
+
replace=replace, # Pass the replace parameter to PDF.apply_ocr
|
1834
|
+
)
|
1796
1835
|
|
1797
1836
|
# Return self for chaining
|
1798
1837
|
return self
|
@@ -2313,14 +2352,14 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
|
2313
2352
|
self,
|
2314
2353
|
# elements_to_render: Optional[List['Element']] = None, # No longer needed, from_page handles it
|
2315
2354
|
# include_source_types: List[str] = ['word', 'line', 'rect', 'region'] # No longer needed
|
2316
|
-
) -> Optional["
|
2355
|
+
) -> Optional["InteractiveViewerWidget"]: # Return type hint updated
|
2317
2356
|
"""
|
2318
2357
|
Creates and returns an interactive ipywidget for exploring elements on this page.
|
2319
2358
|
|
2320
|
-
Uses
|
2359
|
+
Uses InteractiveViewerWidget.from_page() to create the viewer.
|
2321
2360
|
|
2322
2361
|
Returns:
|
2323
|
-
A
|
2362
|
+
A InteractiveViewerWidget instance ready for display in Jupyter,
|
2324
2363
|
or None if ipywidgets is not installed or widget creation fails.
|
2325
2364
|
|
2326
2365
|
Raises:
|
@@ -2329,18 +2368,18 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
|
2329
2368
|
ValueError: If image rendering or data preparation fails within from_page.
|
2330
2369
|
"""
|
2331
2370
|
# Check for availability using the imported flag and class variable
|
2332
|
-
if not _IPYWIDGETS_AVAILABLE or
|
2371
|
+
if not _IPYWIDGETS_AVAILABLE or InteractiveViewerWidget is None:
|
2333
2372
|
logger.error(
|
2334
|
-
"Interactive viewer requires
|
2335
|
-
|
2373
|
+
"Interactive viewer requires 'ipywidgets'. "
|
2374
|
+
'Please install with: pip install "ipywidgets>=7.0.0,<10.0.0"'
|
2336
2375
|
)
|
2337
2376
|
# raise ImportError("ipywidgets not found.") # Option 1: Raise error
|
2338
2377
|
return None # Option 2: Return None gracefully
|
2339
2378
|
|
2340
|
-
# If we reach here,
|
2379
|
+
# If we reach here, InteractiveViewerWidget should be the actual class
|
2341
2380
|
try:
|
2342
2381
|
# Pass self (the Page object) to the factory method
|
2343
|
-
return
|
2382
|
+
return InteractiveViewerWidget.from_page(self)
|
2344
2383
|
except Exception as e:
|
2345
2384
|
# Catch potential errors during widget creation (e.g., image rendering)
|
2346
2385
|
logger.error(
|
@@ -2440,9 +2479,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
|
2440
2479
|
f"Page {self.number}: Starting OCR correction with callback '{correction_callback.__name__}' (max_workers={max_workers})"
|
2441
2480
|
)
|
2442
2481
|
|
2443
|
-
target_elements_collection = self.find_all(
|
2444
|
-
selector=selector, apply_exclusions=False
|
2445
|
-
)
|
2482
|
+
target_elements_collection = self.find_all(selector=selector, apply_exclusions=False)
|
2446
2483
|
target_elements = target_elements_collection.elements # Get the list
|
2447
2484
|
|
2448
2485
|
if not target_elements:
|
@@ -2451,7 +2488,12 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
|
2451
2488
|
|
2452
2489
|
element_pbar = None
|
2453
2490
|
try:
|
2454
|
-
element_pbar = tqdm(
|
2491
|
+
element_pbar = tqdm(
|
2492
|
+
total=len(target_elements),
|
2493
|
+
desc=f"Correcting OCR Page {self.number}",
|
2494
|
+
unit="element",
|
2495
|
+
leave=False,
|
2496
|
+
)
|
2455
2497
|
|
2456
2498
|
processed_count = 0
|
2457
2499
|
updated_count = 0
|
natural_pdf/core/pdf.py
CHANGED
@@ -24,6 +24,7 @@ from typing import (
|
|
24
24
|
|
25
25
|
import pdfplumber
|
26
26
|
from PIL import Image
|
27
|
+
from tqdm.auto import tqdm
|
27
28
|
|
28
29
|
from natural_pdf.analyzers.layout.layout_manager import LayoutManager
|
29
30
|
from natural_pdf.classification.manager import ClassificationError, ClassificationManager
|
@@ -38,7 +39,6 @@ from natural_pdf.extraction.mixin import ExtractionMixin
|
|
38
39
|
from natural_pdf.ocr import OCRManager, OCROptions
|
39
40
|
from natural_pdf.selectors.parser import parse_selector
|
40
41
|
from natural_pdf.utils.locks import pdf_render_lock
|
41
|
-
from tqdm.auto import tqdm
|
42
42
|
|
43
43
|
try:
|
44
44
|
from typing import Any as TypingAny
|
@@ -307,7 +307,6 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
307
307
|
) -> "PDF":
|
308
308
|
"""
|
309
309
|
Applies OCR to specified pages of the PDF using batch processing.
|
310
|
-
Applies OCR to specified pages of the PDF using batch processing.
|
311
310
|
|
312
311
|
Args:
|
313
312
|
engine: Name of the OCR engine
|
@@ -320,25 +319,27 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
320
319
|
replace: Whether to replace existing OCR elements
|
321
320
|
options: Engine-specific options
|
322
321
|
pages: Page indices to process or None for all pages
|
323
|
-
engine: Name of the OCR engine
|
324
|
-
languages: List of language codes
|
325
|
-
min_confidence: Minimum confidence threshold
|
326
|
-
device: Device to run OCR on
|
327
|
-
resolution: DPI resolution for page images
|
328
|
-
apply_exclusions: Whether to mask excluded areas
|
329
|
-
detect_only: If True, only detect text boxes
|
330
|
-
replace: Whether to replace existing OCR elements
|
331
|
-
options: Engine-specific options
|
332
|
-
pages: Page indices to process or None for all pages
|
333
322
|
|
334
323
|
Returns:
|
335
324
|
Self for method chaining
|
336
|
-
Self for method chaining
|
337
325
|
"""
|
338
326
|
if not self._ocr_manager:
|
339
327
|
logger.error("OCRManager not available. Cannot apply OCR.")
|
340
328
|
return self
|
341
329
|
|
330
|
+
# Apply global options as defaults, but allow explicit parameters to override
|
331
|
+
import natural_pdf
|
332
|
+
|
333
|
+
# Use global OCR options if parameters are not explicitly set
|
334
|
+
if engine is None:
|
335
|
+
engine = natural_pdf.options.ocr.engine
|
336
|
+
if languages is None:
|
337
|
+
languages = natural_pdf.options.ocr.languages
|
338
|
+
if min_confidence is None:
|
339
|
+
min_confidence = natural_pdf.options.ocr.min_confidence
|
340
|
+
if device is None:
|
341
|
+
pass # No default device in options.ocr anymore
|
342
|
+
|
342
343
|
thread_id = threading.current_thread().name
|
343
344
|
logger.debug(f"[{thread_id}] PDF.apply_ocr starting for {self.path}")
|
344
345
|
|
@@ -425,18 +426,14 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
425
426
|
logger.info(f"[{thread_id}] Calling OCR Manager with args: {ocr_call_args}...")
|
426
427
|
ocr_start_time = time.monotonic()
|
427
428
|
|
428
|
-
|
429
|
-
batch_results = self._ocr_manager.apply_ocr(**manager_args)
|
430
|
-
|
431
|
-
if not isinstance(batch_results, list) or len(batch_results) != len(images_pil):
|
432
|
-
logger.error(f"OCR Manager returned unexpected result format or length.")
|
433
|
-
return self
|
429
|
+
batch_results = self._ocr_manager.apply_ocr(**manager_args)
|
434
430
|
|
435
|
-
|
436
|
-
|
437
|
-
logger.error(f"Batch OCR processing failed: {e}")
|
431
|
+
if not isinstance(batch_results, list) or len(batch_results) != len(images_pil):
|
432
|
+
logger.error(f"OCR Manager returned unexpected result format or length.")
|
438
433
|
return self
|
439
434
|
|
435
|
+
logger.info("OCR Manager batch processing complete.")
|
436
|
+
|
440
437
|
ocr_end_time = time.monotonic()
|
441
438
|
logger.debug(
|
442
439
|
f"[{thread_id}] OCR processing finished (Duration: {ocr_end_time - ocr_start_time:.2f}s)"
|
natural_pdf/elements/base.py
CHANGED
@@ -18,34 +18,34 @@ if TYPE_CHECKING:
|
|
18
18
|
def extract_bbox(obj: Any) -> Optional[Tuple[float, float, float, float]]:
|
19
19
|
"""
|
20
20
|
Extract bounding box coordinates from any object that has bbox properties.
|
21
|
-
|
21
|
+
|
22
22
|
Args:
|
23
23
|
obj: Object that might have bbox coordinates (Element, Region, etc.)
|
24
|
-
|
24
|
+
|
25
25
|
Returns:
|
26
26
|
Tuple of (x0, top, x1, bottom) or None if object doesn't have bbox properties
|
27
27
|
"""
|
28
28
|
# Try bbox property first (most common)
|
29
|
-
if hasattr(obj,
|
29
|
+
if hasattr(obj, "bbox") and obj.bbox is not None:
|
30
30
|
bbox = obj.bbox
|
31
31
|
if isinstance(bbox, (tuple, list)) and len(bbox) == 4:
|
32
32
|
return tuple(float(coord) for coord in bbox)
|
33
|
-
|
33
|
+
|
34
34
|
# Try individual coordinate properties
|
35
|
-
if all(hasattr(obj, attr) for attr in [
|
35
|
+
if all(hasattr(obj, attr) for attr in ["x0", "top", "x1", "bottom"]):
|
36
36
|
try:
|
37
37
|
return (float(obj.x0), float(obj.top), float(obj.x1), float(obj.bottom))
|
38
38
|
except (ValueError, TypeError):
|
39
39
|
pass
|
40
|
-
|
40
|
+
|
41
41
|
# If object is a dict with bbox keys
|
42
42
|
if isinstance(obj, dict):
|
43
|
-
if all(key in obj for key in [
|
43
|
+
if all(key in obj for key in ["x0", "top", "x1", "bottom"]):
|
44
44
|
try:
|
45
|
-
return (float(obj[
|
45
|
+
return (float(obj["x0"]), float(obj["top"]), float(obj["x1"]), float(obj["bottom"]))
|
46
46
|
except (ValueError, TypeError):
|
47
47
|
pass
|
48
|
-
|
48
|
+
|
49
49
|
return None
|
50
50
|
|
51
51
|
|