natural-pdf: natural_pdf-0.1.15-py3-none-any.whl → natural_pdf-0.1.17-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +31 -0
- natural_pdf/analyzers/layout/gemini.py +137 -162
- natural_pdf/analyzers/layout/layout_manager.py +9 -5
- natural_pdf/analyzers/layout/layout_options.py +77 -7
- natural_pdf/analyzers/layout/paddle.py +318 -165
- natural_pdf/analyzers/layout/table_structure_utils.py +78 -0
- natural_pdf/analyzers/shape_detection_mixin.py +770 -405
- natural_pdf/classification/mixin.py +2 -8
- natural_pdf/collections/pdf_collection.py +25 -30
- natural_pdf/core/highlighting_service.py +47 -32
- natural_pdf/core/page.py +119 -76
- natural_pdf/core/pdf.py +19 -22
- natural_pdf/describe/__init__.py +21 -0
- natural_pdf/describe/base.py +457 -0
- natural_pdf/describe/elements.py +411 -0
- natural_pdf/describe/mixin.py +84 -0
- natural_pdf/describe/summary.py +186 -0
- natural_pdf/elements/base.py +11 -10
- natural_pdf/elements/collections.py +116 -51
- natural_pdf/elements/region.py +204 -127
- natural_pdf/exporters/paddleocr.py +38 -13
- natural_pdf/flows/__init__.py +3 -3
- natural_pdf/flows/collections.py +303 -132
- natural_pdf/flows/element.py +277 -132
- natural_pdf/flows/flow.py +33 -16
- natural_pdf/flows/region.py +142 -79
- natural_pdf/ocr/engine_doctr.py +37 -4
- natural_pdf/ocr/engine_easyocr.py +23 -3
- natural_pdf/ocr/engine_paddle.py +281 -30
- natural_pdf/ocr/engine_surya.py +8 -3
- natural_pdf/ocr/ocr_manager.py +75 -76
- natural_pdf/ocr/ocr_options.py +52 -87
- natural_pdf/search/__init__.py +25 -12
- natural_pdf/search/lancedb_search_service.py +91 -54
- natural_pdf/search/numpy_search_service.py +86 -65
- natural_pdf/search/searchable_mixin.py +2 -2
- natural_pdf/selectors/parser.py +125 -81
- natural_pdf/widgets/__init__.py +1 -1
- natural_pdf/widgets/viewer.py +205 -449
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/METADATA +27 -45
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/RECORD +44 -38
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/top_level.txt +0 -0
natural_pdf/core/page.py
CHANGED
@@ -51,6 +51,9 @@ from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_t
|
|
51
51
|
from natural_pdf.analyzers.layout.layout_analyzer import LayoutAnalyzer
|
52
52
|
from natural_pdf.analyzers.layout.layout_manager import LayoutManager
|
53
53
|
from natural_pdf.analyzers.layout.layout_options import LayoutOptions
|
54
|
+
|
55
|
+
# --- Shape Detection Mixin --- #
|
56
|
+
from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
|
54
57
|
from natural_pdf.analyzers.text_options import TextStyleOptions
|
55
58
|
from natural_pdf.analyzers.text_structure import TextStyleAnalyzer
|
56
59
|
from natural_pdf.classification.manager import ClassificationManager # For type hint
|
@@ -58,6 +61,7 @@ from natural_pdf.classification.manager import ClassificationManager # For type
|
|
58
61
|
# # --- Classification Imports --- #
|
59
62
|
from natural_pdf.classification.mixin import ClassificationMixin # Import classification mixin
|
60
63
|
from natural_pdf.core.element_manager import ElementManager
|
64
|
+
from natural_pdf.describe.mixin import DescribeMixin # Import describe mixin
|
61
65
|
from natural_pdf.elements.base import Element # Import base element
|
62
66
|
from natural_pdf.elements.text import TextElement
|
63
67
|
from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
|
@@ -68,14 +72,12 @@ from natural_pdf.utils.locks import pdf_render_lock # Import the lock
|
|
68
72
|
|
69
73
|
# # Import new utils
|
70
74
|
from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
|
71
|
-
from natural_pdf.widgets import InteractiveViewerWidget
|
72
|
-
from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, SimpleInteractiveViewerWidget
|
75
|
+
from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, InteractiveViewerWidget
|
73
76
|
|
74
77
|
# --- End Classification Imports --- #
|
75
78
|
|
76
79
|
|
77
|
-
|
78
|
-
from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
|
80
|
+
|
79
81
|
# --- End Shape Detection Mixin --- #
|
80
82
|
|
81
83
|
|
@@ -91,7 +93,7 @@ except ImportError:
|
|
91
93
|
logger = logging.getLogger(__name__)
|
92
94
|
|
93
95
|
|
94
|
-
class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
96
|
+
class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMixin):
|
95
97
|
"""
|
96
98
|
Enhanced Page wrapper built on top of pdfplumber.Page.
|
97
99
|
|
@@ -667,13 +669,13 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
|
667
669
|
if selector_obj.get("type") == "or":
|
668
670
|
# For OR selectors, search all elements and let the filter function decide
|
669
671
|
elements_to_search = self._element_mgr.get_all_elements()
|
670
|
-
|
672
|
+
|
671
673
|
# Create filter function from compound selector
|
672
674
|
filter_func = selector_to_filter_func(selector_obj, **kwargs)
|
673
|
-
|
675
|
+
|
674
676
|
# Apply the filter to all elements
|
675
677
|
matching_elements = [element for element in elements_to_search if filter_func(element)]
|
676
|
-
|
678
|
+
|
677
679
|
# Sort elements in reading order if requested
|
678
680
|
if kwargs.get("reading_order", True):
|
679
681
|
if all(hasattr(el, "top") and hasattr(el, "x0") for el in matching_elements):
|
@@ -682,7 +684,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
|
682
684
|
logger.warning(
|
683
685
|
"Cannot sort elements in reading order: Missing required attributes (top, x0)."
|
684
686
|
)
|
685
|
-
|
687
|
+
|
686
688
|
# Return result collection
|
687
689
|
return ElementCollection(matching_elements)
|
688
690
|
|
@@ -1204,7 +1206,9 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
|
1204
1206
|
try:
|
1205
1207
|
tatr_tables = self.find_all("region[type=table][model=tatr]")
|
1206
1208
|
if tatr_tables:
|
1207
|
-
logger.debug(
|
1209
|
+
logger.debug(
|
1210
|
+
f"Page {self.number}: Found {len(tatr_tables)} TATR table regions, extracting from those..."
|
1211
|
+
)
|
1208
1212
|
extracted_tables = []
|
1209
1213
|
for table_region in tatr_tables:
|
1210
1214
|
try:
|
@@ -1212,48 +1216,70 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
|
1212
1216
|
if table_data: # Only add non-empty tables
|
1213
1217
|
extracted_tables.append(table_data)
|
1214
1218
|
except Exception as e:
|
1215
|
-
logger.warning(
|
1216
|
-
|
1219
|
+
logger.warning(
|
1220
|
+
f"Failed to extract table from TATR region {table_region.bbox}: {e}"
|
1221
|
+
)
|
1222
|
+
|
1217
1223
|
if extracted_tables:
|
1218
|
-
logger.debug(
|
1224
|
+
logger.debug(
|
1225
|
+
f"Page {self.number}: Successfully extracted {len(extracted_tables)} tables from TATR regions"
|
1226
|
+
)
|
1219
1227
|
return extracted_tables
|
1220
1228
|
else:
|
1221
|
-
logger.debug(
|
1229
|
+
logger.debug(
|
1230
|
+
f"Page {self.number}: TATR regions found but no tables extracted, falling back to pdfplumber"
|
1231
|
+
)
|
1222
1232
|
else:
|
1223
|
-
logger.debug(
|
1233
|
+
logger.debug(
|
1234
|
+
f"Page {self.number}: No TATR table regions found, using pdfplumber methods"
|
1235
|
+
)
|
1224
1236
|
except Exception as e:
|
1225
|
-
logger.debug(
|
1237
|
+
logger.debug(
|
1238
|
+
f"Page {self.number}: Error checking TATR regions: {e}, falling back to pdfplumber"
|
1239
|
+
)
|
1226
1240
|
|
1227
1241
|
# Auto-detect method if not specified (try lattice first, then stream)
|
1228
1242
|
if method is None:
|
1229
1243
|
logger.debug(f"Page {self.number}: Auto-detecting tables extraction method...")
|
1230
|
-
|
1244
|
+
|
1231
1245
|
# Try lattice first
|
1232
1246
|
try:
|
1233
1247
|
lattice_settings = table_settings.copy()
|
1234
1248
|
lattice_settings.setdefault("vertical_strategy", "lines")
|
1235
1249
|
lattice_settings.setdefault("horizontal_strategy", "lines")
|
1236
|
-
|
1250
|
+
|
1237
1251
|
logger.debug(f"Page {self.number}: Trying 'lattice' method first for tables...")
|
1238
1252
|
lattice_result = self._page.extract_tables(lattice_settings)
|
1239
|
-
|
1253
|
+
|
1240
1254
|
# Check if lattice found meaningful tables
|
1241
|
-
if (
|
1242
|
-
|
1243
|
-
|
1255
|
+
if (
|
1256
|
+
lattice_result
|
1257
|
+
and len(lattice_result) > 0
|
1258
|
+
and any(
|
1259
|
+
any(
|
1260
|
+
any(cell and cell.strip() for cell in row if cell)
|
1261
|
+
for row in table
|
1262
|
+
if table
|
1263
|
+
)
|
1264
|
+
for table in lattice_result
|
1265
|
+
)
|
1266
|
+
):
|
1267
|
+
logger.debug(
|
1268
|
+
f"Page {self.number}: 'lattice' method found {len(lattice_result)} tables"
|
1269
|
+
)
|
1244
1270
|
return lattice_result
|
1245
1271
|
else:
|
1246
1272
|
logger.debug(f"Page {self.number}: 'lattice' method found no meaningful tables")
|
1247
|
-
|
1273
|
+
|
1248
1274
|
except Exception as e:
|
1249
1275
|
logger.debug(f"Page {self.number}: 'lattice' method failed: {e}")
|
1250
|
-
|
1276
|
+
|
1251
1277
|
# Fall back to stream
|
1252
1278
|
logger.debug(f"Page {self.number}: Falling back to 'stream' method for tables...")
|
1253
1279
|
stream_settings = table_settings.copy()
|
1254
1280
|
stream_settings.setdefault("vertical_strategy", "text")
|
1255
1281
|
stream_settings.setdefault("horizontal_strategy", "text")
|
1256
|
-
|
1282
|
+
|
1257
1283
|
return self._page.extract_tables(stream_settings)
|
1258
1284
|
|
1259
1285
|
effective_method = method
|
@@ -1265,7 +1291,9 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
|
1265
1291
|
table_settings.setdefault("vertical_strategy", "text")
|
1266
1292
|
table_settings.setdefault("horizontal_strategy", "text")
|
1267
1293
|
elif effective_method == "lattice":
|
1268
|
-
logger.debug(
|
1294
|
+
logger.debug(
|
1295
|
+
"Using 'lattice' method alias for 'pdfplumber' with line-based strategies."
|
1296
|
+
)
|
1269
1297
|
effective_method = "pdfplumber"
|
1270
1298
|
table_settings.setdefault("vertical_strategy", "lines")
|
1271
1299
|
table_settings.setdefault("horizontal_strategy", "lines")
|
@@ -1555,6 +1583,14 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
|
1555
1583
|
Returns:
|
1556
1584
|
PIL Image of the page, or None if rendering fails.
|
1557
1585
|
"""
|
1586
|
+
# Apply global options as defaults, but allow explicit parameters to override
|
1587
|
+
import natural_pdf
|
1588
|
+
|
1589
|
+
# Use global options if parameters are not explicitly set
|
1590
|
+
if width is None:
|
1591
|
+
width = natural_pdf.options.image.width
|
1592
|
+
if resolution is None and natural_pdf.options.image.resolution is not None:
|
1593
|
+
resolution = natural_pdf.options.image.resolution
|
1558
1594
|
# 1. Create cache key (excluding path)
|
1559
1595
|
cache_key_parts = [
|
1560
1596
|
scale,
|
@@ -1572,19 +1608,23 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
|
1572
1608
|
if isinstance(v, list):
|
1573
1609
|
try:
|
1574
1610
|
v = tuple(v) # Convert lists to tuples
|
1575
|
-
except TypeError:
|
1611
|
+
except TypeError: # pragma: no cover
|
1576
1612
|
# If list contains unhashable items, fall back to repr or skip
|
1577
1613
|
# For simplicity, we'll try to proceed; hashing will fail if v remains unhashable
|
1578
|
-
logger.warning(
|
1614
|
+
logger.warning(
|
1615
|
+
f"Cache key generation: List item in kwargs['{k}'] could not be converted to tuple due to unhashable elements."
|
1616
|
+
)
|
1579
1617
|
sorted_kwargs_list.append((k, v))
|
1580
|
-
|
1618
|
+
|
1581
1619
|
cache_key_parts.append(tuple(sorted_kwargs_list))
|
1582
|
-
|
1620
|
+
|
1583
1621
|
try:
|
1584
1622
|
cache_key = tuple(cache_key_parts)
|
1585
|
-
except TypeError as e:
|
1586
|
-
logger.warning(
|
1587
|
-
|
1623
|
+
except TypeError as e: # pragma: no cover
|
1624
|
+
logger.warning(
|
1625
|
+
f"Page {self.index}: Could not create cache key for to_image due to unhashable item: {e}. Proceeding without cache for this call."
|
1626
|
+
)
|
1627
|
+
cache_key = None # Fallback to not using cache for this call
|
1588
1628
|
|
1589
1629
|
image_to_return: Optional[Image.Image] = None
|
1590
1630
|
|
@@ -1594,7 +1634,9 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
|
1594
1634
|
logger.debug(f"Page {self.index}: Returning cached image for key: {cache_key}")
|
1595
1635
|
else:
|
1596
1636
|
# --- This is the original logic to generate the image ---
|
1597
|
-
rendered_image_component: Optional[Image.Image] =
|
1637
|
+
rendered_image_component: Optional[Image.Image] = (
|
1638
|
+
None # Renamed from 'image' in original
|
1639
|
+
)
|
1598
1640
|
render_resolution = resolution if resolution is not None else scale * 72
|
1599
1641
|
thread_id = threading.current_thread().name
|
1600
1642
|
logger.debug(
|
@@ -1632,29 +1674,31 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
|
1632
1674
|
|
1633
1675
|
if rendered_image_component is None:
|
1634
1676
|
if cache_key is not None:
|
1635
|
-
self._to_image_cache[cache_key] = None
|
1677
|
+
self._to_image_cache[cache_key] = None # Cache the failure
|
1636
1678
|
# Save the image if path is provided (will try to save None, handled by PIL/OS)
|
1637
1679
|
if path:
|
1638
1680
|
try:
|
1639
1681
|
if os.path.dirname(path):
|
1640
1682
|
os.makedirs(os.path.dirname(path), exist_ok=True)
|
1641
|
-
if rendered_image_component is not None:
|
1642
|
-
|
1683
|
+
if rendered_image_component is not None: # Should be None here
|
1684
|
+
rendered_image_component.save(path) # This line won't be hit if None
|
1643
1685
|
# else: logger.debug("Not saving None image") # Not strictly needed
|
1644
|
-
except Exception as save_error:
|
1686
|
+
except Exception as save_error: # pragma: no cover
|
1645
1687
|
logger.error(f"Failed to save image to {path}: {save_error}")
|
1646
1688
|
return None
|
1647
1689
|
|
1648
1690
|
# --- Apply exclusion masking if requested ---
|
1649
1691
|
# This modifies 'rendered_image_component'
|
1650
|
-
image_after_masking = rendered_image_component
|
1692
|
+
image_after_masking = rendered_image_component # Start with the rendered image
|
1651
1693
|
if exclusions == "mask" and self._exclusions:
|
1652
1694
|
try:
|
1653
1695
|
# Ensure image is mutable (RGB or RGBA)
|
1654
1696
|
if image_after_masking.mode not in ("RGB", "RGBA"):
|
1655
1697
|
image_after_masking = image_after_masking.convert("RGB")
|
1656
1698
|
|
1657
|
-
exclusion_regions = self._get_exclusion_regions(
|
1699
|
+
exclusion_regions = self._get_exclusion_regions(
|
1700
|
+
include_callable=True, debug=False
|
1701
|
+
)
|
1658
1702
|
if exclusion_regions:
|
1659
1703
|
draw = ImageDraw.Draw(image_after_masking)
|
1660
1704
|
# Calculate the scaling factor used for the image
|
@@ -1676,12 +1720,12 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
|
1676
1720
|
)
|
1677
1721
|
if img_coords[0] < img_coords[2] and img_coords[1] < img_coords[3]:
|
1678
1722
|
draw.rectangle(img_coords, fill="white")
|
1679
|
-
else:
|
1723
|
+
else: # pragma: no cover
|
1680
1724
|
logger.warning(
|
1681
1725
|
f"Skipping invalid exclusion rect for masking: {img_coords}"
|
1682
1726
|
)
|
1683
1727
|
del draw # Release drawing context
|
1684
|
-
except Exception as mask_error:
|
1728
|
+
except Exception as mask_error: # pragma: no cover
|
1685
1729
|
logger.error(
|
1686
1730
|
f"Error applying exclusion mask to page {self.index}: {mask_error}",
|
1687
1731
|
exc_info=True,
|
@@ -1689,7 +1733,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
|
1689
1733
|
# Continue with potentially unmasked or partially masked image
|
1690
1734
|
|
1691
1735
|
# --- Resize the final image if width is provided ---
|
1692
|
-
image_final_content = image_after_masking
|
1736
|
+
image_final_content = image_after_masking # Start with image after masking
|
1693
1737
|
if width is not None and width > 0 and image_final_content.width > 0:
|
1694
1738
|
aspect_ratio = image_final_content.height / image_final_content.width
|
1695
1739
|
height = int(width * aspect_ratio)
|
@@ -1697,7 +1741,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
|
1697
1741
|
image_final_content = image_final_content.resize(
|
1698
1742
|
(width, height), Image.Resampling.LANCZOS
|
1699
1743
|
)
|
1700
|
-
except Exception as resize_error:
|
1744
|
+
except Exception as resize_error: # pragma: no cover
|
1701
1745
|
logger.warning(f"Could not resize image: {resize_error}")
|
1702
1746
|
# image_final_content remains the un-resized version if resize fails
|
1703
1747
|
|
@@ -1712,11 +1756,11 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
|
1712
1756
|
if path and image_to_return:
|
1713
1757
|
try:
|
1714
1758
|
# Ensure directory exists
|
1715
|
-
if os.path.dirname(path):
|
1759
|
+
if os.path.dirname(path): # Only call makedirs if there's a directory part
|
1716
1760
|
os.makedirs(os.path.dirname(path), exist_ok=True)
|
1717
1761
|
image_to_return.save(path)
|
1718
1762
|
logger.debug(f"Saved page image to: {path}")
|
1719
|
-
except Exception as save_error:
|
1763
|
+
except Exception as save_error: # pragma: no cover
|
1720
1764
|
logger.error(f"Failed to save image to {path}: {save_error}")
|
1721
1765
|
|
1722
1766
|
return image_to_return
|
@@ -1775,24 +1819,20 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
|
1775
1819
|
self._element_mgr.remove_ocr_elements()
|
1776
1820
|
|
1777
1821
|
logger.info(f"Page {self.number}: Delegating apply_ocr to PDF.apply_ocr.")
|
1778
|
-
|
1779
|
-
|
1780
|
-
|
1781
|
-
self.
|
1782
|
-
|
1783
|
-
|
1784
|
-
|
1785
|
-
|
1786
|
-
|
1787
|
-
|
1788
|
-
|
1789
|
-
|
1790
|
-
|
1791
|
-
|
1792
|
-
)
|
1793
|
-
except Exception as e:
|
1794
|
-
logger.error(f"Page {self.number}: Error during delegated OCR call: {e}", exc_info=True)
|
1795
|
-
return self # Return self for chaining
|
1822
|
+
# Delegate to parent PDF, targeting only this page's index
|
1823
|
+
# Pass all relevant parameters through, including apply_exclusions
|
1824
|
+
self._parent.apply_ocr(
|
1825
|
+
pages=[self.index],
|
1826
|
+
engine=engine,
|
1827
|
+
options=options,
|
1828
|
+
languages=languages,
|
1829
|
+
min_confidence=min_confidence,
|
1830
|
+
device=device,
|
1831
|
+
resolution=resolution,
|
1832
|
+
detect_only=detect_only,
|
1833
|
+
apply_exclusions=apply_exclusions,
|
1834
|
+
replace=replace, # Pass the replace parameter to PDF.apply_ocr
|
1835
|
+
)
|
1796
1836
|
|
1797
1837
|
# Return self for chaining
|
1798
1838
|
return self
|
@@ -2313,14 +2353,14 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
|
2313
2353
|
self,
|
2314
2354
|
# elements_to_render: Optional[List['Element']] = None, # No longer needed, from_page handles it
|
2315
2355
|
# include_source_types: List[str] = ['word', 'line', 'rect', 'region'] # No longer needed
|
2316
|
-
) -> Optional["
|
2356
|
+
) -> Optional["InteractiveViewerWidget"]: # Return type hint updated
|
2317
2357
|
"""
|
2318
2358
|
Creates and returns an interactive ipywidget for exploring elements on this page.
|
2319
2359
|
|
2320
|
-
Uses
|
2360
|
+
Uses InteractiveViewerWidget.from_page() to create the viewer.
|
2321
2361
|
|
2322
2362
|
Returns:
|
2323
|
-
A
|
2363
|
+
A InteractiveViewerWidget instance ready for display in Jupyter,
|
2324
2364
|
or None if ipywidgets is not installed or widget creation fails.
|
2325
2365
|
|
2326
2366
|
Raises:
|
@@ -2329,18 +2369,18 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
|
2329
2369
|
ValueError: If image rendering or data preparation fails within from_page.
|
2330
2370
|
"""
|
2331
2371
|
# Check for availability using the imported flag and class variable
|
2332
|
-
if not _IPYWIDGETS_AVAILABLE or
|
2372
|
+
if not _IPYWIDGETS_AVAILABLE or InteractiveViewerWidget is None:
|
2333
2373
|
logger.error(
|
2334
|
-
"Interactive viewer requires
|
2335
|
-
|
2374
|
+
"Interactive viewer requires 'ipywidgets'. "
|
2375
|
+
'Please install with: pip install "ipywidgets>=7.0.0,<10.0.0"'
|
2336
2376
|
)
|
2337
2377
|
# raise ImportError("ipywidgets not found.") # Option 1: Raise error
|
2338
2378
|
return None # Option 2: Return None gracefully
|
2339
2379
|
|
2340
|
-
# If we reach here,
|
2380
|
+
# If we reach here, InteractiveViewerWidget should be the actual class
|
2341
2381
|
try:
|
2342
2382
|
# Pass self (the Page object) to the factory method
|
2343
|
-
return
|
2383
|
+
return InteractiveViewerWidget.from_page(self)
|
2344
2384
|
except Exception as e:
|
2345
2385
|
# Catch potential errors during widget creation (e.g., image rendering)
|
2346
2386
|
logger.error(
|
@@ -2440,9 +2480,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
|
2440
2480
|
f"Page {self.number}: Starting OCR correction with callback '{correction_callback.__name__}' (max_workers={max_workers})"
|
2441
2481
|
)
|
2442
2482
|
|
2443
|
-
target_elements_collection = self.find_all(
|
2444
|
-
selector=selector, apply_exclusions=False
|
2445
|
-
)
|
2483
|
+
target_elements_collection = self.find_all(selector=selector, apply_exclusions=False)
|
2446
2484
|
target_elements = target_elements_collection.elements # Get the list
|
2447
2485
|
|
2448
2486
|
if not target_elements:
|
@@ -2451,7 +2489,12 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
|
2451
2489
|
|
2452
2490
|
element_pbar = None
|
2453
2491
|
try:
|
2454
|
-
element_pbar = tqdm(
|
2492
|
+
element_pbar = tqdm(
|
2493
|
+
total=len(target_elements),
|
2494
|
+
desc=f"Correcting OCR Page {self.number}",
|
2495
|
+
unit="element",
|
2496
|
+
leave=False,
|
2497
|
+
)
|
2455
2498
|
|
2456
2499
|
processed_count = 0
|
2457
2500
|
updated_count = 0
|
natural_pdf/core/pdf.py
CHANGED
@@ -24,6 +24,7 @@ from typing import (
|
|
24
24
|
|
25
25
|
import pdfplumber
|
26
26
|
from PIL import Image
|
27
|
+
from tqdm.auto import tqdm
|
27
28
|
|
28
29
|
from natural_pdf.analyzers.layout.layout_manager import LayoutManager
|
29
30
|
from natural_pdf.classification.manager import ClassificationError, ClassificationManager
|
@@ -38,7 +39,6 @@ from natural_pdf.extraction.mixin import ExtractionMixin
|
|
38
39
|
from natural_pdf.ocr import OCRManager, OCROptions
|
39
40
|
from natural_pdf.selectors.parser import parse_selector
|
40
41
|
from natural_pdf.utils.locks import pdf_render_lock
|
41
|
-
from tqdm.auto import tqdm
|
42
42
|
|
43
43
|
try:
|
44
44
|
from typing import Any as TypingAny
|
@@ -307,7 +307,6 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
307
307
|
) -> "PDF":
|
308
308
|
"""
|
309
309
|
Applies OCR to specified pages of the PDF using batch processing.
|
310
|
-
Applies OCR to specified pages of the PDF using batch processing.
|
311
310
|
|
312
311
|
Args:
|
313
312
|
engine: Name of the OCR engine
|
@@ -320,25 +319,27 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
320
319
|
replace: Whether to replace existing OCR elements
|
321
320
|
options: Engine-specific options
|
322
321
|
pages: Page indices to process or None for all pages
|
323
|
-
engine: Name of the OCR engine
|
324
|
-
languages: List of language codes
|
325
|
-
min_confidence: Minimum confidence threshold
|
326
|
-
device: Device to run OCR on
|
327
|
-
resolution: DPI resolution for page images
|
328
|
-
apply_exclusions: Whether to mask excluded areas
|
329
|
-
detect_only: If True, only detect text boxes
|
330
|
-
replace: Whether to replace existing OCR elements
|
331
|
-
options: Engine-specific options
|
332
|
-
pages: Page indices to process or None for all pages
|
333
322
|
|
334
323
|
Returns:
|
335
324
|
Self for method chaining
|
336
|
-
Self for method chaining
|
337
325
|
"""
|
338
326
|
if not self._ocr_manager:
|
339
327
|
logger.error("OCRManager not available. Cannot apply OCR.")
|
340
328
|
return self
|
341
329
|
|
330
|
+
# Apply global options as defaults, but allow explicit parameters to override
|
331
|
+
import natural_pdf
|
332
|
+
|
333
|
+
# Use global OCR options if parameters are not explicitly set
|
334
|
+
if engine is None:
|
335
|
+
engine = natural_pdf.options.ocr.engine
|
336
|
+
if languages is None:
|
337
|
+
languages = natural_pdf.options.ocr.languages
|
338
|
+
if min_confidence is None:
|
339
|
+
min_confidence = natural_pdf.options.ocr.min_confidence
|
340
|
+
if device is None:
|
341
|
+
pass # No default device in options.ocr anymore
|
342
|
+
|
342
343
|
thread_id = threading.current_thread().name
|
343
344
|
logger.debug(f"[{thread_id}] PDF.apply_ocr starting for {self.path}")
|
344
345
|
|
@@ -425,18 +426,14 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
425
426
|
logger.info(f"[{thread_id}] Calling OCR Manager with args: {ocr_call_args}...")
|
426
427
|
ocr_start_time = time.monotonic()
|
427
428
|
|
428
|
-
|
429
|
-
batch_results = self._ocr_manager.apply_ocr(**manager_args)
|
430
|
-
|
431
|
-
if not isinstance(batch_results, list) or len(batch_results) != len(images_pil):
|
432
|
-
logger.error(f"OCR Manager returned unexpected result format or length.")
|
433
|
-
return self
|
429
|
+
batch_results = self._ocr_manager.apply_ocr(**manager_args)
|
434
430
|
|
435
|
-
|
436
|
-
|
437
|
-
logger.error(f"Batch OCR processing failed: {e}")
|
431
|
+
if not isinstance(batch_results, list) or len(batch_results) != len(images_pil):
|
432
|
+
logger.error(f"OCR Manager returned unexpected result format or length.")
|
438
433
|
return self
|
439
434
|
|
435
|
+
logger.info("OCR Manager batch processing complete.")
|
436
|
+
|
440
437
|
ocr_end_time = time.monotonic()
|
441
438
|
logger.debug(
|
442
439
|
f"[{thread_id}] OCR processing finished (Duration: {ocr_end_time - ocr_start_time:.2f}s)"
|
@@ -0,0 +1,21 @@
|
|
1
|
+
"""
|
2
|
+
Describe functionality for natural-pdf.
|
3
|
+
|
4
|
+
Provides summary and inspection methods for pages, collections, and regions.
|
5
|
+
"""
|
6
|
+
|
7
|
+
from .base import describe_page, describe_collection, inspect_collection, describe_region, describe_element
|
8
|
+
from .summary import ElementSummary, InspectionSummary
|
9
|
+
from .mixin import DescribeMixin, InspectMixin
|
10
|
+
|
11
|
+
__all__ = [
|
12
|
+
'describe_page',
|
13
|
+
'describe_collection',
|
14
|
+
'inspect_collection',
|
15
|
+
'describe_region',
|
16
|
+
'describe_element',
|
17
|
+
'ElementSummary',
|
18
|
+
'InspectionSummary',
|
19
|
+
'DescribeMixin',
|
20
|
+
'InspectMixin'
|
21
|
+
]
|