natural-pdf: natural_pdf-0.1.14-py3-none-any.whl → natural_pdf-0.1.16-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +31 -0
- natural_pdf/analyzers/layout/gemini.py +137 -162
- natural_pdf/analyzers/layout/layout_manager.py +9 -5
- natural_pdf/analyzers/layout/layout_options.py +77 -7
- natural_pdf/analyzers/layout/paddle.py +318 -165
- natural_pdf/analyzers/layout/table_structure_utils.py +78 -0
- natural_pdf/analyzers/shape_detection_mixin.py +770 -405
- natural_pdf/classification/mixin.py +2 -8
- natural_pdf/collections/pdf_collection.py +25 -30
- natural_pdf/core/highlighting_service.py +47 -32
- natural_pdf/core/page.py +226 -70
- natural_pdf/core/pdf.py +19 -22
- natural_pdf/elements/base.py +9 -9
- natural_pdf/elements/collections.py +105 -50
- natural_pdf/elements/region.py +320 -113
- natural_pdf/exporters/paddleocr.py +38 -13
- natural_pdf/flows/__init__.py +3 -3
- natural_pdf/flows/collections.py +303 -132
- natural_pdf/flows/element.py +277 -132
- natural_pdf/flows/flow.py +33 -16
- natural_pdf/flows/region.py +142 -79
- natural_pdf/ocr/engine_doctr.py +37 -4
- natural_pdf/ocr/engine_easyocr.py +23 -3
- natural_pdf/ocr/engine_paddle.py +281 -30
- natural_pdf/ocr/engine_surya.py +8 -3
- natural_pdf/ocr/ocr_manager.py +75 -76
- natural_pdf/ocr/ocr_options.py +52 -87
- natural_pdf/search/__init__.py +25 -12
- natural_pdf/search/lancedb_search_service.py +91 -54
- natural_pdf/search/numpy_search_service.py +86 -65
- natural_pdf/search/searchable_mixin.py +2 -2
- natural_pdf/selectors/parser.py +125 -81
- natural_pdf/widgets/__init__.py +1 -1
- natural_pdf/widgets/viewer.py +205 -449
- {natural_pdf-0.1.14.dist-info → natural_pdf-0.1.16.dist-info}/METADATA +27 -45
- {natural_pdf-0.1.14.dist-info → natural_pdf-0.1.16.dist-info}/RECORD +39 -38
- {natural_pdf-0.1.14.dist-info → natural_pdf-0.1.16.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.14.dist-info → natural_pdf-0.1.16.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.14.dist-info → natural_pdf-0.1.16.dist-info}/top_level.txt +0 -0
natural_pdf/core/page.py
CHANGED
@@ -51,6 +51,9 @@ from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_t
|
|
51
51
|
from natural_pdf.analyzers.layout.layout_analyzer import LayoutAnalyzer
|
52
52
|
from natural_pdf.analyzers.layout.layout_manager import LayoutManager
|
53
53
|
from natural_pdf.analyzers.layout.layout_options import LayoutOptions
|
54
|
+
|
55
|
+
# --- Shape Detection Mixin --- #
|
56
|
+
from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
|
54
57
|
from natural_pdf.analyzers.text_options import TextStyleOptions
|
55
58
|
from natural_pdf.analyzers.text_structure import TextStyleAnalyzer
|
56
59
|
from natural_pdf.classification.manager import ClassificationManager # For type hint
|
@@ -68,14 +71,12 @@ from natural_pdf.utils.locks import pdf_render_lock # Import the lock
|
|
68
71
|
|
69
72
|
# # Import new utils
|
70
73
|
from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
|
71
|
-
from natural_pdf.widgets import InteractiveViewerWidget
|
72
|
-
from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, SimpleInteractiveViewerWidget
|
74
|
+
from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, InteractiveViewerWidget
|
73
75
|
|
74
76
|
# --- End Classification Imports --- #
|
75
77
|
|
76
78
|
|
77
|
-
|
78
|
-
from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
|
79
|
+
|
79
80
|
# --- End Shape Detection Mixin --- #
|
80
81
|
|
81
82
|
|
@@ -667,13 +668,13 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
|
667
668
|
if selector_obj.get("type") == "or":
|
668
669
|
# For OR selectors, search all elements and let the filter function decide
|
669
670
|
elements_to_search = self._element_mgr.get_all_elements()
|
670
|
-
|
671
|
+
|
671
672
|
# Create filter function from compound selector
|
672
673
|
filter_func = selector_to_filter_func(selector_obj, **kwargs)
|
673
|
-
|
674
|
+
|
674
675
|
# Apply the filter to all elements
|
675
676
|
matching_elements = [element for element in elements_to_search if filter_func(element)]
|
676
|
-
|
677
|
+
|
677
678
|
# Sort elements in reading order if requested
|
678
679
|
if kwargs.get("reading_order", True):
|
679
680
|
if all(hasattr(el, "top") and hasattr(el, "x0") for el in matching_elements):
|
@@ -682,7 +683,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
|
682
683
|
logger.warning(
|
683
684
|
"Cannot sort elements in reading order: Missing required attributes (top, x0)."
|
684
685
|
)
|
685
|
-
|
686
|
+
|
686
687
|
# Return result collection
|
687
688
|
return ElementCollection(matching_elements)
|
688
689
|
|
@@ -1138,31 +1139,171 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
|
1138
1139
|
logger.debug(f"Page {self.number}: extract_text finished, result length: {len(result)}.")
|
1139
1140
|
return result
|
1140
1141
|
|
1141
|
-
def extract_table(
|
1142
|
+
def extract_table(
|
1143
|
+
self,
|
1144
|
+
method: Optional[str] = None,
|
1145
|
+
table_settings: Optional[dict] = None,
|
1146
|
+
use_ocr: bool = False,
|
1147
|
+
ocr_config: Optional[dict] = None,
|
1148
|
+
text_options: Optional[Dict] = None,
|
1149
|
+
cell_extraction_func: Optional[Callable[["Region"], Optional[str]]] = None,
|
1150
|
+
show_progress: bool = False,
|
1151
|
+
) -> List[List[Optional[str]]]:
|
1142
1152
|
"""
|
1143
|
-
Extract the largest table from this page.
|
1153
|
+
Extract the largest table from this page using enhanced region-based extraction.
|
1144
1154
|
|
1145
1155
|
Args:
|
1146
|
-
|
1156
|
+
method: Method to use: 'tatr', 'pdfplumber', 'text', 'stream', 'lattice', or None (auto-detect).
|
1157
|
+
table_settings: Settings for pdfplumber table extraction.
|
1158
|
+
use_ocr: Whether to use OCR for text extraction (currently only applicable with 'tatr' method).
|
1159
|
+
ocr_config: OCR configuration parameters.
|
1160
|
+
text_options: Dictionary of options for the 'text' method.
|
1161
|
+
cell_extraction_func: Optional callable function that takes a cell Region object
|
1162
|
+
and returns its string content. For 'text' method only.
|
1163
|
+
show_progress: If True, display a progress bar during cell text extraction for the 'text' method.
|
1147
1164
|
|
1148
1165
|
Returns:
|
1149
|
-
|
1166
|
+
Table data as a list of rows, where each row is a list of cell values (str or None).
|
1150
1167
|
"""
|
1151
|
-
#
|
1152
|
-
|
1168
|
+
# Create a full-page region and delegate to its enhanced extract_table method
|
1169
|
+
page_region = self.create_region(0, 0, self.width, self.height)
|
1170
|
+
return page_region.extract_table(
|
1171
|
+
method=method,
|
1172
|
+
table_settings=table_settings,
|
1173
|
+
use_ocr=use_ocr,
|
1174
|
+
ocr_config=ocr_config,
|
1175
|
+
text_options=text_options,
|
1176
|
+
cell_extraction_func=cell_extraction_func,
|
1177
|
+
show_progress=show_progress,
|
1178
|
+
)
|
1153
1179
|
|
1154
|
-
def extract_tables(
|
1180
|
+
def extract_tables(
|
1181
|
+
self,
|
1182
|
+
method: Optional[str] = None,
|
1183
|
+
table_settings: Optional[dict] = None,
|
1184
|
+
check_tatr: bool = True,
|
1185
|
+
) -> List[List[List[str]]]:
|
1155
1186
|
"""
|
1156
|
-
Extract tables from this page.
|
1187
|
+
Extract all tables from this page with enhanced method support.
|
1157
1188
|
|
1158
1189
|
Args:
|
1159
|
-
|
1190
|
+
method: Method to use: 'pdfplumber', 'stream', 'lattice', or None (auto-detect).
|
1191
|
+
'stream' uses text-based strategies, 'lattice' uses line-based strategies.
|
1192
|
+
Note: 'tatr' and 'text' methods are not supported for extract_tables.
|
1193
|
+
table_settings: Settings for pdfplumber table extraction.
|
1194
|
+
check_tatr: If True (default), first check for TATR-detected table regions
|
1195
|
+
and extract from those before falling back to pdfplumber methods.
|
1160
1196
|
|
1161
1197
|
Returns:
|
1162
|
-
List of
|
1198
|
+
List of tables, where each table is a list of rows, and each row is a list of cell values.
|
1163
1199
|
"""
|
1164
|
-
|
1165
|
-
|
1200
|
+
if table_settings is None:
|
1201
|
+
table_settings = {}
|
1202
|
+
|
1203
|
+
# Check for TATR-detected table regions first if enabled
|
1204
|
+
if check_tatr:
|
1205
|
+
try:
|
1206
|
+
tatr_tables = self.find_all("region[type=table][model=tatr]")
|
1207
|
+
if tatr_tables:
|
1208
|
+
logger.debug(
|
1209
|
+
f"Page {self.number}: Found {len(tatr_tables)} TATR table regions, extracting from those..."
|
1210
|
+
)
|
1211
|
+
extracted_tables = []
|
1212
|
+
for table_region in tatr_tables:
|
1213
|
+
try:
|
1214
|
+
table_data = table_region.extract_table(method="tatr")
|
1215
|
+
if table_data: # Only add non-empty tables
|
1216
|
+
extracted_tables.append(table_data)
|
1217
|
+
except Exception as e:
|
1218
|
+
logger.warning(
|
1219
|
+
f"Failed to extract table from TATR region {table_region.bbox}: {e}"
|
1220
|
+
)
|
1221
|
+
|
1222
|
+
if extracted_tables:
|
1223
|
+
logger.debug(
|
1224
|
+
f"Page {self.number}: Successfully extracted {len(extracted_tables)} tables from TATR regions"
|
1225
|
+
)
|
1226
|
+
return extracted_tables
|
1227
|
+
else:
|
1228
|
+
logger.debug(
|
1229
|
+
f"Page {self.number}: TATR regions found but no tables extracted, falling back to pdfplumber"
|
1230
|
+
)
|
1231
|
+
else:
|
1232
|
+
logger.debug(
|
1233
|
+
f"Page {self.number}: No TATR table regions found, using pdfplumber methods"
|
1234
|
+
)
|
1235
|
+
except Exception as e:
|
1236
|
+
logger.debug(
|
1237
|
+
f"Page {self.number}: Error checking TATR regions: {e}, falling back to pdfplumber"
|
1238
|
+
)
|
1239
|
+
|
1240
|
+
# Auto-detect method if not specified (try lattice first, then stream)
|
1241
|
+
if method is None:
|
1242
|
+
logger.debug(f"Page {self.number}: Auto-detecting tables extraction method...")
|
1243
|
+
|
1244
|
+
# Try lattice first
|
1245
|
+
try:
|
1246
|
+
lattice_settings = table_settings.copy()
|
1247
|
+
lattice_settings.setdefault("vertical_strategy", "lines")
|
1248
|
+
lattice_settings.setdefault("horizontal_strategy", "lines")
|
1249
|
+
|
1250
|
+
logger.debug(f"Page {self.number}: Trying 'lattice' method first for tables...")
|
1251
|
+
lattice_result = self._page.extract_tables(lattice_settings)
|
1252
|
+
|
1253
|
+
# Check if lattice found meaningful tables
|
1254
|
+
if (
|
1255
|
+
lattice_result
|
1256
|
+
and len(lattice_result) > 0
|
1257
|
+
and any(
|
1258
|
+
any(
|
1259
|
+
any(cell and cell.strip() for cell in row if cell)
|
1260
|
+
for row in table
|
1261
|
+
if table
|
1262
|
+
)
|
1263
|
+
for table in lattice_result
|
1264
|
+
)
|
1265
|
+
):
|
1266
|
+
logger.debug(
|
1267
|
+
f"Page {self.number}: 'lattice' method found {len(lattice_result)} tables"
|
1268
|
+
)
|
1269
|
+
return lattice_result
|
1270
|
+
else:
|
1271
|
+
logger.debug(f"Page {self.number}: 'lattice' method found no meaningful tables")
|
1272
|
+
|
1273
|
+
except Exception as e:
|
1274
|
+
logger.debug(f"Page {self.number}: 'lattice' method failed: {e}")
|
1275
|
+
|
1276
|
+
# Fall back to stream
|
1277
|
+
logger.debug(f"Page {self.number}: Falling back to 'stream' method for tables...")
|
1278
|
+
stream_settings = table_settings.copy()
|
1279
|
+
stream_settings.setdefault("vertical_strategy", "text")
|
1280
|
+
stream_settings.setdefault("horizontal_strategy", "text")
|
1281
|
+
|
1282
|
+
return self._page.extract_tables(stream_settings)
|
1283
|
+
|
1284
|
+
effective_method = method
|
1285
|
+
|
1286
|
+
# Handle method aliases
|
1287
|
+
if effective_method == "stream":
|
1288
|
+
logger.debug("Using 'stream' method alias for 'pdfplumber' with text-based strategies.")
|
1289
|
+
effective_method = "pdfplumber"
|
1290
|
+
table_settings.setdefault("vertical_strategy", "text")
|
1291
|
+
table_settings.setdefault("horizontal_strategy", "text")
|
1292
|
+
elif effective_method == "lattice":
|
1293
|
+
logger.debug(
|
1294
|
+
"Using 'lattice' method alias for 'pdfplumber' with line-based strategies."
|
1295
|
+
)
|
1296
|
+
effective_method = "pdfplumber"
|
1297
|
+
table_settings.setdefault("vertical_strategy", "lines")
|
1298
|
+
table_settings.setdefault("horizontal_strategy", "lines")
|
1299
|
+
|
1300
|
+
# Use the selected method
|
1301
|
+
if effective_method == "pdfplumber":
|
1302
|
+
return self._page.extract_tables(table_settings)
|
1303
|
+
else:
|
1304
|
+
raise ValueError(
|
1305
|
+
f"Unknown tables extraction method: '{method}'. Choose from 'pdfplumber', 'stream', 'lattice'."
|
1306
|
+
)
|
1166
1307
|
|
1167
1308
|
def _load_elements(self):
|
1168
1309
|
"""Load all elements from the page via ElementManager."""
|
@@ -1441,6 +1582,14 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
|
1441
1582
|
Returns:
|
1442
1583
|
PIL Image of the page, or None if rendering fails.
|
1443
1584
|
"""
|
1585
|
+
# Apply global options as defaults, but allow explicit parameters to override
|
1586
|
+
import natural_pdf
|
1587
|
+
|
1588
|
+
# Use global options if parameters are not explicitly set
|
1589
|
+
if width is None:
|
1590
|
+
width = natural_pdf.options.image.width
|
1591
|
+
if resolution is None and natural_pdf.options.image.resolution is not None:
|
1592
|
+
resolution = natural_pdf.options.image.resolution
|
1444
1593
|
# 1. Create cache key (excluding path)
|
1445
1594
|
cache_key_parts = [
|
1446
1595
|
scale,
|
@@ -1458,19 +1607,23 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
|
1458
1607
|
if isinstance(v, list):
|
1459
1608
|
try:
|
1460
1609
|
v = tuple(v) # Convert lists to tuples
|
1461
|
-
except TypeError:
|
1610
|
+
except TypeError: # pragma: no cover
|
1462
1611
|
# If list contains unhashable items, fall back to repr or skip
|
1463
1612
|
# For simplicity, we'll try to proceed; hashing will fail if v remains unhashable
|
1464
|
-
logger.warning(
|
1613
|
+
logger.warning(
|
1614
|
+
f"Cache key generation: List item in kwargs['{k}'] could not be converted to tuple due to unhashable elements."
|
1615
|
+
)
|
1465
1616
|
sorted_kwargs_list.append((k, v))
|
1466
|
-
|
1617
|
+
|
1467
1618
|
cache_key_parts.append(tuple(sorted_kwargs_list))
|
1468
|
-
|
1619
|
+
|
1469
1620
|
try:
|
1470
1621
|
cache_key = tuple(cache_key_parts)
|
1471
|
-
except TypeError as e:
|
1472
|
-
logger.warning(
|
1473
|
-
|
1622
|
+
except TypeError as e: # pragma: no cover
|
1623
|
+
logger.warning(
|
1624
|
+
f"Page {self.index}: Could not create cache key for to_image due to unhashable item: {e}. Proceeding without cache for this call."
|
1625
|
+
)
|
1626
|
+
cache_key = None # Fallback to not using cache for this call
|
1474
1627
|
|
1475
1628
|
image_to_return: Optional[Image.Image] = None
|
1476
1629
|
|
@@ -1480,7 +1633,9 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
|
1480
1633
|
logger.debug(f"Page {self.index}: Returning cached image for key: {cache_key}")
|
1481
1634
|
else:
|
1482
1635
|
# --- This is the original logic to generate the image ---
|
1483
|
-
rendered_image_component: Optional[Image.Image] =
|
1636
|
+
rendered_image_component: Optional[Image.Image] = (
|
1637
|
+
None # Renamed from 'image' in original
|
1638
|
+
)
|
1484
1639
|
render_resolution = resolution if resolution is not None else scale * 72
|
1485
1640
|
thread_id = threading.current_thread().name
|
1486
1641
|
logger.debug(
|
@@ -1518,29 +1673,31 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
|
1518
1673
|
|
1519
1674
|
if rendered_image_component is None:
|
1520
1675
|
if cache_key is not None:
|
1521
|
-
self._to_image_cache[cache_key] = None
|
1676
|
+
self._to_image_cache[cache_key] = None # Cache the failure
|
1522
1677
|
# If a path was given, create its parent directory; the None image itself is never saved (guarded below)
|
1523
1678
|
if path:
|
1524
1679
|
try:
|
1525
1680
|
if os.path.dirname(path):
|
1526
1681
|
os.makedirs(os.path.dirname(path), exist_ok=True)
|
1527
|
-
if rendered_image_component is not None:
|
1528
|
-
|
1682
|
+
if rendered_image_component is not None: # Should be None here
|
1683
|
+
rendered_image_component.save(path) # This line won't be hit if None
|
1529
1684
|
# else: logger.debug("Not saving None image") # Not strictly needed
|
1530
|
-
except Exception as save_error:
|
1685
|
+
except Exception as save_error: # pragma: no cover
|
1531
1686
|
logger.error(f"Failed to save image to {path}: {save_error}")
|
1532
1687
|
return None
|
1533
1688
|
|
1534
1689
|
# --- Apply exclusion masking if requested ---
|
1535
1690
|
# This modifies 'rendered_image_component'
|
1536
|
-
image_after_masking = rendered_image_component
|
1691
|
+
image_after_masking = rendered_image_component # Start with the rendered image
|
1537
1692
|
if exclusions == "mask" and self._exclusions:
|
1538
1693
|
try:
|
1539
1694
|
# Ensure image is mutable (RGB or RGBA)
|
1540
1695
|
if image_after_masking.mode not in ("RGB", "RGBA"):
|
1541
1696
|
image_after_masking = image_after_masking.convert("RGB")
|
1542
1697
|
|
1543
|
-
exclusion_regions = self._get_exclusion_regions(
|
1698
|
+
exclusion_regions = self._get_exclusion_regions(
|
1699
|
+
include_callable=True, debug=False
|
1700
|
+
)
|
1544
1701
|
if exclusion_regions:
|
1545
1702
|
draw = ImageDraw.Draw(image_after_masking)
|
1546
1703
|
# Calculate the scaling factor used for the image
|
@@ -1562,12 +1719,12 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
|
1562
1719
|
)
|
1563
1720
|
if img_coords[0] < img_coords[2] and img_coords[1] < img_coords[3]:
|
1564
1721
|
draw.rectangle(img_coords, fill="white")
|
1565
|
-
else:
|
1722
|
+
else: # pragma: no cover
|
1566
1723
|
logger.warning(
|
1567
1724
|
f"Skipping invalid exclusion rect for masking: {img_coords}"
|
1568
1725
|
)
|
1569
1726
|
del draw # Release drawing context
|
1570
|
-
except Exception as mask_error:
|
1727
|
+
except Exception as mask_error: # pragma: no cover
|
1571
1728
|
logger.error(
|
1572
1729
|
f"Error applying exclusion mask to page {self.index}: {mask_error}",
|
1573
1730
|
exc_info=True,
|
@@ -1575,7 +1732,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
|
1575
1732
|
# Continue with potentially unmasked or partially masked image
|
1576
1733
|
|
1577
1734
|
# --- Resize the final image if width is provided ---
|
1578
|
-
image_final_content = image_after_masking
|
1735
|
+
image_final_content = image_after_masking # Start with image after masking
|
1579
1736
|
if width is not None and width > 0 and image_final_content.width > 0:
|
1580
1737
|
aspect_ratio = image_final_content.height / image_final_content.width
|
1581
1738
|
height = int(width * aspect_ratio)
|
@@ -1583,7 +1740,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
|
1583
1740
|
image_final_content = image_final_content.resize(
|
1584
1741
|
(width, height), Image.Resampling.LANCZOS
|
1585
1742
|
)
|
1586
|
-
except Exception as resize_error:
|
1743
|
+
except Exception as resize_error: # pragma: no cover
|
1587
1744
|
logger.warning(f"Could not resize image: {resize_error}")
|
1588
1745
|
# image_final_content remains the un-resized version if resize fails
|
1589
1746
|
|
@@ -1598,11 +1755,11 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
|
1598
1755
|
if path and image_to_return:
|
1599
1756
|
try:
|
1600
1757
|
# Ensure directory exists
|
1601
|
-
if os.path.dirname(path):
|
1758
|
+
if os.path.dirname(path): # Only call makedirs if there's a directory part
|
1602
1759
|
os.makedirs(os.path.dirname(path), exist_ok=True)
|
1603
1760
|
image_to_return.save(path)
|
1604
1761
|
logger.debug(f"Saved page image to: {path}")
|
1605
|
-
except Exception as save_error:
|
1762
|
+
except Exception as save_error: # pragma: no cover
|
1606
1763
|
logger.error(f"Failed to save image to {path}: {save_error}")
|
1607
1764
|
|
1608
1765
|
return image_to_return
|
@@ -1661,24 +1818,20 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
|
1661
1818
|
self._element_mgr.remove_ocr_elements()
|
1662
1819
|
|
1663
1820
|
logger.info(f"Page {self.number}: Delegating apply_ocr to PDF.apply_ocr.")
|
1664
|
-
|
1665
|
-
|
1666
|
-
|
1667
|
-
self.
|
1668
|
-
|
1669
|
-
|
1670
|
-
|
1671
|
-
|
1672
|
-
|
1673
|
-
|
1674
|
-
|
1675
|
-
|
1676
|
-
|
1677
|
-
|
1678
|
-
)
|
1679
|
-
except Exception as e:
|
1680
|
-
logger.error(f"Page {self.number}: Error during delegated OCR call: {e}", exc_info=True)
|
1681
|
-
return self # Return self for chaining
|
1821
|
+
# Delegate to parent PDF, targeting only this page's index
|
1822
|
+
# Pass all relevant parameters through, including apply_exclusions
|
1823
|
+
self._parent.apply_ocr(
|
1824
|
+
pages=[self.index],
|
1825
|
+
engine=engine,
|
1826
|
+
options=options,
|
1827
|
+
languages=languages,
|
1828
|
+
min_confidence=min_confidence,
|
1829
|
+
device=device,
|
1830
|
+
resolution=resolution,
|
1831
|
+
detect_only=detect_only,
|
1832
|
+
apply_exclusions=apply_exclusions,
|
1833
|
+
replace=replace, # Pass the replace parameter to PDF.apply_ocr
|
1834
|
+
)
|
1682
1835
|
|
1683
1836
|
# Return self for chaining
|
1684
1837
|
return self
|
@@ -2199,14 +2352,14 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
|
2199
2352
|
self,
|
2200
2353
|
# elements_to_render: Optional[List['Element']] = None, # No longer needed, from_page handles it
|
2201
2354
|
# include_source_types: List[str] = ['word', 'line', 'rect', 'region'] # No longer needed
|
2202
|
-
) -> Optional["
|
2355
|
+
) -> Optional["InteractiveViewerWidget"]: # Return type hint updated
|
2203
2356
|
"""
|
2204
2357
|
Creates and returns an interactive ipywidget for exploring elements on this page.
|
2205
2358
|
|
2206
|
-
Uses
|
2359
|
+
Uses InteractiveViewerWidget.from_page() to create the viewer.
|
2207
2360
|
|
2208
2361
|
Returns:
|
2209
|
-
A
|
2362
|
+
An InteractiveViewerWidget instance ready for display in Jupyter,
|
2210
2363
|
or None if ipywidgets is not installed or widget creation fails.
|
2211
2364
|
|
2212
2365
|
Raises:
|
@@ -2215,18 +2368,18 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
|
2215
2368
|
ValueError: If image rendering or data preparation fails within from_page.
|
2216
2369
|
"""
|
2217
2370
|
# Check for availability using the imported flag and class variable
|
2218
|
-
if not _IPYWIDGETS_AVAILABLE or
|
2371
|
+
if not _IPYWIDGETS_AVAILABLE or InteractiveViewerWidget is None:
|
2219
2372
|
logger.error(
|
2220
|
-
"Interactive viewer requires
|
2221
|
-
|
2373
|
+
"Interactive viewer requires 'ipywidgets'. "
|
2374
|
+
'Please install with: pip install "ipywidgets>=7.0.0,<10.0.0"'
|
2222
2375
|
)
|
2223
2376
|
# raise ImportError("ipywidgets not found.") # Option 1: Raise error
|
2224
2377
|
return None # Option 2: Return None gracefully
|
2225
2378
|
|
2226
|
-
# If we reach here,
|
2379
|
+
# If we reach here, InteractiveViewerWidget should be the actual class
|
2227
2380
|
try:
|
2228
2381
|
# Pass self (the Page object) to the factory method
|
2229
|
-
return
|
2382
|
+
return InteractiveViewerWidget.from_page(self)
|
2230
2383
|
except Exception as e:
|
2231
2384
|
# Catch potential errors during widget creation (e.g., image rendering)
|
2232
2385
|
logger.error(
|
@@ -2326,9 +2479,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
|
2326
2479
|
f"Page {self.number}: Starting OCR correction with callback '{correction_callback.__name__}' (max_workers={max_workers})"
|
2327
2480
|
)
|
2328
2481
|
|
2329
|
-
target_elements_collection = self.find_all(
|
2330
|
-
selector=selector, apply_exclusions=False
|
2331
|
-
)
|
2482
|
+
target_elements_collection = self.find_all(selector=selector, apply_exclusions=False)
|
2332
2483
|
target_elements = target_elements_collection.elements # Get the list
|
2333
2484
|
|
2334
2485
|
if not target_elements:
|
@@ -2337,7 +2488,12 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
|
|
2337
2488
|
|
2338
2489
|
element_pbar = None
|
2339
2490
|
try:
|
2340
|
-
element_pbar = tqdm(
|
2491
|
+
element_pbar = tqdm(
|
2492
|
+
total=len(target_elements),
|
2493
|
+
desc=f"Correcting OCR Page {self.number}",
|
2494
|
+
unit="element",
|
2495
|
+
leave=False,
|
2496
|
+
)
|
2341
2497
|
|
2342
2498
|
processed_count = 0
|
2343
2499
|
updated_count = 0
|
natural_pdf/core/pdf.py
CHANGED
@@ -24,6 +24,7 @@ from typing import (
|
|
24
24
|
|
25
25
|
import pdfplumber
|
26
26
|
from PIL import Image
|
27
|
+
from tqdm.auto import tqdm
|
27
28
|
|
28
29
|
from natural_pdf.analyzers.layout.layout_manager import LayoutManager
|
29
30
|
from natural_pdf.classification.manager import ClassificationError, ClassificationManager
|
@@ -38,7 +39,6 @@ from natural_pdf.extraction.mixin import ExtractionMixin
|
|
38
39
|
from natural_pdf.ocr import OCRManager, OCROptions
|
39
40
|
from natural_pdf.selectors.parser import parse_selector
|
40
41
|
from natural_pdf.utils.locks import pdf_render_lock
|
41
|
-
from tqdm.auto import tqdm
|
42
42
|
|
43
43
|
try:
|
44
44
|
from typing import Any as TypingAny
|
@@ -307,7 +307,6 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
307
307
|
) -> "PDF":
|
308
308
|
"""
|
309
309
|
Applies OCR to specified pages of the PDF using batch processing.
|
310
|
-
Applies OCR to specified pages of the PDF using batch processing.
|
311
310
|
|
312
311
|
Args:
|
313
312
|
engine: Name of the OCR engine
|
@@ -320,25 +319,27 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
320
319
|
replace: Whether to replace existing OCR elements
|
321
320
|
options: Engine-specific options
|
322
321
|
pages: Page indices to process or None for all pages
|
323
|
-
engine: Name of the OCR engine
|
324
|
-
languages: List of language codes
|
325
|
-
min_confidence: Minimum confidence threshold
|
326
|
-
device: Device to run OCR on
|
327
|
-
resolution: DPI resolution for page images
|
328
|
-
apply_exclusions: Whether to mask excluded areas
|
329
|
-
detect_only: If True, only detect text boxes
|
330
|
-
replace: Whether to replace existing OCR elements
|
331
|
-
options: Engine-specific options
|
332
|
-
pages: Page indices to process or None for all pages
|
333
322
|
|
334
323
|
Returns:
|
335
324
|
Self for method chaining
|
336
|
-
Self for method chaining
|
337
325
|
"""
|
338
326
|
if not self._ocr_manager:
|
339
327
|
logger.error("OCRManager not available. Cannot apply OCR.")
|
340
328
|
return self
|
341
329
|
|
330
|
+
# Apply global options as defaults, but allow explicit parameters to override
|
331
|
+
import natural_pdf
|
332
|
+
|
333
|
+
# Use global OCR options if parameters are not explicitly set
|
334
|
+
if engine is None:
|
335
|
+
engine = natural_pdf.options.ocr.engine
|
336
|
+
if languages is None:
|
337
|
+
languages = natural_pdf.options.ocr.languages
|
338
|
+
if min_confidence is None:
|
339
|
+
min_confidence = natural_pdf.options.ocr.min_confidence
|
340
|
+
if device is None:
|
341
|
+
pass # No default device in options.ocr anymore
|
342
|
+
|
342
343
|
thread_id = threading.current_thread().name
|
343
344
|
logger.debug(f"[{thread_id}] PDF.apply_ocr starting for {self.path}")
|
344
345
|
|
@@ -425,18 +426,14 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
425
426
|
logger.info(f"[{thread_id}] Calling OCR Manager with args: {ocr_call_args}...")
|
426
427
|
ocr_start_time = time.monotonic()
|
427
428
|
|
428
|
-
|
429
|
-
batch_results = self._ocr_manager.apply_ocr(**manager_args)
|
430
|
-
|
431
|
-
if not isinstance(batch_results, list) or len(batch_results) != len(images_pil):
|
432
|
-
logger.error(f"OCR Manager returned unexpected result format or length.")
|
433
|
-
return self
|
429
|
+
batch_results = self._ocr_manager.apply_ocr(**manager_args)
|
434
430
|
|
435
|
-
|
436
|
-
|
437
|
-
logger.error(f"Batch OCR processing failed: {e}")
|
431
|
+
if not isinstance(batch_results, list) or len(batch_results) != len(images_pil):
|
432
|
+
logger.error(f"OCR Manager returned unexpected result format or length.")
|
438
433
|
return self
|
439
434
|
|
435
|
+
logger.info("OCR Manager batch processing complete.")
|
436
|
+
|
440
437
|
ocr_end_time = time.monotonic()
|
441
438
|
logger.debug(
|
442
439
|
f"[{thread_id}] OCR processing finished (Duration: {ocr_end_time - ocr_start_time:.2f}s)"
|
natural_pdf/elements/base.py
CHANGED
@@ -18,34 +18,34 @@ if TYPE_CHECKING:
|
|
18
18
|
def extract_bbox(obj: Any) -> Optional[Tuple[float, float, float, float]]:
|
19
19
|
"""
|
20
20
|
Extract bounding box coordinates from any object that has bbox properties.
|
21
|
-
|
21
|
+
|
22
22
|
Args:
|
23
23
|
obj: Object that might have bbox coordinates (Element, Region, etc.)
|
24
|
-
|
24
|
+
|
25
25
|
Returns:
|
26
26
|
Tuple of (x0, top, x1, bottom) or None if object doesn't have bbox properties
|
27
27
|
"""
|
28
28
|
# Try bbox property first (most common)
|
29
|
-
if hasattr(obj,
|
29
|
+
if hasattr(obj, "bbox") and obj.bbox is not None:
|
30
30
|
bbox = obj.bbox
|
31
31
|
if isinstance(bbox, (tuple, list)) and len(bbox) == 4:
|
32
32
|
return tuple(float(coord) for coord in bbox)
|
33
|
-
|
33
|
+
|
34
34
|
# Try individual coordinate properties
|
35
|
-
if all(hasattr(obj, attr) for attr in [
|
35
|
+
if all(hasattr(obj, attr) for attr in ["x0", "top", "x1", "bottom"]):
|
36
36
|
try:
|
37
37
|
return (float(obj.x0), float(obj.top), float(obj.x1), float(obj.bottom))
|
38
38
|
except (ValueError, TypeError):
|
39
39
|
pass
|
40
|
-
|
40
|
+
|
41
41
|
# If object is a dict with bbox keys
|
42
42
|
if isinstance(obj, dict):
|
43
|
-
if all(key in obj for key in [
|
43
|
+
if all(key in obj for key in ["x0", "top", "x1", "bottom"]):
|
44
44
|
try:
|
45
|
-
return (float(obj[
|
45
|
+
return (float(obj["x0"]), float(obj["top"]), float(obj["x1"]), float(obj["bottom"]))
|
46
46
|
except (ValueError, TypeError):
|
47
47
|
pass
|
48
|
-
|
48
|
+
|
49
49
|
return None
|
50
50
|
|
51
51
|
|