natural-pdf 0.1.5__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/ocr/index.md +34 -47
- docs/tutorials/01-loading-and-extraction.ipynb +60 -46
- docs/tutorials/02-finding-elements.ipynb +42 -42
- docs/tutorials/03-extracting-blocks.ipynb +17 -17
- docs/tutorials/04-table-extraction.ipynb +12 -12
- docs/tutorials/05-excluding-content.ipynb +30 -30
- docs/tutorials/06-document-qa.ipynb +28 -28
- docs/tutorials/07-layout-analysis.ipynb +63 -35
- docs/tutorials/07-working-with-regions.ipynb +55 -51
- docs/tutorials/07-working-with-regions.md +2 -2
- docs/tutorials/08-spatial-navigation.ipynb +60 -60
- docs/tutorials/09-section-extraction.ipynb +113 -113
- docs/tutorials/10-form-field-extraction.ipynb +78 -50
- docs/tutorials/11-enhanced-table-processing.ipynb +6 -6
- docs/tutorials/12-ocr-integration.ipynb +149 -131
- docs/tutorials/12-ocr-integration.md +0 -13
- docs/tutorials/13-semantic-search.ipynb +313 -873
- natural_pdf/__init__.py +21 -23
- natural_pdf/analyzers/layout/gemini.py +264 -0
- natural_pdf/analyzers/layout/layout_manager.py +28 -1
- natural_pdf/analyzers/layout/layout_options.py +11 -0
- natural_pdf/analyzers/layout/yolo.py +6 -2
- natural_pdf/collections/pdf_collection.py +21 -0
- natural_pdf/core/element_manager.py +16 -13
- natural_pdf/core/page.py +165 -36
- natural_pdf/core/pdf.py +146 -41
- natural_pdf/elements/base.py +11 -17
- natural_pdf/elements/collections.py +100 -38
- natural_pdf/elements/region.py +77 -38
- natural_pdf/elements/text.py +5 -0
- natural_pdf/ocr/__init__.py +49 -36
- natural_pdf/ocr/engine.py +146 -51
- natural_pdf/ocr/engine_easyocr.py +141 -161
- natural_pdf/ocr/engine_paddle.py +107 -193
- natural_pdf/ocr/engine_surya.py +75 -148
- natural_pdf/ocr/ocr_factory.py +114 -0
- natural_pdf/ocr/ocr_manager.py +65 -93
- natural_pdf/ocr/ocr_options.py +7 -17
- natural_pdf/ocr/utils.py +98 -0
- natural_pdf/templates/spa/css/style.css +334 -0
- natural_pdf/templates/spa/index.html +31 -0
- natural_pdf/templates/spa/js/app.js +472 -0
- natural_pdf/templates/spa/words.txt +235976 -0
- natural_pdf/utils/debug.py +32 -0
- natural_pdf/utils/identifiers.py +29 -0
- natural_pdf/utils/packaging.py +418 -0
- {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/METADATA +41 -19
- {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/RECORD +51 -44
- {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/top_level.txt +0 -1
- natural_pdf/templates/ocr_debug.html +0 -517
- tests/test_loading.py +0 -50
- tests/test_optional_deps.py +0 -298
- {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/licenses/LICENSE +0 -0
natural_pdf/core/page.py
CHANGED
@@ -10,7 +10,7 @@ from pathlib import Path
|
|
10
10
|
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
|
11
11
|
|
12
12
|
import pdfplumber
|
13
|
-
from PIL import Image
|
13
|
+
from PIL import Image, ImageDraw
|
14
14
|
|
15
15
|
from natural_pdf.elements.collections import ElementCollection
|
16
16
|
from natural_pdf.elements.region import Region
|
@@ -43,6 +43,9 @@ from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_t
|
|
43
43
|
from natural_pdf.widgets import InteractiveViewerWidget
|
44
44
|
from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, SimpleInteractiveViewerWidget
|
45
45
|
|
46
|
+
from natural_pdf.qa import DocumentQA, get_qa_engine
|
47
|
+
from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
|
48
|
+
|
46
49
|
logger = logging.getLogger(__name__)
|
47
50
|
|
48
51
|
|
@@ -1230,6 +1233,7 @@ class Page:
|
|
1230
1233
|
render_ocr: bool = False,
|
1231
1234
|
resolution: Optional[float] = None,
|
1232
1235
|
include_highlights: bool = True,
|
1236
|
+
exclusions: Optional[str] = None, # New parameter
|
1233
1237
|
**kwargs,
|
1234
1238
|
) -> Optional[Image.Image]:
|
1235
1239
|
"""
|
@@ -1244,27 +1248,29 @@ class Page:
|
|
1244
1248
|
render_ocr: Whether to render OCR text on highlights.
|
1245
1249
|
resolution: Resolution in DPI for base page image (default: scale * 72).
|
1246
1250
|
include_highlights: Whether to render highlights.
|
1251
|
+
exclusions: If 'mask', excluded regions will be whited out on the image.
|
1252
|
+
(default: None).
|
1247
1253
|
**kwargs: Additional parameters for pdfplumber.to_image.
|
1248
1254
|
|
1249
1255
|
Returns:
|
1250
1256
|
PIL Image of the page, or None if rendering fails.
|
1251
1257
|
"""
|
1252
1258
|
image = None
|
1259
|
+
render_resolution = resolution if resolution is not None else scale * 72
|
1253
1260
|
try:
|
1254
1261
|
if include_highlights:
|
1255
1262
|
# Delegate rendering to the central service
|
1256
1263
|
image = self._highlighter.render_page(
|
1257
1264
|
page_index=self.index,
|
1258
|
-
scale=scale,
|
1265
|
+
scale=scale, # Note: scale is used by highlighter internally for drawing
|
1259
1266
|
labels=labels,
|
1260
1267
|
legend_position=legend_position,
|
1261
1268
|
render_ocr=render_ocr,
|
1262
|
-
resolution=resolution
|
1269
|
+
resolution=render_resolution, # Pass the calculated resolution
|
1263
1270
|
**kwargs,
|
1264
1271
|
)
|
1265
1272
|
else:
|
1266
1273
|
# Get the base page image directly from pdfplumber if no highlights needed
|
1267
|
-
render_resolution = resolution if resolution is not None else scale * 72
|
1268
1274
|
# Use the underlying pdfplumber page object
|
1269
1275
|
img_object = self._page.to_image(resolution=render_resolution, **kwargs)
|
1270
1276
|
# Access the PIL image directly (assuming pdfplumber structure)
|
@@ -1287,6 +1293,48 @@ class Page:
|
|
1287
1293
|
if image is None:
|
1288
1294
|
return None
|
1289
1295
|
|
1296
|
+
# --- Apply exclusion masking if requested ---
|
1297
|
+
if exclusions == "mask" and self._exclusions:
|
1298
|
+
try:
|
1299
|
+
# Ensure image is mutable (RGB or RGBA)
|
1300
|
+
if image.mode not in ("RGB", "RGBA"):
|
1301
|
+
image = image.convert("RGB")
|
1302
|
+
|
1303
|
+
exclusion_regions = self._get_exclusion_regions(include_callable=True, debug=False)
|
1304
|
+
if exclusion_regions:
|
1305
|
+
draw = ImageDraw.Draw(image)
|
1306
|
+
# Calculate the scaling factor used for the image
|
1307
|
+
# Base image was rendered at render_resolution (DPI)
|
1308
|
+
# pdfplumber default is 72 DPI
|
1309
|
+
# Scale factor = (pixels / inch) / (points / inch) = DPI / 72
|
1310
|
+
img_scale = render_resolution / 72.0
|
1311
|
+
|
1312
|
+
for region in exclusion_regions:
|
1313
|
+
# Convert PDF points (x0, top, x1, bottom) to image pixels
|
1314
|
+
img_x0 = region.x0 * img_scale
|
1315
|
+
img_top = region.top * img_scale
|
1316
|
+
img_x1 = region.x1 * img_scale
|
1317
|
+
img_bottom = region.bottom * img_scale
|
1318
|
+
|
1319
|
+
# Draw a white rectangle over the excluded area
|
1320
|
+
# Ensure coordinates are within image bounds (though region should be)
|
1321
|
+
img_coords = (
|
1322
|
+
max(0, img_x0),
|
1323
|
+
max(0, img_top),
|
1324
|
+
min(image.width, img_x1),
|
1325
|
+
min(image.height, img_bottom)
|
1326
|
+
)
|
1327
|
+
if img_coords[0] < img_coords[2] and img_coords[1] < img_coords[3]:
|
1328
|
+
draw.rectangle(img_coords, fill="white")
|
1329
|
+
else:
|
1330
|
+
logger.warning(f"Skipping invalid exclusion rect for masking: {img_coords}")
|
1331
|
+
|
1332
|
+
del draw # Release drawing context
|
1333
|
+
except Exception as mask_error:
|
1334
|
+
logger.error(f"Error applying exclusion mask to page {self.index}: {mask_error}", exc_info=True)
|
1335
|
+
# Decide if you want to return None or continue without mask
|
1336
|
+
# For now, continue without mask
|
1337
|
+
|
1290
1338
|
# Resize the final image if width is provided
|
1291
1339
|
if width is not None and width > 0 and image.width > 0:
|
1292
1340
|
aspect_ratio = image.height / image.width
|
@@ -1328,20 +1376,34 @@ class Page:
|
|
1328
1376
|
languages: Optional[List[str]] = None,
|
1329
1377
|
min_confidence: Optional[float] = None,
|
1330
1378
|
device: Optional[str] = None,
|
1379
|
+
resolution: Optional[int] = None,
|
1380
|
+
detect_only: bool = False,
|
1381
|
+
apply_exclusions: bool = True,
|
1331
1382
|
) -> "Page":
|
1332
1383
|
"""
|
1333
1384
|
Apply OCR to THIS page and add results to page elements via PDF.apply_ocr.
|
1334
1385
|
|
1386
|
+
Args:
|
1387
|
+
engine: Name of the OCR engine.
|
1388
|
+
options: Engine-specific options object or dict.
|
1389
|
+
languages: List of engine-specific language codes.
|
1390
|
+
min_confidence: Minimum confidence threshold.
|
1391
|
+
device: Device to run OCR on.
|
1392
|
+
resolution: DPI resolution for rendering page image before OCR.
|
1393
|
+
apply_exclusions: If True (default), render page image for OCR
|
1394
|
+
with excluded areas masked (whited out).
|
1395
|
+
|
1335
1396
|
Returns:
|
1336
1397
|
Self (this Page) for method chaining; an empty list is returned instead if OCR cannot be delegated to the parent PDF or the delegated call fails.
|
1337
1398
|
"""
|
1338
1399
|
if not hasattr(self._parent, "apply_ocr"):
|
1339
1400
|
logger.error(f"Page {self.number}: Parent PDF missing 'apply_ocr'. Cannot apply OCR.")
|
1340
|
-
return []
|
1401
|
+
return [] # Return empty list for consistency
|
1341
1402
|
|
1342
1403
|
logger.info(f"Page {self.number}: Delegating apply_ocr to PDF.apply_ocr.")
|
1343
1404
|
try:
|
1344
1405
|
# Delegate to parent PDF, targeting only this page's index
|
1406
|
+
# Pass all relevant parameters through, including apply_exclusions
|
1345
1407
|
self._parent.apply_ocr(
|
1346
1408
|
pages=[self.index],
|
1347
1409
|
engine=engine,
|
@@ -1349,17 +1411,21 @@ class Page:
|
|
1349
1411
|
languages=languages,
|
1350
1412
|
min_confidence=min_confidence,
|
1351
1413
|
device=device,
|
1414
|
+
resolution=resolution,
|
1415
|
+
detect_only=detect_only,
|
1416
|
+
apply_exclusions=apply_exclusions,
|
1352
1417
|
)
|
1353
1418
|
except Exception as e:
|
1354
1419
|
logger.error(f"Page {self.number}: Error during delegated OCR call: {e}", exc_info=True)
|
1355
1420
|
return []
|
1356
1421
|
|
1357
1422
|
# Return the OCR elements specifically added to this page
|
1358
|
-
# Use element manager to retrieve them
|
1359
1423
|
ocr_elements = [el for el in self.words if getattr(el, "source", None) == "ocr"]
|
1360
1424
|
logger.debug(
|
1361
1425
|
f"Page {self.number}: apply_ocr completed. Found {len(ocr_elements)} OCR elements."
|
1362
1426
|
)
|
1427
|
+
# Note: The method is typed to return Page for chaining, but the log indicates
|
1428
|
+
# finding elements. Let's stick to returning self for chaining consistency.
|
1363
1429
|
return self
|
1364
1430
|
|
1365
1431
|
def extract_ocr_elements(
|
@@ -1369,10 +1435,22 @@ class Page:
|
|
1369
1435
|
languages: Optional[List[str]] = None,
|
1370
1436
|
min_confidence: Optional[float] = None,
|
1371
1437
|
device: Optional[str] = None,
|
1438
|
+
resolution: Optional[int] = None,
|
1372
1439
|
) -> List[TextElement]:
|
1373
1440
|
"""
|
1374
1441
|
Extract text elements using OCR *without* adding them to the page's elements.
|
1375
1442
|
Uses the shared OCRManager instance.
|
1443
|
+
|
1444
|
+
Args:
|
1445
|
+
engine: Name of the OCR engine.
|
1446
|
+
options: Engine-specific options object or dict.
|
1447
|
+
languages: List of engine-specific language codes.
|
1448
|
+
min_confidence: Minimum confidence threshold.
|
1449
|
+
device: Device to run OCR on.
|
1450
|
+
resolution: DPI resolution for rendering page image before OCR.
|
1451
|
+
|
1452
|
+
Returns:
|
1453
|
+
List of created TextElement objects derived from OCR results for this page.
|
1376
1454
|
"""
|
1377
1455
|
if not self._ocr_manager:
|
1378
1456
|
logger.error(
|
@@ -1381,10 +1459,14 @@ class Page:
|
|
1381
1459
|
return []
|
1382
1460
|
|
1383
1461
|
logger.info(f"Page {self.number}: Extracting OCR elements (extract only)...")
|
1462
|
+
|
1463
|
+
# Determine rendering resolution
|
1464
|
+
final_resolution = resolution if resolution is not None else 150 # Default to 150 DPI
|
1465
|
+
logger.debug(f" Using rendering resolution: {final_resolution} DPI")
|
1466
|
+
|
1384
1467
|
try:
|
1385
|
-
|
1386
|
-
|
1387
|
-
image = self.to_image(scale=ocr_scale, include_highlights=False)
|
1468
|
+
# Get base image without highlights using the determined resolution
|
1469
|
+
image = self.to_image(resolution=final_resolution, include_highlights=False)
|
1388
1470
|
if not image:
|
1389
1471
|
logger.error(f" Failed to render page {self.number} to image for OCR extraction.")
|
1390
1472
|
return []
|
@@ -1393,13 +1475,16 @@ class Page:
|
|
1393
1475
|
logger.error(f" Failed to render page {self.number} to image: {e}", exc_info=True)
|
1394
1476
|
return []
|
1395
1477
|
|
1396
|
-
|
1397
|
-
|
1398
|
-
|
1399
|
-
|
1400
|
-
|
1401
|
-
|
1402
|
-
|
1478
|
+
# Prepare arguments for the OCR Manager call
|
1479
|
+
manager_args = {
|
1480
|
+
"images": image,
|
1481
|
+
"engine": engine,
|
1482
|
+
"languages": languages,
|
1483
|
+
"min_confidence": min_confidence,
|
1484
|
+
"device": device,
|
1485
|
+
"options": options
|
1486
|
+
}
|
1487
|
+
manager_args = {k: v for k, v in manager_args.items() if v is not None}
|
1403
1488
|
|
1404
1489
|
logger.debug(
|
1405
1490
|
f" Calling OCR Manager (extract only) with args: { {k:v for k,v in manager_args.items() if k != 'images'} }"
|
@@ -1415,7 +1500,6 @@ class Page:
|
|
1415
1500
|
and isinstance(results_list[0], list)
|
1416
1501
|
else results_list
|
1417
1502
|
)
|
1418
|
-
|
1419
1503
|
if not isinstance(results, list):
|
1420
1504
|
logger.error(f" OCR Manager returned unexpected type: {type(results)}")
|
1421
1505
|
results = []
|
@@ -1426,28 +1510,30 @@ class Page:
|
|
1426
1510
|
|
1427
1511
|
# Convert results but DO NOT add to ElementManager
|
1428
1512
|
logger.debug(f" Converting OCR results to TextElements (extract only)...")
|
1429
|
-
# Use a temporary method to create elements without adding them globally
|
1430
1513
|
temp_elements = []
|
1431
1514
|
scale_x = self.width / image.width if image.width else 1
|
1432
1515
|
scale_y = self.height / image.height if image.height else 1
|
1433
1516
|
for result in results:
|
1434
|
-
|
1435
|
-
|
1436
|
-
|
1437
|
-
|
1438
|
-
|
1439
|
-
|
1440
|
-
|
1441
|
-
|
1442
|
-
|
1443
|
-
|
1444
|
-
|
1445
|
-
|
1446
|
-
|
1447
|
-
|
1448
|
-
|
1449
|
-
|
1450
|
-
|
1517
|
+
try: # Added try-except around result processing
|
1518
|
+
x0, top, x1, bottom = [float(c) for c in result["bbox"]]
|
1519
|
+
elem_data = {
|
1520
|
+
"text": result["text"],
|
1521
|
+
"confidence": result["confidence"],
|
1522
|
+
"x0": x0 * scale_x,
|
1523
|
+
"top": top * scale_y,
|
1524
|
+
"x1": x1 * scale_x,
|
1525
|
+
"bottom": bottom * scale_y,
|
1526
|
+
"width": (x1 - x0) * scale_x,
|
1527
|
+
"height": (bottom - top) * scale_y,
|
1528
|
+
"object_type": "text", # Using text for temporary elements
|
1529
|
+
"source": "ocr",
|
1530
|
+
"fontname": "OCR-extract", # Different name for clarity
|
1531
|
+
"size": 10.0,
|
1532
|
+
"page_number": self.number,
|
1533
|
+
}
|
1534
|
+
temp_elements.append(TextElement(elem_data, self))
|
1535
|
+
except (KeyError, ValueError, TypeError) as convert_err:
|
1536
|
+
logger.warning(f" Skipping invalid OCR result during conversion: {result}. Error: {convert_err}")
|
1451
1537
|
|
1452
1538
|
logger.info(f" Created {len(temp_elements)} TextElements from OCR (extract only).")
|
1453
1539
|
return temp_elements
|
@@ -1914,7 +2000,7 @@ class Page:
|
|
1914
2000
|
Requires optional dependencies. Install with: pip install "natural-pdf[ocr-save]"
|
1915
2001
|
|
1916
2002
|
Note: OCR must have been applied to the pages beforehand
|
1917
|
-
(e.g.,
|
2003
|
+
(e.g., pdf.apply_ocr()).
|
1918
2004
|
|
1919
2005
|
Args:
|
1920
2006
|
output_path: Path to save the searchable PDF.
|
@@ -1929,3 +2015,46 @@ class Page:
|
|
1929
2015
|
|
1930
2016
|
create_searchable_pdf(self, output_path_str, dpi=dpi, **kwargs)
|
1931
2017
|
logger.info(f"Searchable PDF saved to: {output_path_str}")
|
2018
|
+
|
2019
|
+
# --- Added correct_ocr method ---
|
2020
|
+
def correct_ocr(
    self,
    correction_callback: Callable[[Any], Optional[str]],
) -> "Page":
    """
    Applies corrections to OCR-generated text elements on this page
    using a user-provided callback function.

    Finds text elements on this page whose 'source' attribute starts
    with 'ocr' and hands them, together with `correction_callback`, to
    `_apply_ocr_correction_to_elements`, which invokes the callback for
    each element.

    The `correction_callback` should contain the logic to:
    1. Determine if the element needs correction.
    2. Perform the correction (e.g., call an LLM).
    3. Return the new text (`str`) or `None`.

    If the callback returns a string, the element's `.text` is updated.
    Metadata updates (source, confidence, etc.) should happen within the callback.

    Args:
        correction_callback: A function accepting an element and returning
                             `Optional[str]` (new text or None). Any callable
                             is accepted, including ones without a
                             `__name__` (e.g. `functools.partial` objects).

    Returns:
        Self for method chaining.
    """
    # Not every callable exposes ``__name__`` (functools.partial objects and
    # instances implementing __call__ do not), so fall back to repr() rather
    # than letting a log statement raise AttributeError.
    callback_name = getattr(correction_callback, "__name__", repr(correction_callback))
    logger.info(
        f"Page {self.number}: Starting OCR correction process using callback '{callback_name}'"
    )

    # Exclusions are deliberately ignored here: OCR elements are corrected
    # even when they fall inside an excluded area.
    target_elements = self.find_all(selector="text[source^=ocr]", apply_exclusions=False)

    # The shared utility performs the per-element callback/update loop.
    _apply_ocr_correction_to_elements(
        elements=target_elements,
        correction_callback=correction_callback,
        caller_info=f"Page({self.number})",
    )

    return self
|
natural_pdf/core/pdf.py
CHANGED
@@ -17,6 +17,8 @@ from typing import ( # Added Iterable and TYPE_CHECKING
|
|
17
17
|
Type,
|
18
18
|
Union,
|
19
19
|
)
|
20
|
+
from pathlib import Path
|
21
|
+
|
20
22
|
|
21
23
|
import pdfplumber
|
22
24
|
from PIL import Image
|
@@ -235,11 +237,16 @@ class PDF:
|
|
235
237
|
self,
|
236
238
|
pages: Optional[Union[Iterable[int], range, slice]] = None,
|
237
239
|
engine: Optional[str] = None,
|
238
|
-
|
240
|
+
# --- Common OCR Parameters (Direct Arguments) ---
|
239
241
|
languages: Optional[List[str]] = None,
|
240
|
-
min_confidence: Optional[float] = None,
|
242
|
+
min_confidence: Optional[float] = None, # Min confidence threshold
|
241
243
|
device: Optional[str] = None,
|
242
|
-
|
244
|
+
resolution: Optional[int] = None, # DPI for rendering before OCR
|
245
|
+
apply_exclusions: bool = True, # New parameter
|
246
|
+
detect_only: bool = False,
|
247
|
+
# --- Engine-Specific Options --- Use 'options=' for this
|
248
|
+
options: Optional[Any] = None, # e.g., EasyOCROptions(...), PaddleOCROptions(...), or dict
|
249
|
+
# **kwargs: Optional[Dict[str, Any]] = None # Allow potential extra args?
|
243
250
|
) -> "PDF":
|
244
251
|
"""
|
245
252
|
Applies OCR to specified pages (or all pages) of the PDF using batch processing.
|
@@ -250,20 +257,30 @@ class PDF:
|
|
250
257
|
Args:
|
251
258
|
pages: An iterable of 0-based page indices (list, range, tuple),
|
252
259
|
a slice object, or None to process all pages.
|
253
|
-
engine: Name of the engine (e.g., 'easyocr', 'paddleocr', 'surya').
|
254
|
-
Uses manager's default
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
min_confidence: Minimum confidence threshold for
|
259
|
-
|
260
|
+
engine: Name of the OCR engine (e.g., 'easyocr', 'paddleocr', 'surya').
|
261
|
+
Uses manager's default ('easyocr') if None.
|
262
|
+
languages: List of language codes (e.g., ['en', 'fr'], ['en', 'ch_sim']).
|
263
|
+
**Must be codes understood by the specific selected engine.**
|
264
|
+
No mapping is performed. Overrides manager/engine default.
|
265
|
+
min_confidence: Minimum confidence threshold for detected text (0.0 to 1.0).
|
266
|
+
Overrides manager/engine default.
|
267
|
+
device: Device to run OCR on (e.g., 'cpu', 'cuda', 'mps').
|
268
|
+
Overrides manager/engine default.
|
269
|
+
resolution: DPI resolution to render page images before OCR (e.g., 150, 300).
|
270
|
+
Affects input quality for OCR. Defaults to 150 if not set.
|
271
|
+
apply_exclusions: If True (default), render page image for OCR with
|
272
|
+
excluded areas masked (whited out). If False, OCR
|
273
|
+
the raw page image without masking exclusions.
|
274
|
+
detect_only: If True, only detect text bounding boxes, don't perform OCR.
|
275
|
+
options: An engine-specific options object (e.g., EasyOCROptions) or dict
|
276
|
+
containing parameters specific to the chosen engine.
|
260
277
|
|
261
278
|
Returns:
|
262
279
|
Self for method chaining.
|
263
280
|
|
264
281
|
Raises:
|
265
|
-
ValueError: If page indices are invalid
|
266
|
-
TypeError: If
|
282
|
+
ValueError: If page indices are invalid.
|
283
|
+
TypeError: If 'options' is not compatible with the engine.
|
267
284
|
RuntimeError: If the OCRManager or selected engine is not available.
|
268
285
|
"""
|
269
286
|
if not self._ocr_manager:
|
@@ -271,7 +288,7 @@ class PDF:
|
|
271
288
|
# Or raise RuntimeError("OCRManager not initialized.")
|
272
289
|
return self
|
273
290
|
|
274
|
-
# --- Determine Target Pages ---
|
291
|
+
# --- Determine Target Pages (unchanged) ---
|
275
292
|
target_pages: List[Page] = []
|
276
293
|
if pages is None:
|
277
294
|
target_pages = self._pages
|
@@ -295,44 +312,63 @@ class PDF:
|
|
295
312
|
|
296
313
|
page_numbers = [p.number for p in target_pages]
|
297
314
|
logger.info(f"Applying batch OCR to pages: {page_numbers}...")
|
315
|
+
# --- Determine Rendering Resolution ---
|
316
|
+
# Priority: 1. direct `resolution` arg, 2. PDF config, 3. default 150
|
317
|
+
final_resolution = resolution # Use direct arg if provided
|
318
|
+
if final_resolution is None:
|
319
|
+
final_resolution = getattr(self, "_config", {}).get("resolution", 150)
|
320
|
+
|
321
|
+
logger.debug(f"Using OCR image rendering resolution: {final_resolution} DPI")
|
298
322
|
|
299
323
|
# --- Render Images for Batch ---
|
300
324
|
images_pil: List[Image.Image] = []
|
301
325
|
page_image_map: List[Tuple[Page, Image.Image]] = [] # Store page and its image
|
302
|
-
logger.info(f"Rendering {len(target_pages)} pages to images...")
|
326
|
+
logger.info(f"Rendering {len(target_pages)} pages to images at {final_resolution} DPI (apply_exclusions={apply_exclusions})...")
|
303
327
|
failed_page_num = "unknown" # Keep track of potentially failing page
|
304
328
|
try:
|
305
|
-
ocr_scale = getattr(self, "_config", {}).get("ocr_image_scale", 2.0)
|
306
329
|
for i, page in enumerate(target_pages):
|
307
330
|
failed_page_num = page.number # Update current page number in case of error
|
308
331
|
logger.debug(f" Rendering page {page.number} (index {page.index})...")
|
309
|
-
# Use
|
310
|
-
|
332
|
+
# Use the determined final_resolution and apply exclusions if requested
|
333
|
+
to_image_kwargs = {
|
334
|
+
"resolution": final_resolution,
|
335
|
+
"include_highlights": False,
|
336
|
+
"exclusions": "mask" if apply_exclusions else None,
|
337
|
+
}
|
338
|
+
img = page.to_image(**to_image_kwargs)
|
339
|
+
if img is None:
|
340
|
+
logger.error(f" Failed to render page {page.number} to image.")
|
341
|
+
# Decide how to handle: skip page, raise error? For now, skip.
|
342
|
+
continue # Skip this page if rendering failed
|
311
343
|
images_pil.append(img)
|
312
344
|
page_image_map.append((page, img)) # Store pair
|
313
345
|
except Exception as e:
|
314
346
|
logger.error(f"Failed to render one or more pages for batch OCR: {e}", exc_info=True)
|
315
347
|
raise RuntimeError(f"Failed to render page {failed_page_num} for OCR.") from e
|
316
348
|
|
317
|
-
if not images_pil:
|
349
|
+
if not images_pil or not page_image_map:
|
318
350
|
logger.error("No images were successfully rendered for batch OCR.")
|
319
351
|
return self
|
320
352
|
|
321
353
|
# --- Prepare Arguments for Manager ---
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
354
|
+
# Pass common args directly, engine-specific via options
|
355
|
+
manager_args = {
|
356
|
+
"images": images_pil,
|
357
|
+
"engine": engine,
|
358
|
+
"languages": languages,
|
359
|
+
"min_confidence": min_confidence, # Use the renamed parameter
|
360
|
+
"device": device,
|
361
|
+
"options": options,
|
362
|
+
"detect_only": detect_only,
|
363
|
+
# Note: resolution is used for rendering, not passed to OCR manager directly
|
364
|
+
}
|
365
|
+
# Filter out None values so manager can use its defaults
|
366
|
+
manager_args = {k: v for k, v in manager_args.items() if v is not None}
|
331
367
|
|
332
368
|
# --- Call OCR Manager for Batch Processing ---
|
333
|
-
logger.info(f"Calling OCR Manager for
|
369
|
+
logger.info(f"Calling OCR Manager with args: { {k:v for k,v in manager_args.items() if k!='images'} } ...")
|
334
370
|
try:
|
335
|
-
#
|
371
|
+
# Manager's apply_ocr signature needs to accept common args directly
|
336
372
|
batch_results = self._ocr_manager.apply_ocr(**manager_args)
|
337
373
|
|
338
374
|
if not isinstance(batch_results, list) or len(batch_results) != len(images_pil):
|
@@ -341,16 +377,15 @@ class PDF:
|
|
341
377
|
f"Expected list of length {len(images_pil)}, got {type(batch_results)} "
|
342
378
|
f"with length {len(batch_results) if isinstance(batch_results, list) else 'N/A'}."
|
343
379
|
)
|
344
|
-
|
345
|
-
return self # Return self without adding elements
|
380
|
+
return self
|
346
381
|
|
347
382
|
logger.info("OCR Manager batch processing complete.")
|
348
383
|
|
349
384
|
except Exception as e:
|
350
385
|
logger.error(f"Batch OCR processing failed: {e}", exc_info=True)
|
351
|
-
return self
|
386
|
+
return self
|
352
387
|
|
353
|
-
# --- Distribute Results and Add Elements to Pages ---
|
388
|
+
# --- Distribute Results and Add Elements to Pages (unchanged) ---
|
354
389
|
logger.info("Adding OCR results to respective pages...")
|
355
390
|
total_elements_added = 0
|
356
391
|
for i, (page, img) in enumerate(page_image_map):
|
@@ -362,10 +397,7 @@ class PDF:
|
|
362
397
|
continue
|
363
398
|
|
364
399
|
logger.debug(f" Processing {len(results_for_page)} results for page {page.number}...")
|
365
|
-
# Use the page's element manager to create elements from its results
|
366
|
-
# Changed from page._create_text_elements_from_ocr to use element_mgr
|
367
400
|
try:
|
368
|
-
# Calculate scale factors based on rendered image vs page dims
|
369
401
|
img_scale_x = page.width / img.width if img.width > 0 else 1
|
370
402
|
img_scale_y = page.height / img.height if img.height > 0 else 1
|
371
403
|
elements = page._element_mgr.create_text_elements_from_ocr(
|
@@ -373,7 +405,6 @@ class PDF:
|
|
373
405
|
)
|
374
406
|
|
375
407
|
if elements:
|
376
|
-
# Note: element_mgr.create_text_elements_from_ocr already adds them
|
377
408
|
total_elements_added += len(elements)
|
378
409
|
logger.debug(f" Added {len(elements)} OCR TextElements to page {page.number}.")
|
379
410
|
else:
|
@@ -382,7 +413,6 @@ class PDF:
|
|
382
413
|
logger.error(
|
383
414
|
f" Error adding OCR elements to page {page.number}: {e}", exc_info=True
|
384
415
|
)
|
385
|
-
# Continue to next page
|
386
416
|
|
387
417
|
logger.info(
|
388
418
|
f"Finished adding OCR results. Total elements added across {len(target_pages)} pages: {total_elements_added}"
|
@@ -907,6 +937,80 @@ class PDF:
|
|
907
937
|
f"Search within index failed for PDF '{self.path}'. See logs for details."
|
908
938
|
) from e
|
909
939
|
|
940
|
+
def export_ocr_correction_task(self, output_zip_path: str, **kwargs):
    """
    Exports OCR results from this PDF into a correction task package (zip file).

    Args:
        output_zip_path: The path to save the output zip file.
        **kwargs: Additional arguments passed to create_correction_task_package
                  (e.g., image_render_scale, overwrite).

    Returns:
        None. If the packaging utility itself cannot be imported, an error
        is logged and nothing is exported.

    Raises:
        Exception: Any error raised while building the package is logged and
            re-raised unchanged.
    """
    # Keep this try limited to the import: an ImportError raised *inside* the
    # packaging utility (e.g. a missing optional dependency) must not be
    # mistaken for the utility module itself being absent.
    try:
        from natural_pdf.utils.packaging import create_correction_task_package
    except ImportError:
        logger.error("Failed to import 'create_correction_task_package'. Packaging utility might be missing.")
        return

    try:
        create_correction_task_package(source=self, output_zip_path=output_zip_path, **kwargs)
    except Exception as e:
        logger.error(f"Failed to export correction task for {self.path}: {e}", exc_info=True)
        raise  # Re-raise the exception from the utility function
|
958
|
+
|
959
|
+
def correct_ocr(
    self,
    correction_callback: Callable[[Any], Optional[str]],
    pages: Optional[Union[Iterable[int], range, slice]] = None,
) -> "PDF":
    """
    Applies corrections to OCR-generated text elements using a callback function,
    delegating the core work to the `Page.correct_ocr` method.

    Args:
        correction_callback: A function that accepts a single argument (an element
                             object) and returns `Optional[str]`. It returns the
                             corrected text string if an update is needed, otherwise None.
        pages: Optional page indices/slice to limit the scope of correction
               (default: all pages). Indices are 0-based and must be within
               range; negative indices are rejected.

    Returns:
        Self for method chaining.

    Raises:
        ValueError: If an entry in `pages` is not convertible to int or is
            out of range.
        TypeError: If `pages` is not None, a slice, or an iterable of ints
            (strings/bytes are rejected rather than iterated per character).
    """
    page_count = len(self._pages)

    # Resolve `pages` into a concrete list of 0-based indices.
    target_page_indices: List[int] = []
    if pages is None:
        target_page_indices = list(range(page_count))
    elif isinstance(pages, slice):
        target_page_indices = list(range(*pages.indices(page_count)))
    elif hasattr(pages, "__iter__") and not isinstance(pages, (str, bytes)):
        # str/bytes are iterable but are not "an iterable of page indices";
        # iterating them would silently treat "12" as pages 1 and 2.
        try:
            target_page_indices = [int(i) for i in pages]
            for idx in target_page_indices:
                if not (0 <= idx < page_count):
                    raise IndexError(f"Page index {idx} out of range (0-{page_count - 1}).")
        except (IndexError, TypeError, ValueError) as e:
            raise ValueError(f"Invalid page index or type provided in 'pages': {pages}. Error: {e}") from e
    else:
        raise TypeError("'pages' must be None, a slice, or an iterable of page indices (int).")

    if not target_page_indices:
        logger.warning("No pages selected for OCR correction.")
        return self

    logger.info(f"Starting OCR correction process via Page delegation for pages: {target_page_indices}")

    # Best-effort per page: a failure on one page is logged and does not
    # prevent the remaining pages from being processed.
    for page_idx in target_page_indices:
        page = self._pages[page_idx]
        try:
            page.correct_ocr(correction_callback=correction_callback)
        except Exception as e:
            logger.error(f"Error during correct_ocr on page {page_idx}: {e}", exc_info=True)

    logger.info("OCR correction process finished for requested pages.")
    return self
|
1013
|
+
|
910
1014
|
def __len__(self) -> int:
|
911
1015
|
"""Return the number of pages in the PDF."""
|
912
1016
|
# Ensure _pages is initialized
|
@@ -968,6 +1072,7 @@ class PDF:
|
|
968
1072
|
self.close()
|
969
1073
|
|
970
1074
|
|
971
|
-
# ---
|
972
|
-
|
973
|
-
|
1075
|
+
# --- Indexable Protocol Methods --- Needed for search/sync
|
1076
|
+
def get_id(self) -> str:
    """Return this PDF's identifier, which is the value of ``self.path``."""
    pdf_identifier = self.path
    return pdf_identifier
|
1078
|
+
|