natural-pdf 0.1.5__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. docs/ocr/index.md +34 -47
  2. docs/tutorials/01-loading-and-extraction.ipynb +60 -46
  3. docs/tutorials/02-finding-elements.ipynb +42 -42
  4. docs/tutorials/03-extracting-blocks.ipynb +17 -17
  5. docs/tutorials/04-table-extraction.ipynb +12 -12
  6. docs/tutorials/05-excluding-content.ipynb +30 -30
  7. docs/tutorials/06-document-qa.ipynb +28 -28
  8. docs/tutorials/07-layout-analysis.ipynb +63 -35
  9. docs/tutorials/07-working-with-regions.ipynb +55 -51
  10. docs/tutorials/07-working-with-regions.md +2 -2
  11. docs/tutorials/08-spatial-navigation.ipynb +60 -60
  12. docs/tutorials/09-section-extraction.ipynb +113 -113
  13. docs/tutorials/10-form-field-extraction.ipynb +78 -50
  14. docs/tutorials/11-enhanced-table-processing.ipynb +6 -6
  15. docs/tutorials/12-ocr-integration.ipynb +149 -131
  16. docs/tutorials/12-ocr-integration.md +0 -13
  17. docs/tutorials/13-semantic-search.ipynb +313 -873
  18. natural_pdf/__init__.py +21 -23
  19. natural_pdf/analyzers/layout/gemini.py +264 -0
  20. natural_pdf/analyzers/layout/layout_manager.py +28 -1
  21. natural_pdf/analyzers/layout/layout_options.py +11 -0
  22. natural_pdf/analyzers/layout/yolo.py +6 -2
  23. natural_pdf/collections/pdf_collection.py +21 -0
  24. natural_pdf/core/element_manager.py +16 -13
  25. natural_pdf/core/page.py +165 -36
  26. natural_pdf/core/pdf.py +146 -41
  27. natural_pdf/elements/base.py +11 -17
  28. natural_pdf/elements/collections.py +100 -38
  29. natural_pdf/elements/region.py +77 -38
  30. natural_pdf/elements/text.py +5 -0
  31. natural_pdf/ocr/__init__.py +49 -36
  32. natural_pdf/ocr/engine.py +146 -51
  33. natural_pdf/ocr/engine_easyocr.py +141 -161
  34. natural_pdf/ocr/engine_paddle.py +107 -193
  35. natural_pdf/ocr/engine_surya.py +75 -148
  36. natural_pdf/ocr/ocr_factory.py +114 -0
  37. natural_pdf/ocr/ocr_manager.py +65 -93
  38. natural_pdf/ocr/ocr_options.py +7 -17
  39. natural_pdf/ocr/utils.py +98 -0
  40. natural_pdf/templates/spa/css/style.css +334 -0
  41. natural_pdf/templates/spa/index.html +31 -0
  42. natural_pdf/templates/spa/js/app.js +472 -0
  43. natural_pdf/templates/spa/words.txt +235976 -0
  44. natural_pdf/utils/debug.py +32 -0
  45. natural_pdf/utils/identifiers.py +29 -0
  46. natural_pdf/utils/packaging.py +418 -0
  47. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/METADATA +41 -19
  48. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/RECORD +51 -44
  49. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/WHEEL +1 -1
  50. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/top_level.txt +0 -1
  51. natural_pdf/templates/ocr_debug.html +0 -517
  52. tests/test_loading.py +0 -50
  53. tests/test_optional_deps.py +0 -298
  54. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/licenses/LICENSE +0 -0
natural_pdf/core/page.py CHANGED
@@ -10,7 +10,7 @@ from pathlib import Path
10
10
  from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
11
11
 
12
12
  import pdfplumber
13
- from PIL import Image
13
+ from PIL import Image, ImageDraw
14
14
 
15
15
  from natural_pdf.elements.collections import ElementCollection
16
16
  from natural_pdf.elements.region import Region
@@ -43,6 +43,9 @@ from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_t
43
43
  from natural_pdf.widgets import InteractiveViewerWidget
44
44
  from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, SimpleInteractiveViewerWidget
45
45
 
46
+ from natural_pdf.qa import DocumentQA, get_qa_engine
47
+ from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
48
+
46
49
  logger = logging.getLogger(__name__)
47
50
 
48
51
 
@@ -1230,6 +1233,7 @@ class Page:
1230
1233
  render_ocr: bool = False,
1231
1234
  resolution: Optional[float] = None,
1232
1235
  include_highlights: bool = True,
1236
+ exclusions: Optional[str] = None, # New parameter
1233
1237
  **kwargs,
1234
1238
  ) -> Optional[Image.Image]:
1235
1239
  """
@@ -1244,27 +1248,29 @@ class Page:
1244
1248
  render_ocr: Whether to render OCR text on highlights.
1245
1249
  resolution: Resolution in DPI for base page image (default: scale * 72).
1246
1250
  include_highlights: Whether to render highlights.
1251
+ exclusions: If 'mask', excluded regions will be whited out on the image.
1252
+ (default: None).
1247
1253
  **kwargs: Additional parameters for pdfplumber.to_image.
1248
1254
 
1249
1255
  Returns:
1250
1256
  PIL Image of the page, or None if rendering fails.
1251
1257
  """
1252
1258
  image = None
1259
+ render_resolution = resolution if resolution is not None else scale * 72
1253
1260
  try:
1254
1261
  if include_highlights:
1255
1262
  # Delegate rendering to the central service
1256
1263
  image = self._highlighter.render_page(
1257
1264
  page_index=self.index,
1258
- scale=scale,
1265
+ scale=scale, # Note: scale is used by highlighter internally for drawing
1259
1266
  labels=labels,
1260
1267
  legend_position=legend_position,
1261
1268
  render_ocr=render_ocr,
1262
- resolution=resolution,
1269
+ resolution=render_resolution, # Pass the calculated resolution
1263
1270
  **kwargs,
1264
1271
  )
1265
1272
  else:
1266
1273
  # Get the base page image directly from pdfplumber if no highlights needed
1267
- render_resolution = resolution if resolution is not None else scale * 72
1268
1274
  # Use the underlying pdfplumber page object
1269
1275
  img_object = self._page.to_image(resolution=render_resolution, **kwargs)
1270
1276
  # Access the PIL image directly (assuming pdfplumber structure)
@@ -1287,6 +1293,48 @@ class Page:
1287
1293
  if image is None:
1288
1294
  return None
1289
1295
 
1296
+ # --- Apply exclusion masking if requested ---
1297
+ if exclusions == "mask" and self._exclusions:
1298
+ try:
1299
+ # Ensure image is mutable (RGB or RGBA)
1300
+ if image.mode not in ("RGB", "RGBA"):
1301
+ image = image.convert("RGB")
1302
+
1303
+ exclusion_regions = self._get_exclusion_regions(include_callable=True, debug=False)
1304
+ if exclusion_regions:
1305
+ draw = ImageDraw.Draw(image)
1306
+ # Calculate the scaling factor used for the image
1307
+ # Base image was rendered at render_resolution (DPI)
1308
+ # pdfplumber default is 72 DPI
1309
+ # Scale factor = (pixels / inch) / (points / inch) = DPI / 72
1310
+ img_scale = render_resolution / 72.0
1311
+
1312
+ for region in exclusion_regions:
1313
+ # Convert PDF points (x0, top, x1, bottom) to image pixels
1314
+ img_x0 = region.x0 * img_scale
1315
+ img_top = region.top * img_scale
1316
+ img_x1 = region.x1 * img_scale
1317
+ img_bottom = region.bottom * img_scale
1318
+
1319
+ # Draw a white rectangle over the excluded area
1320
+ # Ensure coordinates are within image bounds (though region should be)
1321
+ img_coords = (
1322
+ max(0, img_x0),
1323
+ max(0, img_top),
1324
+ min(image.width, img_x1),
1325
+ min(image.height, img_bottom)
1326
+ )
1327
+ if img_coords[0] < img_coords[2] and img_coords[1] < img_coords[3]:
1328
+ draw.rectangle(img_coords, fill="white")
1329
+ else:
1330
+ logger.warning(f"Skipping invalid exclusion rect for masking: {img_coords}")
1331
+
1332
+ del draw # Release drawing context
1333
+ except Exception as mask_error:
1334
+ logger.error(f"Error applying exclusion mask to page {self.index}: {mask_error}", exc_info=True)
1335
+ # Decide if you want to return None or continue without mask
1336
+ # For now, continue without mask
1337
+
1290
1338
  # Resize the final image if width is provided
1291
1339
  if width is not None and width > 0 and image.width > 0:
1292
1340
  aspect_ratio = image.height / image.width
@@ -1328,20 +1376,34 @@ class Page:
1328
1376
  languages: Optional[List[str]] = None,
1329
1377
  min_confidence: Optional[float] = None,
1330
1378
  device: Optional[str] = None,
1379
+ resolution: Optional[int] = None,
1380
+ detect_only: bool = False,
1381
+ apply_exclusions: bool = True,
1331
1382
  ) -> "Page":
1332
1383
  """
1333
1384
  Apply OCR to THIS page and add results to page elements via PDF.apply_ocr.
1334
1385
 
1386
+ Args:
1387
+ engine: Name of the OCR engine.
1388
+ options: Engine-specific options object or dict.
1389
+ languages: List of engine-specific language codes.
1390
+ min_confidence: Minimum confidence threshold.
1391
+ device: Device to run OCR on.
1392
+ resolution: DPI resolution for rendering page image before OCR.
1393
+ apply_exclusions: If True (default), render page image for OCR
1394
+ with excluded areas masked (whited out).
1395
+
1335
1396
  Returns:
1336
1397
  List of created TextElements derived from OCR results for this page.
1337
1398
  """
1338
1399
  if not hasattr(self._parent, "apply_ocr"):
1339
1400
  logger.error(f"Page {self.number}: Parent PDF missing 'apply_ocr'. Cannot apply OCR.")
1340
- return []
1401
+ return [] # Return empty list for consistency
1341
1402
 
1342
1403
  logger.info(f"Page {self.number}: Delegating apply_ocr to PDF.apply_ocr.")
1343
1404
  try:
1344
1405
  # Delegate to parent PDF, targeting only this page's index
1406
+ # Pass all relevant parameters through, including apply_exclusions
1345
1407
  self._parent.apply_ocr(
1346
1408
  pages=[self.index],
1347
1409
  engine=engine,
@@ -1349,17 +1411,21 @@ class Page:
1349
1411
  languages=languages,
1350
1412
  min_confidence=min_confidence,
1351
1413
  device=device,
1414
+ resolution=resolution,
1415
+ detect_only=detect_only,
1416
+ apply_exclusions=apply_exclusions,
1352
1417
  )
1353
1418
  except Exception as e:
1354
1419
  logger.error(f"Page {self.number}: Error during delegated OCR call: {e}", exc_info=True)
1355
1420
  return []
1356
1421
 
1357
1422
  # Return the OCR elements specifically added to this page
1358
- # Use element manager to retrieve them
1359
1423
  ocr_elements = [el for el in self.words if getattr(el, "source", None) == "ocr"]
1360
1424
  logger.debug(
1361
1425
  f"Page {self.number}: apply_ocr completed. Found {len(ocr_elements)} OCR elements."
1362
1426
  )
1427
+ # Note: The method is typed to return Page for chaining, but the log indicates
1428
+ # finding elements. Let's stick to returning self for chaining consistency.
1363
1429
  return self
1364
1430
 
1365
1431
  def extract_ocr_elements(
@@ -1369,10 +1435,22 @@ class Page:
1369
1435
  languages: Optional[List[str]] = None,
1370
1436
  min_confidence: Optional[float] = None,
1371
1437
  device: Optional[str] = None,
1438
+ resolution: Optional[int] = None,
1372
1439
  ) -> List[TextElement]:
1373
1440
  """
1374
1441
  Extract text elements using OCR *without* adding them to the page's elements.
1375
1442
  Uses the shared OCRManager instance.
1443
+
1444
+ Args:
1445
+ engine: Name of the OCR engine.
1446
+ options: Engine-specific options object or dict.
1447
+ languages: List of engine-specific language codes.
1448
+ min_confidence: Minimum confidence threshold.
1449
+ device: Device to run OCR on.
1450
+ resolution: DPI resolution for rendering page image before OCR.
1451
+
1452
+ Returns:
1453
+ List of created TextElement objects derived from OCR results for this page.
1376
1454
  """
1377
1455
  if not self._ocr_manager:
1378
1456
  logger.error(
@@ -1381,10 +1459,14 @@ class Page:
1381
1459
  return []
1382
1460
 
1383
1461
  logger.info(f"Page {self.number}: Extracting OCR elements (extract only)...")
1462
+
1463
+ # Determine rendering resolution
1464
+ final_resolution = resolution if resolution is not None else 150 # Default to 150 DPI
1465
+ logger.debug(f" Using rendering resolution: {final_resolution} DPI")
1466
+
1384
1467
  try:
1385
- ocr_scale = getattr(self._parent, "_config", {}).get("ocr_image_scale", 2.0)
1386
- # Get base image without highlights
1387
- image = self.to_image(scale=ocr_scale, include_highlights=False)
1468
+ # Get base image without highlights using the determined resolution
1469
+ image = self.to_image(resolution=final_resolution, include_highlights=False)
1388
1470
  if not image:
1389
1471
  logger.error(f" Failed to render page {self.number} to image for OCR extraction.")
1390
1472
  return []
@@ -1393,13 +1475,16 @@ class Page:
1393
1475
  logger.error(f" Failed to render page {self.number} to image: {e}", exc_info=True)
1394
1476
  return []
1395
1477
 
1396
- manager_args = {"images": image, "options": options, "engine": engine}
1397
- if languages is not None:
1398
- manager_args["languages"] = languages
1399
- if min_confidence is not None:
1400
- manager_args["min_confidence"] = min_confidence
1401
- if device is not None:
1402
- manager_args["device"] = device
1478
+ # Prepare arguments for the OCR Manager call
1479
+ manager_args = {
1480
+ "images": image,
1481
+ "engine": engine,
1482
+ "languages": languages,
1483
+ "min_confidence": min_confidence,
1484
+ "device": device,
1485
+ "options": options
1486
+ }
1487
+ manager_args = {k: v for k, v in manager_args.items() if v is not None}
1403
1488
 
1404
1489
  logger.debug(
1405
1490
  f" Calling OCR Manager (extract only) with args: { {k:v for k,v in manager_args.items() if k != 'images'} }"
@@ -1415,7 +1500,6 @@ class Page:
1415
1500
  and isinstance(results_list[0], list)
1416
1501
  else results_list
1417
1502
  )
1418
-
1419
1503
  if not isinstance(results, list):
1420
1504
  logger.error(f" OCR Manager returned unexpected type: {type(results)}")
1421
1505
  results = []
@@ -1426,28 +1510,30 @@ class Page:
1426
1510
 
1427
1511
  # Convert results but DO NOT add to ElementManager
1428
1512
  logger.debug(f" Converting OCR results to TextElements (extract only)...")
1429
- # Use a temporary method to create elements without adding them globally
1430
1513
  temp_elements = []
1431
1514
  scale_x = self.width / image.width if image.width else 1
1432
1515
  scale_y = self.height / image.height if image.height else 1
1433
1516
  for result in results:
1434
- x0, top, x1, bottom = [float(c) for c in result["bbox"]]
1435
- elem_data = {
1436
- "text": result["text"],
1437
- "confidence": result["confidence"],
1438
- "x0": x0 * scale_x,
1439
- "top": top * scale_y,
1440
- "x1": x1 * scale_x,
1441
- "bottom": bottom * scale_y,
1442
- "width": (x1 - x0) * scale_x,
1443
- "height": (bottom - top) * scale_y,
1444
- "object_type": "text",
1445
- "source": "ocr",
1446
- "fontname": "OCR-temp",
1447
- "size": 10.0,
1448
- "page_number": self.number,
1449
- }
1450
- temp_elements.append(TextElement(elem_data, self))
1517
+ try: # Added try-except around result processing
1518
+ x0, top, x1, bottom = [float(c) for c in result["bbox"]]
1519
+ elem_data = {
1520
+ "text": result["text"],
1521
+ "confidence": result["confidence"],
1522
+ "x0": x0 * scale_x,
1523
+ "top": top * scale_y,
1524
+ "x1": x1 * scale_x,
1525
+ "bottom": bottom * scale_y,
1526
+ "width": (x1 - x0) * scale_x,
1527
+ "height": (bottom - top) * scale_y,
1528
+ "object_type": "text", # Using text for temporary elements
1529
+ "source": "ocr",
1530
+ "fontname": "OCR-extract", # Different name for clarity
1531
+ "size": 10.0,
1532
+ "page_number": self.number,
1533
+ }
1534
+ temp_elements.append(TextElement(elem_data, self))
1535
+ except (KeyError, ValueError, TypeError) as convert_err:
1536
+ logger.warning(f" Skipping invalid OCR result during conversion: {result}. Error: {convert_err}")
1451
1537
 
1452
1538
  logger.info(f" Created {len(temp_elements)} TextElements from OCR (extract only).")
1453
1539
  return temp_elements
@@ -1914,7 +2000,7 @@ class Page:
1914
2000
  Requires optional dependencies. Install with: pip install "natural-pdf[ocr-save]"
1915
2001
 
1916
2002
  Note: OCR must have been applied to the pages beforehand
1917
- (e.g., using pdf.apply_ocr()).
2003
+ (e.g., pdf.apply_ocr()).
1918
2004
 
1919
2005
  Args:
1920
2006
  output_path: Path to save the searchable PDF.
@@ -1929,3 +2015,46 @@ class Page:
1929
2015
 
1930
2016
  create_searchable_pdf(self, output_path_str, dpi=dpi, **kwargs)
1931
2017
  logger.info(f"Searchable PDF saved to: {output_path_str}")
2018
+
2019
+ # --- Added correct_ocr method ---
2020
+ def correct_ocr(
2021
+ self,
2022
+ correction_callback: Callable[[Any], Optional[str]],
2023
+ ) -> "Page": # Return self for chaining
2024
+ """
2025
+ Applies corrections to OCR-generated text elements on this page
2026
+ using a user-provided callback function.
2027
+
2028
+ Finds text elements on this page whose 'source' attribute starts
2029
+ with 'ocr' and calls the `correction_callback` for each, passing the
2030
+ element itself.
2031
+
2032
+ The `correction_callback` should contain the logic to:
2033
+ 1. Determine if the element needs correction.
2034
+ 2. Perform the correction (e.g., call an LLM).
2035
+ 3. Return the new text (`str`) or `None`.
2036
+
2037
+ If the callback returns a string, the element's `.text` is updated.
2038
+ Metadata updates (source, confidence, etc.) should happen within the callback.
2039
+
2040
+ Args:
2041
+ correction_callback: A function accepting an element and returning
2042
+ `Optional[str]` (new text or None).
2043
+
2044
+ Returns:
2045
+ Self for method chaining.
2046
+ """
2047
+ logger.info(f"Page {self.number}: Starting OCR correction process using callback '{correction_callback.__name__}'")
2048
+
2049
+ # Find OCR elements specifically on this page
2050
+ # Note: We typically want to correct even if the element falls in an excluded area
2051
+ target_elements = self.find_all(selector="text[source^=ocr]", apply_exclusions=False)
2052
+
2053
+ # Delegate to the utility function
2054
+ _apply_ocr_correction_to_elements(
2055
+ elements=target_elements, # Pass the ElementCollection directly
2056
+ correction_callback=correction_callback,
2057
+ caller_info=f"Page({self.number})", # Pass caller info
2058
+ )
2059
+
2060
+ return self # Return self for chaining
natural_pdf/core/pdf.py CHANGED
@@ -17,6 +17,8 @@ from typing import ( # Added Iterable and TYPE_CHECKING
17
17
  Type,
18
18
  Union,
19
19
  )
20
+ from pathlib import Path
21
+
20
22
 
21
23
  import pdfplumber
22
24
  from PIL import Image
@@ -235,11 +237,16 @@ class PDF:
235
237
  self,
236
238
  pages: Optional[Union[Iterable[int], range, slice]] = None,
237
239
  engine: Optional[str] = None,
238
- options: Optional["OCROptions"] = None,
240
+ # --- Common OCR Parameters (Direct Arguments) ---
239
241
  languages: Optional[List[str]] = None,
240
- min_confidence: Optional[float] = None,
242
+ min_confidence: Optional[float] = None, # Min confidence threshold
241
243
  device: Optional[str] = None,
242
- # Add other simple mode args if needed
244
+ resolution: Optional[int] = None, # DPI for rendering before OCR
245
+ apply_exclusions: bool = True, # New parameter
246
+ detect_only: bool = False,
247
+ # --- Engine-Specific Options --- Use 'options=' for this
248
+ options: Optional[Any] = None, # e.g., EasyOCROptions(...), PaddleOCROptions(...), or dict
249
+ # **kwargs: Optional[Dict[str, Any]] = None # Allow potential extra args?
243
250
  ) -> "PDF":
244
251
  """
245
252
  Applies OCR to specified pages (or all pages) of the PDF using batch processing.
@@ -250,20 +257,30 @@ class PDF:
250
257
  Args:
251
258
  pages: An iterable of 0-based page indices (list, range, tuple),
252
259
  a slice object, or None to process all pages.
253
- engine: Name of the engine (e.g., 'easyocr', 'paddleocr', 'surya').
254
- Uses manager's default if None. Ignored if 'options' is provided.
255
- options: An specific Options object (e.g., EasyOCROptions) for
256
- advanced configuration. Overrides simple arguments.
257
- languages: List of language codes for simple mode.
258
- min_confidence: Minimum confidence threshold for simple mode.
259
- device: Device string ('cpu', 'cuda', etc.) for simple mode.
260
+ engine: Name of the OCR engine (e.g., 'easyocr', 'paddleocr', 'surya').
261
+ Uses manager's default ('easyocr') if None.
262
+ languages: List of language codes (e.g., ['en', 'fr'], ['en', 'ch_sim']).
263
+ **Must be codes understood by the specific selected engine.**
264
+ No mapping is performed. Overrides manager/engine default.
265
+ min_confidence: Minimum confidence threshold for detected text (0.0 to 1.0).
266
+ Overrides manager/engine default.
267
+ device: Device to run OCR on (e.g., 'cpu', 'cuda', 'mps').
268
+ Overrides manager/engine default.
269
+ resolution: DPI resolution to render page images before OCR (e.g., 150, 300).
270
+ Affects input quality for OCR. Defaults to 150 if not set.
271
+ apply_exclusions: If True (default), render page image for OCR with
272
+ excluded areas masked (whited out). If False, OCR
273
+ the raw page image without masking exclusions.
274
+ detect_only: If True, only detect text bounding boxes, don't perform OCR.
275
+ options: An engine-specific options object (e.g., EasyOCROptions) or dict
276
+ containing parameters specific to the chosen engine.
260
277
 
261
278
  Returns:
262
279
  Self for method chaining.
263
280
 
264
281
  Raises:
265
- ValueError: If page indices are invalid or the engine name is invalid.
266
- TypeError: If unexpected keyword arguments are provided in simple mode.
282
+ ValueError: If page indices are invalid.
283
+ TypeError: If 'options' is not compatible with the engine.
267
284
  RuntimeError: If the OCRManager or selected engine is not available.
268
285
  """
269
286
  if not self._ocr_manager:
@@ -271,7 +288,7 @@ class PDF:
271
288
  # Or raise RuntimeError("OCRManager not initialized.")
272
289
  return self
273
290
 
274
- # --- Determine Target Pages ---
291
+ # --- Determine Target Pages (unchanged) ---
275
292
  target_pages: List[Page] = []
276
293
  if pages is None:
277
294
  target_pages = self._pages
@@ -295,44 +312,63 @@ class PDF:
295
312
 
296
313
  page_numbers = [p.number for p in target_pages]
297
314
  logger.info(f"Applying batch OCR to pages: {page_numbers}...")
315
+ # --- Determine Rendering Resolution ---
316
+ # Priority: 1. direct `resolution` arg, 2. PDF config, 3. default 150
317
+ final_resolution = resolution # Use direct arg if provided
318
+ if final_resolution is None:
319
+ final_resolution = getattr(self, "_config", {}).get("resolution", 150)
320
+
321
+ logger.debug(f"Using OCR image rendering resolution: {final_resolution} DPI")
298
322
 
299
323
  # --- Render Images for Batch ---
300
324
  images_pil: List[Image.Image] = []
301
325
  page_image_map: List[Tuple[Page, Image.Image]] = [] # Store page and its image
302
- logger.info(f"Rendering {len(target_pages)} pages to images...")
326
+ logger.info(f"Rendering {len(target_pages)} pages to images at {final_resolution} DPI (apply_exclusions={apply_exclusions})...")
303
327
  failed_page_num = "unknown" # Keep track of potentially failing page
304
328
  try:
305
- ocr_scale = getattr(self, "_config", {}).get("ocr_image_scale", 2.0)
306
329
  for i, page in enumerate(target_pages):
307
330
  failed_page_num = page.number # Update current page number in case of error
308
331
  logger.debug(f" Rendering page {page.number} (index {page.index})...")
309
- # Use page.to_image but ensure highlights are off for OCR base image
310
- img = page.to_image(scale=ocr_scale, include_highlights=False)
332
+ # Use the determined final_resolution and apply exclusions if requested
333
+ to_image_kwargs = {
334
+ "resolution": final_resolution,
335
+ "include_highlights": False,
336
+ "exclusions": "mask" if apply_exclusions else None,
337
+ }
338
+ img = page.to_image(**to_image_kwargs)
339
+ if img is None:
340
+ logger.error(f" Failed to render page {page.number} to image.")
341
+ # Decide how to handle: skip page, raise error? For now, skip.
342
+ continue # Skip this page if rendering failed
311
343
  images_pil.append(img)
312
344
  page_image_map.append((page, img)) # Store pair
313
345
  except Exception as e:
314
346
  logger.error(f"Failed to render one or more pages for batch OCR: {e}", exc_info=True)
315
347
  raise RuntimeError(f"Failed to render page {failed_page_num} for OCR.") from e
316
348
 
317
- if not images_pil:
349
+ if not images_pil or not page_image_map:
318
350
  logger.error("No images were successfully rendered for batch OCR.")
319
351
  return self
320
352
 
321
353
  # --- Prepare Arguments for Manager ---
322
- manager_args = {"images": images_pil, "options": options, "engine": engine}
323
- simple_args = {}
324
- if languages is not None:
325
- simple_args["languages"] = languages
326
- if min_confidence is not None:
327
- simple_args["min_confidence"] = min_confidence
328
- if device is not None:
329
- simple_args["device"] = device
330
- manager_args.update(simple_args) # Add simple args if options not provided
354
+ # Pass common args directly, engine-specific via options
355
+ manager_args = {
356
+ "images": images_pil,
357
+ "engine": engine,
358
+ "languages": languages,
359
+ "min_confidence": min_confidence, # Use the renamed parameter
360
+ "device": device,
361
+ "options": options,
362
+ "detect_only": detect_only,
363
+ # Note: resolution is used for rendering, not passed to OCR manager directly
364
+ }
365
+ # Filter out None values so manager can use its defaults
366
+ manager_args = {k: v for k, v in manager_args.items() if v is not None}
331
367
 
332
368
  # --- Call OCR Manager for Batch Processing ---
333
- logger.info(f"Calling OCR Manager for batch processing {len(images_pil)} images...")
369
+ logger.info(f"Calling OCR Manager with args: { {k:v for k,v in manager_args.items() if k!='images'} } ...")
334
370
  try:
335
- # The manager's apply_ocr handles the batch input and returns List[List[Dict]]
371
+ # Manager's apply_ocr signature needs to accept common args directly
336
372
  batch_results = self._ocr_manager.apply_ocr(**manager_args)
337
373
 
338
374
  if not isinstance(batch_results, list) or len(batch_results) != len(images_pil):
@@ -341,16 +377,15 @@ class PDF:
341
377
  f"Expected list of length {len(images_pil)}, got {type(batch_results)} "
342
378
  f"with length {len(batch_results) if isinstance(batch_results, list) else 'N/A'}."
343
379
  )
344
- # Handle error - maybe return early or try processing valid parts?
345
- return self # Return self without adding elements
380
+ return self
346
381
 
347
382
  logger.info("OCR Manager batch processing complete.")
348
383
 
349
384
  except Exception as e:
350
385
  logger.error(f"Batch OCR processing failed: {e}", exc_info=True)
351
- return self # Return self without adding elements
386
+ return self
352
387
 
353
- # --- Distribute Results and Add Elements to Pages ---
388
+ # --- Distribute Results and Add Elements to Pages (unchanged) ---
354
389
  logger.info("Adding OCR results to respective pages...")
355
390
  total_elements_added = 0
356
391
  for i, (page, img) in enumerate(page_image_map):
@@ -362,10 +397,7 @@ class PDF:
362
397
  continue
363
398
 
364
399
  logger.debug(f" Processing {len(results_for_page)} results for page {page.number}...")
365
- # Use the page's element manager to create elements from its results
366
- # Changed from page._create_text_elements_from_ocr to use element_mgr
367
400
  try:
368
- # Calculate scale factors based on rendered image vs page dims
369
401
  img_scale_x = page.width / img.width if img.width > 0 else 1
370
402
  img_scale_y = page.height / img.height if img.height > 0 else 1
371
403
  elements = page._element_mgr.create_text_elements_from_ocr(
@@ -373,7 +405,6 @@ class PDF:
373
405
  )
374
406
 
375
407
  if elements:
376
- # Note: element_mgr.create_text_elements_from_ocr already adds them
377
408
  total_elements_added += len(elements)
378
409
  logger.debug(f" Added {len(elements)} OCR TextElements to page {page.number}.")
379
410
  else:
@@ -382,7 +413,6 @@ class PDF:
382
413
  logger.error(
383
414
  f" Error adding OCR elements to page {page.number}: {e}", exc_info=True
384
415
  )
385
- # Continue to next page
386
416
 
387
417
  logger.info(
388
418
  f"Finished adding OCR results. Total elements added across {len(target_pages)} pages: {total_elements_added}"
@@ -907,6 +937,80 @@ class PDF:
907
937
  f"Search within index failed for PDF '{self.path}'. See logs for details."
908
938
  ) from e
909
939
 
940
+ def export_ocr_correction_task(self, output_zip_path: str, **kwargs):
941
+ """
942
+ Exports OCR results from this PDF into a correction task package (zip file).
943
+
944
+ Args:
945
+ output_zip_path: The path to save the output zip file.
946
+ **kwargs: Additional arguments passed to create_correction_task_package
947
+ (e.g., image_render_scale, overwrite).
948
+ """
949
+ try:
950
+ from natural_pdf.utils.packaging import create_correction_task_package
951
+ create_correction_task_package(source=self, output_zip_path=output_zip_path, **kwargs)
952
+ except ImportError:
953
+ logger.error("Failed to import 'create_correction_task_package'. Packaging utility might be missing.")
954
+ # Or raise
955
+ except Exception as e:
956
+ logger.error(f"Failed to export correction task for {self.path}: {e}", exc_info=True)
957
+ raise # Re-raise the exception from the utility function
958
+
959
+ def correct_ocr(
960
+ self,
961
+ correction_callback: Callable[[Any], Optional[str]],
962
+ pages: Optional[Union[Iterable[int], range, slice]] = None,
963
+ ) -> "PDF": # Return self for chaining
964
+ """
965
+ Applies corrections to OCR-generated text elements using a callback function,
966
+ delegating the core work to the `Page.correct_ocr` method.
967
+
968
+ Args:
969
+ correction_callback: A function that accepts a single argument (an element
970
+ object) and returns `Optional[str]`. It returns the
971
+ corrected text string if an update is needed, otherwise None.
972
+ pages: Optional page indices/slice to limit the scope of correction
973
+ (default: all pages).
974
+
975
+ Returns:
976
+ Self for method chaining.
977
+ """
978
+ # Determine target pages
979
+ target_page_indices: List[int] = []
980
+ if pages is None:
981
+ target_page_indices = list(range(len(self._pages)))
982
+ elif isinstance(pages, slice):
983
+ target_page_indices = list(range(*pages.indices(len(self._pages))))
984
+ elif hasattr(pages, "__iter__"):
985
+ try:
986
+ target_page_indices = [int(i) for i in pages]
987
+ # Validate indices
988
+ for idx in target_page_indices:
989
+ if not (0 <= idx < len(self._pages)):
990
+ raise IndexError(f"Page index {idx} out of range (0-{len(self._pages)-1}).")
991
+ except (IndexError, TypeError, ValueError) as e:
992
+ raise ValueError(f"Invalid page index or type provided in 'pages': {pages}. Error: {e}") from e
993
+ else:
994
+ raise TypeError("'pages' must be None, a slice, or an iterable of page indices (int).")
995
+
996
+ if not target_page_indices:
997
+ logger.warning("No pages selected for OCR correction.")
998
+ return self
999
+
1000
+ logger.info(f"Starting OCR correction process via Page delegation for pages: {target_page_indices}")
1001
+
1002
+ # Iterate through target pages and call their correct_ocr method
1003
+ for page_idx in target_page_indices:
1004
+ page = self._pages[page_idx]
1005
+ try:
1006
+ page.correct_ocr(correction_callback=correction_callback)
1007
+ except Exception as e:
1008
+ logger.error(f"Error during correct_ocr on page {page_idx}: {e}", exc_info=True)
1009
+ # Optionally re-raise or just log and continue
1010
+
1011
+ logger.info(f"OCR correction process finished for requested pages.")
1012
+ return self
1013
+
910
1014
  def __len__(self) -> int:
911
1015
  """Return the number of pages in the PDF."""
912
1016
  # Ensure _pages is initialized
@@ -968,6 +1072,7 @@ class PDF:
968
1072
  self.close()
969
1073
 
970
1074
 
971
- # --- Added TYPE_CHECKING import (if not already present) ---
972
- if TYPE_CHECKING:
973
- from pathlib import Path # Assuming Path is used for type hint
1075
+ # --- Indexable Protocol Methods --- Needed for search/sync
1076
+ def get_id(self) -> str:
1077
+ return self.path
1078
+