natural-pdf 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. docs/finetuning/index.md +176 -0
  2. docs/ocr/index.md +34 -47
  3. docs/tutorials/01-loading-and-extraction.ipynb +34 -1536
  4. docs/tutorials/02-finding-elements.ipynb +42 -42
  5. docs/tutorials/03-extracting-blocks.ipynb +17 -17
  6. docs/tutorials/04-table-extraction.ipynb +12 -12
  7. docs/tutorials/05-excluding-content.ipynb +30 -30
  8. docs/tutorials/06-document-qa.ipynb +28 -28
  9. docs/tutorials/07-layout-analysis.ipynb +63 -35
  10. docs/tutorials/07-working-with-regions.ipynb +55 -51
  11. docs/tutorials/07-working-with-regions.md +2 -2
  12. docs/tutorials/08-spatial-navigation.ipynb +60 -60
  13. docs/tutorials/09-section-extraction.ipynb +113 -113
  14. docs/tutorials/10-form-field-extraction.ipynb +78 -50
  15. docs/tutorials/11-enhanced-table-processing.ipynb +6 -6
  16. docs/tutorials/12-ocr-integration.ipynb +149 -131
  17. docs/tutorials/12-ocr-integration.md +0 -13
  18. docs/tutorials/13-semantic-search.ipynb +313 -873
  19. natural_pdf/__init__.py +21 -22
  20. natural_pdf/analyzers/layout/gemini.py +280 -0
  21. natural_pdf/analyzers/layout/layout_manager.py +28 -1
  22. natural_pdf/analyzers/layout/layout_options.py +11 -0
  23. natural_pdf/analyzers/layout/yolo.py +6 -2
  24. natural_pdf/collections/pdf_collection.py +24 -0
  25. natural_pdf/core/element_manager.py +18 -13
  26. natural_pdf/core/page.py +174 -36
  27. natural_pdf/core/pdf.py +156 -42
  28. natural_pdf/elements/base.py +9 -17
  29. natural_pdf/elements/collections.py +99 -38
  30. natural_pdf/elements/region.py +77 -37
  31. natural_pdf/elements/text.py +5 -0
  32. natural_pdf/exporters/__init__.py +4 -0
  33. natural_pdf/exporters/base.py +61 -0
  34. natural_pdf/exporters/paddleocr.py +345 -0
  35. natural_pdf/ocr/__init__.py +57 -36
  36. natural_pdf/ocr/engine.py +160 -49
  37. natural_pdf/ocr/engine_easyocr.py +178 -157
  38. natural_pdf/ocr/engine_paddle.py +114 -189
  39. natural_pdf/ocr/engine_surya.py +87 -144
  40. natural_pdf/ocr/ocr_factory.py +125 -0
  41. natural_pdf/ocr/ocr_manager.py +65 -89
  42. natural_pdf/ocr/ocr_options.py +8 -13
  43. natural_pdf/ocr/utils.py +113 -0
  44. natural_pdf/templates/finetune/fine_tune_paddleocr.md +415 -0
  45. natural_pdf/templates/spa/css/style.css +334 -0
  46. natural_pdf/templates/spa/index.html +31 -0
  47. natural_pdf/templates/spa/js/app.js +472 -0
  48. natural_pdf/templates/spa/words.txt +235976 -0
  49. natural_pdf/utils/debug.py +34 -0
  50. natural_pdf/utils/identifiers.py +33 -0
  51. natural_pdf/utils/packaging.py +485 -0
  52. natural_pdf/utils/text_extraction.py +44 -64
  53. natural_pdf/utils/visualization.py +1 -1
  54. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/METADATA +44 -20
  55. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/RECORD +58 -47
  56. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/WHEEL +1 -1
  57. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/top_level.txt +0 -1
  58. natural_pdf/templates/ocr_debug.html +0 -517
  59. tests/test_loading.py +0 -50
  60. tests/test_optional_deps.py +0 -298
  61. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/licenses/LICENSE +0 -0
natural_pdf/core/page.py CHANGED
@@ -10,7 +10,7 @@ from pathlib import Path
10
10
  from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
11
11
 
12
12
  import pdfplumber
13
- from PIL import Image
13
+ from PIL import Image, ImageDraw
14
14
 
15
15
  from natural_pdf.elements.collections import ElementCollection
16
16
  from natural_pdf.elements.region import Region
@@ -43,6 +43,9 @@ from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_t
43
43
  from natural_pdf.widgets import InteractiveViewerWidget
44
44
  from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, SimpleInteractiveViewerWidget
45
45
 
46
+ from natural_pdf.qa import DocumentQA, get_qa_engine
47
+ from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
48
+
46
49
  logger = logging.getLogger(__name__)
47
50
 
48
51
 
@@ -1230,6 +1233,7 @@ class Page:
1230
1233
  render_ocr: bool = False,
1231
1234
  resolution: Optional[float] = None,
1232
1235
  include_highlights: bool = True,
1236
+ exclusions: Optional[str] = None, # New parameter
1233
1237
  **kwargs,
1234
1238
  ) -> Optional[Image.Image]:
1235
1239
  """
@@ -1244,27 +1248,29 @@ class Page:
1244
1248
  render_ocr: Whether to render OCR text on highlights.
1245
1249
  resolution: Resolution in DPI for base page image (default: scale * 72).
1246
1250
  include_highlights: Whether to render highlights.
1251
+ exclusions: If 'mask', excluded regions will be whited out on the image.
1252
+ (default: None).
1247
1253
  **kwargs: Additional parameters for pdfplumber.to_image.
1248
1254
 
1249
1255
  Returns:
1250
1256
  PIL Image of the page, or None if rendering fails.
1251
1257
  """
1252
1258
  image = None
1259
+ render_resolution = resolution if resolution is not None else scale * 72
1253
1260
  try:
1254
1261
  if include_highlights:
1255
1262
  # Delegate rendering to the central service
1256
1263
  image = self._highlighter.render_page(
1257
1264
  page_index=self.index,
1258
- scale=scale,
1265
+ scale=scale, # Note: scale is used by highlighter internally for drawing
1259
1266
  labels=labels,
1260
1267
  legend_position=legend_position,
1261
1268
  render_ocr=render_ocr,
1262
- resolution=resolution,
1269
+ resolution=render_resolution, # Pass the calculated resolution
1263
1270
  **kwargs,
1264
1271
  )
1265
1272
  else:
1266
1273
  # Get the base page image directly from pdfplumber if no highlights needed
1267
- render_resolution = resolution if resolution is not None else scale * 72
1268
1274
  # Use the underlying pdfplumber page object
1269
1275
  img_object = self._page.to_image(resolution=render_resolution, **kwargs)
1270
1276
  # Access the PIL image directly (assuming pdfplumber structure)
@@ -1287,6 +1293,53 @@ class Page:
1287
1293
  if image is None:
1288
1294
  return None
1289
1295
 
1296
+ # --- Apply exclusion masking if requested ---
1297
+ if exclusions == "mask" and self._exclusions:
1298
+ try:
1299
+ # Ensure image is mutable (RGB or RGBA)
1300
+ if image.mode not in ("RGB", "RGBA"):
1301
+ image = image.convert("RGB")
1302
+
1303
+ exclusion_regions = self._get_exclusion_regions(include_callable=True, debug=False)
1304
+ if exclusion_regions:
1305
+ draw = ImageDraw.Draw(image)
1306
+ # Calculate the scaling factor used for the image
1307
+ # Base image was rendered at render_resolution (DPI)
1308
+ # pdfplumber default is 72 DPI
1309
+ # Scale factor = (pixels / inch) / (points / inch) = DPI / 72
1310
+ img_scale = render_resolution / 72.0
1311
+
1312
+ for region in exclusion_regions:
1313
+ # Convert PDF points (x0, top, x1, bottom) to image pixels
1314
+ img_x0 = region.x0 * img_scale
1315
+ img_top = region.top * img_scale
1316
+ img_x1 = region.x1 * img_scale
1317
+ img_bottom = region.bottom * img_scale
1318
+
1319
+ # Draw a white rectangle over the excluded area
1320
+ # Ensure coordinates are within image bounds (though region should be)
1321
+ img_coords = (
1322
+ max(0, img_x0),
1323
+ max(0, img_top),
1324
+ min(image.width, img_x1),
1325
+ min(image.height, img_bottom),
1326
+ )
1327
+ if img_coords[0] < img_coords[2] and img_coords[1] < img_coords[3]:
1328
+ draw.rectangle(img_coords, fill="white")
1329
+ else:
1330
+ logger.warning(
1331
+ f"Skipping invalid exclusion rect for masking: {img_coords}"
1332
+ )
1333
+
1334
+ del draw # Release drawing context
1335
+ except Exception as mask_error:
1336
+ logger.error(
1337
+ f"Error applying exclusion mask to page {self.index}: {mask_error}",
1338
+ exc_info=True,
1339
+ )
1340
+ # Decide if you want to return None or continue without mask
1341
+ # For now, continue without mask
1342
+
1290
1343
  # Resize the final image if width is provided
1291
1344
  if width is not None and width > 0 and image.width > 0:
1292
1345
  aspect_ratio = image.height / image.width
@@ -1328,20 +1381,34 @@ class Page:
1328
1381
  languages: Optional[List[str]] = None,
1329
1382
  min_confidence: Optional[float] = None,
1330
1383
  device: Optional[str] = None,
1384
+ resolution: Optional[int] = None,
1385
+ detect_only: bool = False,
1386
+ apply_exclusions: bool = True,
1331
1387
  ) -> "Page":
1332
1388
  """
1333
1389
  Apply OCR to THIS page and add results to page elements via PDF.apply_ocr.
1334
1390
 
1391
+ Args:
1392
+ engine: Name of the OCR engine.
1393
+ options: Engine-specific options object or dict.
1394
+ languages: List of engine-specific language codes.
1395
+ min_confidence: Minimum confidence threshold.
1396
+ device: Device to run OCR on.
1397
+ resolution: DPI resolution for rendering page image before OCR.
1398
+ apply_exclusions: If True (default), render page image for OCR
1399
+ with excluded areas masked (whited out).
1400
+
1335
1401
  Returns:
1336
1402
  List of created TextElements derived from OCR results for this page.
1337
1403
  """
1338
1404
  if not hasattr(self._parent, "apply_ocr"):
1339
1405
  logger.error(f"Page {self.number}: Parent PDF missing 'apply_ocr'. Cannot apply OCR.")
1340
- return []
1406
+ return [] # Return empty list for consistency
1341
1407
 
1342
1408
  logger.info(f"Page {self.number}: Delegating apply_ocr to PDF.apply_ocr.")
1343
1409
  try:
1344
1410
  # Delegate to parent PDF, targeting only this page's index
1411
+ # Pass all relevant parameters through, including apply_exclusions
1345
1412
  self._parent.apply_ocr(
1346
1413
  pages=[self.index],
1347
1414
  engine=engine,
@@ -1349,17 +1416,21 @@ class Page:
1349
1416
  languages=languages,
1350
1417
  min_confidence=min_confidence,
1351
1418
  device=device,
1419
+ resolution=resolution,
1420
+ detect_only=detect_only,
1421
+ apply_exclusions=apply_exclusions,
1352
1422
  )
1353
1423
  except Exception as e:
1354
1424
  logger.error(f"Page {self.number}: Error during delegated OCR call: {e}", exc_info=True)
1355
1425
  return []
1356
1426
 
1357
1427
  # Return the OCR elements specifically added to this page
1358
- # Use element manager to retrieve them
1359
1428
  ocr_elements = [el for el in self.words if getattr(el, "source", None) == "ocr"]
1360
1429
  logger.debug(
1361
1430
  f"Page {self.number}: apply_ocr completed. Found {len(ocr_elements)} OCR elements."
1362
1431
  )
1432
+ # Note: The method is typed to return Page for chaining, but the log indicates
1433
+ # finding elements. Let's stick to returning self for chaining consistency.
1363
1434
  return self
1364
1435
 
1365
1436
  def extract_ocr_elements(
@@ -1369,10 +1440,22 @@ class Page:
1369
1440
  languages: Optional[List[str]] = None,
1370
1441
  min_confidence: Optional[float] = None,
1371
1442
  device: Optional[str] = None,
1443
+ resolution: Optional[int] = None,
1372
1444
  ) -> List[TextElement]:
1373
1445
  """
1374
1446
  Extract text elements using OCR *without* adding them to the page's elements.
1375
1447
  Uses the shared OCRManager instance.
1448
+
1449
+ Args:
1450
+ engine: Name of the OCR engine.
1451
+ options: Engine-specific options object or dict.
1452
+ languages: List of engine-specific language codes.
1453
+ min_confidence: Minimum confidence threshold.
1454
+ device: Device to run OCR on.
1455
+ resolution: DPI resolution for rendering page image before OCR.
1456
+
1457
+ Returns:
1458
+ List of created TextElement objects derived from OCR results for this page.
1376
1459
  """
1377
1460
  if not self._ocr_manager:
1378
1461
  logger.error(
@@ -1381,10 +1464,14 @@ class Page:
1381
1464
  return []
1382
1465
 
1383
1466
  logger.info(f"Page {self.number}: Extracting OCR elements (extract only)...")
1467
+
1468
+ # Determine rendering resolution
1469
+ final_resolution = resolution if resolution is not None else 150 # Default to 150 DPI
1470
+ logger.debug(f" Using rendering resolution: {final_resolution} DPI")
1471
+
1384
1472
  try:
1385
- ocr_scale = getattr(self._parent, "_config", {}).get("ocr_image_scale", 2.0)
1386
- # Get base image without highlights
1387
- image = self.to_image(scale=ocr_scale, include_highlights=False)
1473
+ # Get base image without highlights using the determined resolution
1474
+ image = self.to_image(resolution=final_resolution, include_highlights=False)
1388
1475
  if not image:
1389
1476
  logger.error(f" Failed to render page {self.number} to image for OCR extraction.")
1390
1477
  return []
@@ -1393,13 +1480,16 @@ class Page:
1393
1480
  logger.error(f" Failed to render page {self.number} to image: {e}", exc_info=True)
1394
1481
  return []
1395
1482
 
1396
- manager_args = {"images": image, "options": options, "engine": engine}
1397
- if languages is not None:
1398
- manager_args["languages"] = languages
1399
- if min_confidence is not None:
1400
- manager_args["min_confidence"] = min_confidence
1401
- if device is not None:
1402
- manager_args["device"] = device
1483
+ # Prepare arguments for the OCR Manager call
1484
+ manager_args = {
1485
+ "images": image,
1486
+ "engine": engine,
1487
+ "languages": languages,
1488
+ "min_confidence": min_confidence,
1489
+ "device": device,
1490
+ "options": options,
1491
+ }
1492
+ manager_args = {k: v for k, v in manager_args.items() if v is not None}
1403
1493
 
1404
1494
  logger.debug(
1405
1495
  f" Calling OCR Manager (extract only) with args: { {k:v for k,v in manager_args.items() if k != 'images'} }"
@@ -1415,7 +1505,6 @@ class Page:
1415
1505
  and isinstance(results_list[0], list)
1416
1506
  else results_list
1417
1507
  )
1418
-
1419
1508
  if not isinstance(results, list):
1420
1509
  logger.error(f" OCR Manager returned unexpected type: {type(results)}")
1421
1510
  results = []
@@ -1426,28 +1515,32 @@ class Page:
1426
1515
 
1427
1516
  # Convert results but DO NOT add to ElementManager
1428
1517
  logger.debug(f" Converting OCR results to TextElements (extract only)...")
1429
- # Use a temporary method to create elements without adding them globally
1430
1518
  temp_elements = []
1431
1519
  scale_x = self.width / image.width if image.width else 1
1432
1520
  scale_y = self.height / image.height if image.height else 1
1433
1521
  for result in results:
1434
- x0, top, x1, bottom = [float(c) for c in result["bbox"]]
1435
- elem_data = {
1436
- "text": result["text"],
1437
- "confidence": result["confidence"],
1438
- "x0": x0 * scale_x,
1439
- "top": top * scale_y,
1440
- "x1": x1 * scale_x,
1441
- "bottom": bottom * scale_y,
1442
- "width": (x1 - x0) * scale_x,
1443
- "height": (bottom - top) * scale_y,
1444
- "object_type": "text",
1445
- "source": "ocr",
1446
- "fontname": "OCR-temp",
1447
- "size": 10.0,
1448
- "page_number": self.number,
1449
- }
1450
- temp_elements.append(TextElement(elem_data, self))
1522
+ try: # Added try-except around result processing
1523
+ x0, top, x1, bottom = [float(c) for c in result["bbox"]]
1524
+ elem_data = {
1525
+ "text": result["text"],
1526
+ "confidence": result["confidence"],
1527
+ "x0": x0 * scale_x,
1528
+ "top": top * scale_y,
1529
+ "x1": x1 * scale_x,
1530
+ "bottom": bottom * scale_y,
1531
+ "width": (x1 - x0) * scale_x,
1532
+ "height": (bottom - top) * scale_y,
1533
+ "object_type": "text", # Using text for temporary elements
1534
+ "source": "ocr",
1535
+ "fontname": "OCR-extract", # Different name for clarity
1536
+ "size": 10.0,
1537
+ "page_number": self.number,
1538
+ }
1539
+ temp_elements.append(TextElement(elem_data, self))
1540
+ except (KeyError, ValueError, TypeError) as convert_err:
1541
+ logger.warning(
1542
+ f" Skipping invalid OCR result during conversion: {result}. Error: {convert_err}"
1543
+ )
1451
1544
 
1452
1545
  logger.info(f" Created {len(temp_elements)} TextElements from OCR (extract only).")
1453
1546
  return temp_elements
@@ -1914,7 +2007,7 @@ class Page:
1914
2007
  Requires optional dependencies. Install with: pip install "natural-pdf[ocr-save]"
1915
2008
 
1916
2009
  Note: OCR must have been applied to the pages beforehand
1917
- (e.g., using pdf.apply_ocr()).
2010
+ (e.g., pdf.apply_ocr()).
1918
2011
 
1919
2012
  Args:
1920
2013
  output_path: Path to save the searchable PDF.
@@ -1929,3 +2022,48 @@ class Page:
1929
2022
 
1930
2023
  create_searchable_pdf(self, output_path_str, dpi=dpi, **kwargs)
1931
2024
  logger.info(f"Searchable PDF saved to: {output_path_str}")
2025
+
2026
+ # --- Added correct_ocr method ---
2027
+ def correct_ocr(
2028
+ self,
2029
+ correction_callback: Callable[[Any], Optional[str]],
2030
+ ) -> "Page": # Return self for chaining
2031
+ """
2032
+ Applies corrections to OCR-generated text elements on this page
2033
+ using a user-provided callback function.
2034
+
2035
+ Finds text elements on this page whose 'source' attribute starts
2036
+ with 'ocr' and calls the `correction_callback` for each, passing the
2037
+ element itself.
2038
+
2039
+ The `correction_callback` should contain the logic to:
2040
+ 1. Determine if the element needs correction.
2041
+ 2. Perform the correction (e.g., call an LLM).
2042
+ 3. Return the new text (`str`) or `None`.
2043
+
2044
+ If the callback returns a string, the element's `.text` is updated.
2045
+ Metadata updates (source, confidence, etc.) should happen within the callback.
2046
+
2047
+ Args:
2048
+ correction_callback: A function accepting an element and returning
2049
+ `Optional[str]` (new text or None).
2050
+
2051
+ Returns:
2052
+ Self for method chaining.
2053
+ """
2054
+ logger.info(
2055
+ f"Page {self.number}: Starting OCR correction process using callback '{correction_callback.__name__}'"
2056
+ )
2057
+
2058
+ # Find OCR elements specifically on this page
2059
+ # Note: We typically want to correct even if the element falls in an excluded area
2060
+ target_elements = self.find_all(selector="text[source^=ocr]", apply_exclusions=False)
2061
+
2062
+ # Delegate to the utility function
2063
+ _apply_ocr_correction_to_elements(
2064
+ elements=target_elements, # Pass the ElementCollection directly
2065
+ correction_callback=correction_callback,
2066
+ caller_info=f"Page({self.number})", # Pass caller info
2067
+ )
2068
+
2069
+ return self # Return self for chaining
natural_pdf/core/pdf.py CHANGED
@@ -17,6 +17,8 @@ from typing import ( # Added Iterable and TYPE_CHECKING
17
17
  Type,
18
18
  Union,
19
19
  )
20
+ from pathlib import Path
21
+
20
22
 
21
23
  import pdfplumber
22
24
  from PIL import Image
@@ -235,11 +237,16 @@ class PDF:
235
237
  self,
236
238
  pages: Optional[Union[Iterable[int], range, slice]] = None,
237
239
  engine: Optional[str] = None,
238
- options: Optional["OCROptions"] = None,
240
+ # --- Common OCR Parameters (Direct Arguments) ---
239
241
  languages: Optional[List[str]] = None,
240
- min_confidence: Optional[float] = None,
242
+ min_confidence: Optional[float] = None, # Min confidence threshold
241
243
  device: Optional[str] = None,
242
- # Add other simple mode args if needed
244
+ resolution: Optional[int] = None, # DPI for rendering before OCR
245
+ apply_exclusions: bool = True, # New parameter
246
+ detect_only: bool = False,
247
+ # --- Engine-Specific Options --- Use 'options=' for this
248
+ options: Optional[Any] = None, # e.g., EasyOCROptions(...), PaddleOCROptions(...), or dict
249
+ # **kwargs: Optional[Dict[str, Any]] = None # Allow potential extra args?
243
250
  ) -> "PDF":
244
251
  """
245
252
  Applies OCR to specified pages (or all pages) of the PDF using batch processing.
@@ -250,20 +257,30 @@ class PDF:
250
257
  Args:
251
258
  pages: An iterable of 0-based page indices (list, range, tuple),
252
259
  a slice object, or None to process all pages.
253
- engine: Name of the engine (e.g., 'easyocr', 'paddleocr', 'surya').
254
- Uses manager's default if None. Ignored if 'options' is provided.
255
- options: An specific Options object (e.g., EasyOCROptions) for
256
- advanced configuration. Overrides simple arguments.
257
- languages: List of language codes for simple mode.
258
- min_confidence: Minimum confidence threshold for simple mode.
259
- device: Device string ('cpu', 'cuda', etc.) for simple mode.
260
+ engine: Name of the OCR engine (e.g., 'easyocr', 'paddleocr', 'surya').
261
+ Uses manager's default ('easyocr') if None.
262
+ languages: List of language codes (e.g., ['en', 'fr'], ['en', 'ch_sim']).
263
+ **Must be codes understood by the specific selected engine.**
264
+ No mapping is performed. Overrides manager/engine default.
265
+ min_confidence: Minimum confidence threshold for detected text (0.0 to 1.0).
266
+ Overrides manager/engine default.
267
+ device: Device to run OCR on (e.g., 'cpu', 'cuda', 'mps').
268
+ Overrides manager/engine default.
269
+ resolution: DPI resolution to render page images before OCR (e.g., 150, 300).
270
+ Affects input quality for OCR. Defaults to 150 if not set.
271
+ apply_exclusions: If True (default), render page image for OCR with
272
+ excluded areas masked (whited out). If False, OCR
273
+ the raw page image without masking exclusions.
274
+ detect_only: If True, only detect text bounding boxes, don't perform OCR.
275
+ options: An engine-specific options object (e.g., EasyOCROptions) or dict
276
+ containing parameters specific to the chosen engine.
260
277
 
261
278
  Returns:
262
279
  Self for method chaining.
263
280
 
264
281
  Raises:
265
- ValueError: If page indices are invalid or the engine name is invalid.
266
- TypeError: If unexpected keyword arguments are provided in simple mode.
282
+ ValueError: If page indices are invalid.
283
+ TypeError: If 'options' is not compatible with the engine.
267
284
  RuntimeError: If the OCRManager or selected engine is not available.
268
285
  """
269
286
  if not self._ocr_manager:
@@ -271,7 +288,7 @@ class PDF:
271
288
  # Or raise RuntimeError("OCRManager not initialized.")
272
289
  return self
273
290
 
274
- # --- Determine Target Pages ---
291
+ # --- Determine Target Pages (unchanged) ---
275
292
  target_pages: List[Page] = []
276
293
  if pages is None:
277
294
  target_pages = self._pages
@@ -295,44 +312,67 @@ class PDF:
295
312
 
296
313
  page_numbers = [p.number for p in target_pages]
297
314
  logger.info(f"Applying batch OCR to pages: {page_numbers}...")
315
+ # --- Determine Rendering Resolution ---
316
+ # Priority: 1. direct `resolution` arg, 2. PDF config, 3. default 150
317
+ final_resolution = resolution # Use direct arg if provided
318
+ if final_resolution is None:
319
+ final_resolution = getattr(self, "_config", {}).get("resolution", 150)
320
+
321
+ logger.debug(f"Using OCR image rendering resolution: {final_resolution} DPI")
298
322
 
299
323
  # --- Render Images for Batch ---
300
324
  images_pil: List[Image.Image] = []
301
325
  page_image_map: List[Tuple[Page, Image.Image]] = [] # Store page and its image
302
- logger.info(f"Rendering {len(target_pages)} pages to images...")
326
+ logger.info(
327
+ f"Rendering {len(target_pages)} pages to images at {final_resolution} DPI (apply_exclusions={apply_exclusions})..."
328
+ )
303
329
  failed_page_num = "unknown" # Keep track of potentially failing page
304
330
  try:
305
- ocr_scale = getattr(self, "_config", {}).get("ocr_image_scale", 2.0)
306
331
  for i, page in enumerate(target_pages):
307
332
  failed_page_num = page.number # Update current page number in case of error
308
333
  logger.debug(f" Rendering page {page.number} (index {page.index})...")
309
- # Use page.to_image but ensure highlights are off for OCR base image
310
- img = page.to_image(scale=ocr_scale, include_highlights=False)
334
+ # Use the determined final_resolution and apply exclusions if requested
335
+ to_image_kwargs = {
336
+ "resolution": final_resolution,
337
+ "include_highlights": False,
338
+ "exclusions": "mask" if apply_exclusions else None,
339
+ }
340
+ img = page.to_image(**to_image_kwargs)
341
+ if img is None:
342
+ logger.error(f" Failed to render page {page.number} to image.")
343
+ # Decide how to handle: skip page, raise error? For now, skip.
344
+ continue # Skip this page if rendering failed
311
345
  images_pil.append(img)
312
346
  page_image_map.append((page, img)) # Store pair
313
347
  except Exception as e:
314
348
  logger.error(f"Failed to render one or more pages for batch OCR: {e}", exc_info=True)
315
349
  raise RuntimeError(f"Failed to render page {failed_page_num} for OCR.") from e
316
350
 
317
- if not images_pil:
351
+ if not images_pil or not page_image_map:
318
352
  logger.error("No images were successfully rendered for batch OCR.")
319
353
  return self
320
354
 
321
355
  # --- Prepare Arguments for Manager ---
322
- manager_args = {"images": images_pil, "options": options, "engine": engine}
323
- simple_args = {}
324
- if languages is not None:
325
- simple_args["languages"] = languages
326
- if min_confidence is not None:
327
- simple_args["min_confidence"] = min_confidence
328
- if device is not None:
329
- simple_args["device"] = device
330
- manager_args.update(simple_args) # Add simple args if options not provided
356
+ # Pass common args directly, engine-specific via options
357
+ manager_args = {
358
+ "images": images_pil,
359
+ "engine": engine,
360
+ "languages": languages,
361
+ "min_confidence": min_confidence, # Use the renamed parameter
362
+ "device": device,
363
+ "options": options,
364
+ "detect_only": detect_only,
365
+ # Note: resolution is used for rendering, not passed to OCR manager directly
366
+ }
367
+ # Filter out None values so manager can use its defaults
368
+ manager_args = {k: v for k, v in manager_args.items() if v is not None}
331
369
 
332
370
  # --- Call OCR Manager for Batch Processing ---
333
- logger.info(f"Calling OCR Manager for batch processing {len(images_pil)} images...")
371
+ logger.info(
372
+ f"Calling OCR Manager with args: { {k:v for k,v in manager_args.items() if k!='images'} } ..."
373
+ )
334
374
  try:
335
- # The manager's apply_ocr handles the batch input and returns List[List[Dict]]
375
+ # Manager's apply_ocr signature needs to accept common args directly
336
376
  batch_results = self._ocr_manager.apply_ocr(**manager_args)
337
377
 
338
378
  if not isinstance(batch_results, list) or len(batch_results) != len(images_pil):
@@ -341,16 +381,15 @@ class PDF:
341
381
  f"Expected list of length {len(images_pil)}, got {type(batch_results)} "
342
382
  f"with length {len(batch_results) if isinstance(batch_results, list) else 'N/A'}."
343
383
  )
344
- # Handle error - maybe return early or try processing valid parts?
345
- return self # Return self without adding elements
384
+ return self
346
385
 
347
386
  logger.info("OCR Manager batch processing complete.")
348
387
 
349
388
  except Exception as e:
350
389
  logger.error(f"Batch OCR processing failed: {e}", exc_info=True)
351
- return self # Return self without adding elements
390
+ return self
352
391
 
353
- # --- Distribute Results and Add Elements to Pages ---
392
+ # --- Distribute Results and Add Elements to Pages (unchanged) ---
354
393
  logger.info("Adding OCR results to respective pages...")
355
394
  total_elements_added = 0
356
395
  for i, (page, img) in enumerate(page_image_map):
@@ -362,10 +401,7 @@ class PDF:
362
401
  continue
363
402
 
364
403
  logger.debug(f" Processing {len(results_for_page)} results for page {page.number}...")
365
- # Use the page's element manager to create elements from its results
366
- # Changed from page._create_text_elements_from_ocr to use element_mgr
367
404
  try:
368
- # Calculate scale factors based on rendered image vs page dims
369
405
  img_scale_x = page.width / img.width if img.width > 0 else 1
370
406
  img_scale_y = page.height / img.height if img.height > 0 else 1
371
407
  elements = page._element_mgr.create_text_elements_from_ocr(
@@ -373,7 +409,6 @@ class PDF:
373
409
  )
374
410
 
375
411
  if elements:
376
- # Note: element_mgr.create_text_elements_from_ocr already adds them
377
412
  total_elements_added += len(elements)
378
413
  logger.debug(f" Added {len(elements)} OCR TextElements to page {page.number}.")
379
414
  else:
@@ -382,7 +417,6 @@ class PDF:
382
417
  logger.error(
383
418
  f" Error adding OCR elements to page {page.number}: {e}", exc_info=True
384
419
  )
385
- # Continue to next page
386
420
 
387
421
  logger.info(
388
422
  f"Finished adding OCR results. Total elements added across {len(target_pages)} pages: {total_elements_added}"
@@ -907,6 +941,87 @@ class PDF:
907
941
  f"Search within index failed for PDF '{self.path}'. See logs for details."
908
942
  ) from e
909
943
 
944
+ def export_ocr_correction_task(self, output_zip_path: str, **kwargs):
945
+ """
946
+ Exports OCR results from this PDF into a correction task package (zip file).
947
+
948
+ Args:
949
+ output_zip_path: The path to save the output zip file.
950
+ **kwargs: Additional arguments passed to create_correction_task_package
951
+ (e.g., image_render_scale, overwrite).
952
+ """
953
+ try:
954
+ from natural_pdf.utils.packaging import create_correction_task_package
955
+
956
+ create_correction_task_package(source=self, output_zip_path=output_zip_path, **kwargs)
957
+ except ImportError:
958
+ logger.error(
959
+ "Failed to import 'create_correction_task_package'. Packaging utility might be missing."
960
+ )
961
+ # Or raise
962
+ except Exception as e:
963
+ logger.error(f"Failed to export correction task for {self.path}: {e}", exc_info=True)
964
+ raise # Re-raise the exception from the utility function
965
+
966
+ def correct_ocr(
967
+ self,
968
+ correction_callback: Callable[[Any], Optional[str]],
969
+ pages: Optional[Union[Iterable[int], range, slice]] = None,
970
+ ) -> "PDF": # Return self for chaining
971
+ """
972
+ Applies corrections to OCR-generated text elements using a callback function,
973
+ delegating the core work to the `Page.correct_ocr` method.
974
+
975
+ Args:
976
+ correction_callback: A function that accepts a single argument (an element
977
+ object) and returns `Optional[str]`. It returns the
978
+ corrected text string if an update is needed, otherwise None.
979
+ pages: Optional page indices/slice to limit the scope of correction
980
+ (default: all pages).
981
+
982
+ Returns:
983
+ Self for method chaining.
984
+ """
985
+ # Determine target pages
986
+ target_page_indices: List[int] = []
987
+ if pages is None:
988
+ target_page_indices = list(range(len(self._pages)))
989
+ elif isinstance(pages, slice):
990
+ target_page_indices = list(range(*pages.indices(len(self._pages))))
991
+ elif hasattr(pages, "__iter__"):
992
+ try:
993
+ target_page_indices = [int(i) for i in pages]
994
+ # Validate indices
995
+ for idx in target_page_indices:
996
+ if not (0 <= idx < len(self._pages)):
997
+ raise IndexError(f"Page index {idx} out of range (0-{len(self._pages)-1}).")
998
+ except (IndexError, TypeError, ValueError) as e:
999
+ raise ValueError(
1000
+ f"Invalid page index or type provided in 'pages': {pages}. Error: {e}"
1001
+ ) from e
1002
+ else:
1003
+ raise TypeError("'pages' must be None, a slice, or an iterable of page indices (int).")
1004
+
1005
+ if not target_page_indices:
1006
+ logger.warning("No pages selected for OCR correction.")
1007
+ return self
1008
+
1009
+ logger.info(
1010
+ f"Starting OCR correction process via Page delegation for pages: {target_page_indices}"
1011
+ )
1012
+
1013
+ # Iterate through target pages and call their correct_ocr method
1014
+ for page_idx in target_page_indices:
1015
+ page = self._pages[page_idx]
1016
+ try:
1017
+ page.correct_ocr(correction_callback=correction_callback)
1018
+ except Exception as e:
1019
+ logger.error(f"Error during correct_ocr on page {page_idx}: {e}", exc_info=True)
1020
+ # Optionally re-raise or just log and continue
1021
+
1022
+ logger.info(f"OCR correction process finished for requested pages.")
1023
+ return self
1024
+
910
1025
  def __len__(self) -> int:
911
1026
  """Return the number of pages in the PDF."""
912
1027
  # Ensure _pages is initialized
@@ -967,7 +1082,6 @@ class PDF:
967
1082
  """Context manager exit."""
968
1083
  self.close()
969
1084
 
970
-
971
- # --- Added TYPE_CHECKING import (if not already present) ---
972
- if TYPE_CHECKING:
973
- from pathlib import Path # Assuming Path is used for type hint
1085
+ # --- Indexable Protocol Methods --- Needed for search/sync
1086
+ def get_id(self) -> str:
1087
+ return self.path