natural-pdf 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. docs/categorizing-documents/index.md +168 -0
  2. docs/data-extraction/index.md +87 -0
  3. docs/element-selection/index.ipynb +218 -164
  4. docs/element-selection/index.md +20 -0
  5. docs/finetuning/index.md +176 -0
  6. docs/index.md +19 -0
  7. docs/ocr/index.md +63 -16
  8. docs/tutorials/01-loading-and-extraction.ipynb +411 -248
  9. docs/tutorials/02-finding-elements.ipynb +123 -46
  10. docs/tutorials/03-extracting-blocks.ipynb +24 -19
  11. docs/tutorials/04-table-extraction.ipynb +17 -12
  12. docs/tutorials/05-excluding-content.ipynb +37 -32
  13. docs/tutorials/06-document-qa.ipynb +36 -31
  14. docs/tutorials/07-layout-analysis.ipynb +45 -40
  15. docs/tutorials/07-working-with-regions.ipynb +61 -60
  16. docs/tutorials/08-spatial-navigation.ipynb +76 -71
  17. docs/tutorials/09-section-extraction.ipynb +160 -155
  18. docs/tutorials/10-form-field-extraction.ipynb +71 -66
  19. docs/tutorials/11-enhanced-table-processing.ipynb +11 -6
  20. docs/tutorials/12-ocr-integration.ipynb +3420 -312
  21. docs/tutorials/12-ocr-integration.md +68 -106
  22. docs/tutorials/13-semantic-search.ipynb +641 -251
  23. natural_pdf/__init__.py +3 -0
  24. natural_pdf/analyzers/layout/gemini.py +63 -47
  25. natural_pdf/classification/manager.py +343 -0
  26. natural_pdf/classification/mixin.py +149 -0
  27. natural_pdf/classification/results.py +62 -0
  28. natural_pdf/collections/mixins.py +63 -0
  29. natural_pdf/collections/pdf_collection.py +326 -17
  30. natural_pdf/core/element_manager.py +73 -4
  31. natural_pdf/core/page.py +255 -83
  32. natural_pdf/core/pdf.py +385 -367
  33. natural_pdf/elements/base.py +1 -3
  34. natural_pdf/elements/collections.py +279 -49
  35. natural_pdf/elements/region.py +106 -21
  36. natural_pdf/elements/text.py +5 -2
  37. natural_pdf/exporters/__init__.py +4 -0
  38. natural_pdf/exporters/base.py +61 -0
  39. natural_pdf/exporters/paddleocr.py +345 -0
  40. natural_pdf/extraction/manager.py +134 -0
  41. natural_pdf/extraction/mixin.py +246 -0
  42. natural_pdf/extraction/result.py +37 -0
  43. natural_pdf/ocr/__init__.py +16 -8
  44. natural_pdf/ocr/engine.py +46 -30
  45. natural_pdf/ocr/engine_easyocr.py +86 -42
  46. natural_pdf/ocr/engine_paddle.py +39 -28
  47. natural_pdf/ocr/engine_surya.py +32 -16
  48. natural_pdf/ocr/ocr_factory.py +34 -23
  49. natural_pdf/ocr/ocr_manager.py +98 -34
  50. natural_pdf/ocr/ocr_options.py +38 -10
  51. natural_pdf/ocr/utils.py +59 -33
  52. natural_pdf/qa/document_qa.py +0 -4
  53. natural_pdf/selectors/parser.py +363 -238
  54. natural_pdf/templates/finetune/fine_tune_paddleocr.md +420 -0
  55. natural_pdf/utils/debug.py +4 -2
  56. natural_pdf/utils/identifiers.py +9 -5
  57. natural_pdf/utils/locks.py +8 -0
  58. natural_pdf/utils/packaging.py +172 -105
  59. natural_pdf/utils/text_extraction.py +96 -65
  60. natural_pdf/utils/tqdm_utils.py +43 -0
  61. natural_pdf/utils/visualization.py +1 -1
  62. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/METADATA +10 -3
  63. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/RECORD +66 -51
  64. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/WHEEL +1 -1
  65. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/licenses/LICENSE +0 -0
  66. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/top_level.txt +0 -0
natural_pdf/core/element_manager.py CHANGED
@@ -359,8 +359,10 @@ class ElementManager:
359
359
 
360
360
  # Handle potential None confidence
361
361
  raw_confidence = result.get("confidence")
362
- confidence_value = float(raw_confidence) if raw_confidence is not None else None # Keep None if it was None
363
- ocr_text = result.get("text") # Get text, will be None if detect_only
362
+ confidence_value = (
363
+ float(raw_confidence) if raw_confidence is not None else None
364
+ ) # Keep None if it was None
365
+ ocr_text = result.get("text") # Get text, will be None if detect_only
364
366
 
365
367
  # Create the TextElement for the word
366
368
  word_element_data = {
@@ -373,7 +375,7 @@ class ElementManager:
373
375
  "height": pdf_height,
374
376
  "object_type": "word", # Treat OCR results as whole words
375
377
  "source": "ocr",
376
- "confidence": confidence_value, # Use the handled confidence
378
+ "confidence": confidence_value, # Use the handled confidence
377
379
  "fontname": "OCR", # Use consistent OCR fontname
378
380
  "size": (
379
381
  round(pdf_height) if pdf_height > 0 else 10.0
@@ -391,7 +393,7 @@ class ElementManager:
391
393
  ocr_char_dict.setdefault("adv", ocr_char_dict.get("width", 0))
392
394
 
393
395
  # Add the char dict list to the word data before creating TextElement
394
- word_element_data["_char_dicts"] = [ocr_char_dict] # Store itself as its only char
396
+ word_element_data["_char_dicts"] = [ocr_char_dict] # Store itself as its only char
395
397
 
396
398
  word_elem = TextElement(word_element_data, self._page)
397
399
  added_word_elements.append(word_elem)
@@ -537,3 +539,70 @@ class ElementManager:
537
539
  """Get all region elements."""
538
540
  self.load_elements()
539
541
  return self._elements.get("regions", [])
542
+
543
+ def remove_ocr_elements(self):
544
+ """
545
+ Remove all elements with source="ocr" from the elements dictionary.
546
+ This should be called before adding new OCR elements if replacement is desired.
547
+
548
+ Returns:
549
+ int: Number of OCR elements removed
550
+ """
551
+ # Load elements if not already loaded
552
+ self.load_elements()
553
+
554
+ removed_count = 0
555
+
556
+ # Filter out OCR elements from words
557
+ if "words" in self._elements:
558
+ original_len = len(self._elements["words"])
559
+ self._elements["words"] = [
560
+ word for word in self._elements["words"]
561
+ if getattr(word, "source", None) != "ocr"
562
+ ]
563
+ removed_count += original_len - len(self._elements["words"])
564
+
565
+ # Filter out OCR elements from chars
566
+ if "chars" in self._elements:
567
+ original_len = len(self._elements["chars"])
568
+ self._elements["chars"] = [
569
+ char for char in self._elements["chars"]
570
+ if (isinstance(char, dict) and char.get("source") != "ocr") or
571
+ (not isinstance(char, dict) and getattr(char, "source", None) != "ocr")
572
+ ]
573
+ removed_count += original_len - len(self._elements["chars"])
574
+
575
+ logger.info(f"Page {self._page.number}: Removed {removed_count} OCR elements.")
576
+ return removed_count
577
+
578
+ def remove_element(self, element, element_type="words"):
579
+ """
580
+ Remove a specific element from the managed elements.
581
+
582
+ Args:
583
+ element: The element to remove
584
+ element_type: The type of element ('words', 'chars', etc.)
585
+
586
+ Returns:
587
+ bool: True if removed successfully, False otherwise
588
+ """
589
+ # Load elements if not already loaded
590
+ self.load_elements()
591
+
592
+ # Check if the collection exists
593
+ if element_type not in self._elements:
594
+ logger.warning(f"Cannot remove element: collection '{element_type}' does not exist")
595
+ return False
596
+
597
+ # Try to remove the element
598
+ try:
599
+ if element in self._elements[element_type]:
600
+ self._elements[element_type].remove(element)
601
+ logger.debug(f"Removed element from {element_type}: {element}")
602
+ return True
603
+ else:
604
+ logger.debug(f"Element not found in {element_type}: {element}")
605
+ return False
606
+ except Exception as e:
607
+ logger.error(f"Error removing element from {element_type}: {e}", exc_info=True)
608
+ return False
natural_pdf/core/page.py CHANGED
@@ -6,14 +6,19 @@ import logging
6
6
  import os
7
7
  import re
8
8
  import tempfile
9
+ import time # Import time
9
10
  from pathlib import Path
10
11
  from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
12
+ import concurrent.futures # Added import
13
+ from tqdm.auto import tqdm # Added tqdm import
14
+ import threading
11
15
 
12
16
  import pdfplumber
13
17
  from PIL import Image, ImageDraw
14
18
 
15
19
  from natural_pdf.elements.collections import ElementCollection
16
20
  from natural_pdf.elements.region import Region
21
+ from natural_pdf.utils.locks import pdf_render_lock # Import from utils instead
17
22
 
18
23
  if TYPE_CHECKING:
19
24
  import pdfplumber
@@ -46,10 +51,20 @@ from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, SimpleInteractiveV
46
51
  from natural_pdf.qa import DocumentQA, get_qa_engine
47
52
  from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
48
53
 
54
+ # --- Classification Imports --- #
55
+ from natural_pdf.classification.mixin import ClassificationMixin
56
+ from natural_pdf.classification.manager import ClassificationManager # For type hint
57
+ # --- End Classification Imports --- #
58
+
59
+ from natural_pdf.utils.locks import pdf_render_lock # Import the lock
60
+ from natural_pdf.elements.base import Element # Import base element
61
+ from natural_pdf.classification.mixin import ClassificationMixin # Import classification mixin
62
+ from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
63
+
49
64
  logger = logging.getLogger(__name__)
50
65
 
51
66
 
52
- class Page:
67
+ class Page(ClassificationMixin, ExtractionMixin):
53
68
  """
54
69
  Enhanced Page wrapper built on top of pdfplumber.Page.
55
70
 
@@ -73,14 +88,21 @@ class Page:
73
88
  self._text_styles = None # Lazy-loaded text style analyzer results
74
89
  self._exclusions = [] # List to store exclusion functions/regions
75
90
 
91
+ # --- ADDED --- Metadata store for mixins
92
+ self.metadata: Dict[str, Any] = {}
93
+ # --- END ADDED ---
94
+
76
95
  # Region management
77
96
  self._regions = {
78
97
  "detected": [], # Layout detection results
79
98
  "named": {}, # Named regions (name -> region)
80
99
  }
81
100
 
82
- # Initialize ElementManager
83
- self._element_mgr = ElementManager(self, font_attrs)
101
+ # Initialize ElementManager, passing font_attrs
102
+ self._element_mgr = ElementManager(self, font_attrs=font_attrs)
103
+ # self._highlighter = HighlightingService(self) # REMOVED - Use property accessor
104
+ # --- NEW --- Central registry for analysis results
105
+ self.analyses: Dict[str, Any] = {}
84
106
 
85
107
  # --- Get OCR Manager Instance ---
86
108
  if (
@@ -115,6 +137,8 @@ class Page:
115
137
  # Initialize the internal variable with a single underscore
116
138
  self._layout_analyzer = None
117
139
 
140
+ self._load_elements()
141
+
118
142
  @property
119
143
  def pdf(self) -> "PDF":
120
144
  """Provides public access to the parent PDF object."""
@@ -1233,7 +1257,7 @@ class Page:
1233
1257
  render_ocr: bool = False,
1234
1258
  resolution: Optional[float] = None,
1235
1259
  include_highlights: bool = True,
1236
- exclusions: Optional[str] = None, # New parameter
1260
+ exclusions: Optional[str] = None, # New parameter
1237
1261
  **kwargs,
1238
1262
  ) -> Optional[Image.Image]:
1239
1263
  """
@@ -1257,38 +1281,48 @@ class Page:
1257
1281
  """
1258
1282
  image = None
1259
1283
  render_resolution = resolution if resolution is not None else scale * 72
1284
+ thread_id = threading.current_thread().name
1285
+ logger.debug(f"[{thread_id}] Page {self.index}: Attempting to acquire pdf_render_lock for to_image...")
1286
+ lock_wait_start = time.monotonic()
1260
1287
  try:
1261
- if include_highlights:
1262
- # Delegate rendering to the central service
1263
- image = self._highlighter.render_page(
1264
- page_index=self.index,
1265
- scale=scale, # Note: scale is used by highlighter internally for drawing
1266
- labels=labels,
1267
- legend_position=legend_position,
1268
- render_ocr=render_ocr,
1269
- resolution=render_resolution, # Pass the calculated resolution
1270
- **kwargs,
1271
- )
1272
- else:
1273
- # Get the base page image directly from pdfplumber if no highlights needed
1274
- # Use the underlying pdfplumber page object
1275
- img_object = self._page.to_image(resolution=render_resolution, **kwargs)
1276
- # Access the PIL image directly (assuming pdfplumber structure)
1277
- image = (
1278
- img_object.annotated
1279
- if hasattr(img_object, "annotated")
1280
- else img_object._repr_png_()
1281
- )
1282
- if isinstance(image, bytes): # Handle cases where it returns bytes
1283
- from io import BytesIO
1288
+ # Acquire the global PDF rendering lock
1289
+ with pdf_render_lock:
1290
+ lock_acquired_time = time.monotonic()
1291
+ logger.debug(f"[{thread_id}] Page {self.index}: Acquired pdf_render_lock (waited {lock_acquired_time - lock_wait_start:.2f}s). Starting render...")
1292
+ if include_highlights:
1293
+ # Delegate rendering to the central service
1294
+ image = self._highlighter.render_page(
1295
+ page_index=self.index,
1296
+ scale=scale, # Note: scale is used by highlighter internally for drawing
1297
+ labels=labels,
1298
+ legend_position=legend_position,
1299
+ render_ocr=render_ocr,
1300
+ resolution=render_resolution, # Pass the calculated resolution
1301
+ **kwargs,
1302
+ )
1303
+ else:
1304
+ # Get the base page image directly from pdfplumber if no highlights needed
1305
+ # Use the underlying pdfplumber page object
1306
+ img_object = self._page.to_image(resolution=render_resolution, **kwargs)
1307
+ # Access the PIL image directly (assuming pdfplumber structure)
1308
+ image = (
1309
+ img_object.annotated
1310
+ if hasattr(img_object, "annotated")
1311
+ else img_object._repr_png_()
1312
+ )
1313
+ if isinstance(image, bytes): # Handle cases where it returns bytes
1314
+ from io import BytesIO
1284
1315
 
1285
- image = Image.open(BytesIO(image)).convert(
1286
- "RGB"
1287
- ) # Convert to RGB for consistency
1316
+ image = Image.open(BytesIO(image)).convert(
1317
+ "RGB"
1318
+ ) # Convert to RGB for consistency
1288
1319
 
1289
1320
  except Exception as e:
1290
1321
  logger.error(f"Error rendering page {self.index}: {e}", exc_info=True)
1291
1322
  return None # Return None on error
1323
+ finally:
1324
+ render_end_time = time.monotonic()
1325
+ logger.debug(f"[{thread_id}] Page {self.index}: Released pdf_render_lock. Total render time (incl. lock wait): {render_end_time - lock_wait_start:.2f}s")
1292
1326
 
1293
1327
  if image is None:
1294
1328
  return None
@@ -1322,16 +1356,21 @@ class Page:
1322
1356
  max(0, img_x0),
1323
1357
  max(0, img_top),
1324
1358
  min(image.width, img_x1),
1325
- min(image.height, img_bottom)
1359
+ min(image.height, img_bottom),
1326
1360
  )
1327
1361
  if img_coords[0] < img_coords[2] and img_coords[1] < img_coords[3]:
1328
- draw.rectangle(img_coords, fill="white")
1362
+ draw.rectangle(img_coords, fill="white")
1329
1363
  else:
1330
- logger.warning(f"Skipping invalid exclusion rect for masking: {img_coords}")
1364
+ logger.warning(
1365
+ f"Skipping invalid exclusion rect for masking: {img_coords}"
1366
+ )
1331
1367
 
1332
- del draw # Release drawing context
1368
+ del draw # Release drawing context
1333
1369
  except Exception as mask_error:
1334
- logger.error(f"Error applying exclusion mask to page {self.index}: {mask_error}", exc_info=True)
1370
+ logger.error(
1371
+ f"Error applying exclusion mask to page {self.index}: {mask_error}",
1372
+ exc_info=True,
1373
+ )
1335
1374
  # Decide if you want to return None or continue without mask
1336
1375
  # For now, continue without mask
1337
1376
 
@@ -1379,6 +1418,7 @@ class Page:
1379
1418
  resolution: Optional[int] = None,
1380
1419
  detect_only: bool = False,
1381
1420
  apply_exclusions: bool = True,
1421
+ replace: bool = True,
1382
1422
  ) -> "Page":
1383
1423
  """
1384
1424
  Apply OCR to THIS page and add results to page elements via PDF.apply_ocr.
@@ -1392,13 +1432,21 @@ class Page:
1392
1432
  resolution: DPI resolution for rendering page image before OCR.
1393
1433
  apply_exclusions: If True (default), render page image for OCR
1394
1434
  with excluded areas masked (whited out).
1435
+ detect_only: If True, only detect text bounding boxes, don't perform OCR.
1436
+ replace: If True (default), remove any existing OCR elements before
1437
+ adding new ones. If False, add new OCR elements to existing ones.
1395
1438
 
1396
1439
  Returns:
1397
- List of created TextElements derived from OCR results for this page.
1440
+ Self for method chaining.
1398
1441
  """
1399
1442
  if not hasattr(self._parent, "apply_ocr"):
1400
1443
  logger.error(f"Page {self.number}: Parent PDF missing 'apply_ocr'. Cannot apply OCR.")
1401
- return [] # Return empty list for consistency
1444
+ return self # Return self for chaining
1445
+
1446
+ # Remove existing OCR elements if replace is True
1447
+ if replace and hasattr(self, "_element_mgr"):
1448
+ logger.info(f"Page {self.number}: Removing existing OCR elements before applying new OCR.")
1449
+ self._element_mgr.remove_ocr_elements()
1402
1450
 
1403
1451
  logger.info(f"Page {self.number}: Delegating apply_ocr to PDF.apply_ocr.")
1404
1452
  try:
@@ -1414,18 +1462,13 @@ class Page:
1414
1462
  resolution=resolution,
1415
1463
  detect_only=detect_only,
1416
1464
  apply_exclusions=apply_exclusions,
1465
+ replace=replace, # Pass the replace parameter to PDF.apply_ocr
1417
1466
  )
1418
1467
  except Exception as e:
1419
1468
  logger.error(f"Page {self.number}: Error during delegated OCR call: {e}", exc_info=True)
1420
- return []
1469
+ return self # Return self for chaining
1421
1470
 
1422
- # Return the OCR elements specifically added to this page
1423
- ocr_elements = [el for el in self.words if getattr(el, "source", None) == "ocr"]
1424
- logger.debug(
1425
- f"Page {self.number}: apply_ocr completed. Found {len(ocr_elements)} OCR elements."
1426
- )
1427
- # Note: The method is typed to return Page for chaining, but the log indicates
1428
- # finding elements. Let's stick to returning self for chaining consistency.
1471
+ # Return self for chaining
1429
1472
  return self
1430
1473
 
1431
1474
  def extract_ocr_elements(
@@ -1459,30 +1502,32 @@ class Page:
1459
1502
  return []
1460
1503
 
1461
1504
  logger.info(f"Page {self.number}: Extracting OCR elements (extract only)...")
1462
-
1505
+
1463
1506
  # Determine rendering resolution
1464
- final_resolution = resolution if resolution is not None else 150 # Default to 150 DPI
1507
+ final_resolution = resolution if resolution is not None else 150 # Default to 150 DPI
1465
1508
  logger.debug(f" Using rendering resolution: {final_resolution} DPI")
1466
-
1509
+
1467
1510
  try:
1468
1511
  # Get base image without highlights using the determined resolution
1469
- image = self.to_image(resolution=final_resolution, include_highlights=False)
1470
- if not image:
1471
- logger.error(f" Failed to render page {self.number} to image for OCR extraction.")
1472
- return []
1473
- logger.debug(f" Rendered image size: {image.width}x{image.height}")
1512
+ # Use the global PDF rendering lock
1513
+ with pdf_render_lock:
1514
+ image = self.to_image(resolution=final_resolution, include_highlights=False)
1515
+ if not image:
1516
+ logger.error(f" Failed to render page {self.number} to image for OCR extraction.")
1517
+ return []
1518
+ logger.debug(f" Rendered image size: {image.width}x{image.height}")
1474
1519
  except Exception as e:
1475
1520
  logger.error(f" Failed to render page {self.number} to image: {e}", exc_info=True)
1476
1521
  return []
1477
1522
 
1478
1523
  # Prepare arguments for the OCR Manager call
1479
1524
  manager_args = {
1480
- "images": image,
1481
- "engine": engine,
1482
- "languages": languages,
1483
- "min_confidence": min_confidence,
1484
- "device": device,
1485
- "options": options
1525
+ "images": image,
1526
+ "engine": engine,
1527
+ "languages": languages,
1528
+ "min_confidence": min_confidence,
1529
+ "device": device,
1530
+ "options": options,
1486
1531
  }
1487
1532
  manager_args = {k: v for k, v in manager_args.items() if v is not None}
1488
1533
 
@@ -1514,7 +1559,7 @@ class Page:
1514
1559
  scale_x = self.width / image.width if image.width else 1
1515
1560
  scale_y = self.height / image.height if image.height else 1
1516
1561
  for result in results:
1517
- try: # Added try-except around result processing
1562
+ try: # Added try-except around result processing
1518
1563
  x0, top, x1, bottom = [float(c) for c in result["bbox"]]
1519
1564
  elem_data = {
1520
1565
  "text": result["text"],
@@ -1525,15 +1570,17 @@ class Page:
1525
1570
  "bottom": bottom * scale_y,
1526
1571
  "width": (x1 - x0) * scale_x,
1527
1572
  "height": (bottom - top) * scale_y,
1528
- "object_type": "text", # Using text for temporary elements
1573
+ "object_type": "text", # Using text for temporary elements
1529
1574
  "source": "ocr",
1530
- "fontname": "OCR-extract", # Different name for clarity
1575
+ "fontname": "OCR-extract", # Different name for clarity
1531
1576
  "size": 10.0,
1532
1577
  "page_number": self.number,
1533
1578
  }
1534
1579
  temp_elements.append(TextElement(elem_data, self))
1535
1580
  except (KeyError, ValueError, TypeError) as convert_err:
1536
- logger.warning(f" Skipping invalid OCR result during conversion: {result}. Error: {convert_err}")
1581
+ logger.warning(
1582
+ f" Skipping invalid OCR result during conversion: {result}. Error: {convert_err}"
1583
+ )
1537
1584
 
1538
1585
  logger.info(f" Created {len(temp_elements)} TextElements from OCR (extract only).")
1539
1586
  return temp_elements
@@ -2020,41 +2067,166 @@ class Page:
2020
2067
  def correct_ocr(
2021
2068
  self,
2022
2069
  correction_callback: Callable[[Any], Optional[str]],
2023
- ) -> "Page": # Return self for chaining
2070
+ max_workers: Optional[int] = None,
2071
+ progress_callback: Optional[Callable[[], None]] = None, # Added progress callback
2072
+ ) -> "Page": # Return self for chaining
2024
2073
  """
2025
2074
  Applies corrections to OCR-generated text elements on this page
2026
- using a user-provided callback function.
2075
+ using a user-provided callback function, potentially in parallel.
2027
2076
 
2028
2077
  Finds text elements on this page whose 'source' attribute starts
2029
2078
  with 'ocr' and calls the `correction_callback` for each, passing the
2030
- element itself.
2031
-
2032
- The `correction_callback` should contain the logic to:
2033
- 1. Determine if the element needs correction.
2034
- 2. Perform the correction (e.g., call an LLM).
2035
- 3. Return the new text (`str`) or `None`.
2036
-
2037
- If the callback returns a string, the element's `.text` is updated.
2038
- Metadata updates (source, confidence, etc.) should happen within the callback.
2079
+ element itself. Updates the element's text if the callback returns
2080
+ a new string.
2039
2081
 
2040
2082
  Args:
2041
2083
  correction_callback: A function accepting an element and returning
2042
2084
  `Optional[str]` (new text or None).
2085
+ max_workers: The maximum number of threads to use for parallel execution.
2086
+ If None or 0 or 1, runs sequentially.
2087
+ progress_callback: Optional callback function to call after processing each element.
2043
2088
 
2044
2089
  Returns:
2045
2090
  Self for method chaining.
2046
2091
  """
2047
- logger.info(f"Page {self.number}: Starting OCR correction process using callback '{correction_callback.__name__}'")
2092
+ logger.info(
2093
+ f"Page {self.number}: Starting OCR correction with callback '{correction_callback.__name__}' (max_workers={max_workers})"
2094
+ )
2095
+
2096
+ target_elements_collection = self.find_all(
2097
+ selector="text[source=ocr]", apply_exclusions=False
2098
+ )
2099
+ target_elements = target_elements_collection.elements # Get the list
2100
+
2101
+ if not target_elements:
2102
+ logger.info(f"Page {self.number}: No OCR elements found to correct.")
2103
+ return self
2048
2104
 
2049
- # Find OCR elements specifically on this page
2050
- # Note: We typically want to correct even if the element falls in an excluded area
2051
- target_elements = self.find_all(selector="text[source^=ocr]", apply_exclusions=False)
2105
+ processed_count = 0
2106
+ updated_count = 0
2107
+ error_count = 0
2052
2108
 
2053
- # Delegate to the utility function
2054
- _apply_ocr_correction_to_elements(
2055
- elements=target_elements, # Pass the ElementCollection directly
2056
- correction_callback=correction_callback,
2057
- caller_info=f"Page({self.number})", # Pass caller info
2109
+ # Define the task to be run by the worker thread or sequentially
2110
+ def _process_element_task(element):
2111
+ try:
2112
+ current_text = getattr(element, 'text', None)
2113
+ # Call the user-provided callback
2114
+ corrected_text = correction_callback(element)
2115
+
2116
+ # Validate result type
2117
+ if corrected_text is not None and not isinstance(corrected_text, str):
2118
+ logger.warning(f"Page {self.number}: Correction callback for element '{getattr(element, 'text', '')[:20]}...' returned non-string, non-None type: {type(corrected_text)}. Skipping update.")
2119
+ return element, None, None # Treat as no correction
2120
+
2121
+ return element, corrected_text, None # Return element, result, no error
2122
+ except Exception as e:
2123
+ logger.error(
2124
+ f"Page {self.number}: Error applying correction callback to element '{getattr(element, 'text', '')[:30]}...' ({element.bbox}): {e}",
2125
+ exc_info=False # Keep log concise
2126
+ )
2127
+ return element, None, e # Return element, no result, error
2128
+ finally:
2129
+ # --- Call progress callback here --- #
2130
+ if progress_callback:
2131
+ try:
2132
+ progress_callback()
2133
+ except Exception as cb_e:
2134
+ # Log error in callback itself, but don't stop processing
2135
+ logger.error(f"Page {self.number}: Error executing progress_callback: {cb_e}", exc_info=False)
2136
+
2137
+ # Choose execution strategy based on max_workers
2138
+ if max_workers is not None and max_workers > 1:
2139
+ # --- Parallel execution --- #
2140
+ logger.info(f"Page {self.number}: Running OCR correction in parallel with {max_workers} workers.")
2141
+ futures = []
2142
+ with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
2143
+ # Submit all tasks
2144
+ future_to_element = {executor.submit(_process_element_task, element): element for element in target_elements}
2145
+
2146
+ # Process results as they complete (progress_callback called by worker)
2147
+ for future in concurrent.futures.as_completed(future_to_element):
2148
+ processed_count += 1
2149
+ try:
2150
+ element, corrected_text, error = future.result()
2151
+ if error:
2152
+ error_count += 1
2153
+ # Error already logged in worker
2154
+ elif corrected_text is not None:
2155
+ # Apply correction if text changed
2156
+ current_text = getattr(element, 'text', None)
2157
+ if corrected_text != current_text:
2158
+ element.text = corrected_text
2159
+ updated_count += 1
2160
+ except Exception as exc:
2161
+ # Catch errors from future.result() itself
2162
+ element = future_to_element[future] # Find original element
2163
+ logger.error(f"Page {self.number}: Internal error retrieving correction result for element {element.bbox}: {exc}", exc_info=True)
2164
+ error_count += 1
2165
+ # Note: progress_callback was already called in the worker's finally block
2166
+
2167
+ else:
2168
+ # --- Sequential execution --- #
2169
+ logger.info(f"Page {self.number}: Running OCR correction sequentially.")
2170
+ for element in target_elements:
2171
+ # Call the task function directly (it handles progress_callback)
2172
+ processed_count += 1
2173
+ _element, corrected_text, error = _process_element_task(element)
2174
+ if error:
2175
+ error_count += 1
2176
+ elif corrected_text is not None:
2177
+ # Apply correction if text changed
2178
+ current_text = getattr(_element, 'text', None)
2179
+ if corrected_text != current_text:
2180
+ _element.text = corrected_text
2181
+ updated_count += 1
2182
+
2183
+ logger.info(
2184
+ f"Page {self.number}: OCR correction finished. Processed: {processed_count}/{len(target_elements)}, Updated: {updated_count}, Errors: {error_count}."
2058
2185
  )
2059
2186
 
2060
2187
  return self # Return self for chaining
2188
+
2189
+ # --- Classification Mixin Implementation --- #
2190
+ def _get_classification_manager(self) -> "ClassificationManager":
2191
+ if not hasattr(self, 'pdf') or not hasattr(self.pdf, 'get_manager'):
2192
+ raise AttributeError("ClassificationManager cannot be accessed: Parent PDF or get_manager method missing.")
2193
+ try:
2194
+ # Use the PDF's manager registry accessor
2195
+ return self.pdf.get_manager('classification')
2196
+ except (ValueError, RuntimeError, AttributeError) as e:
2197
+ # Wrap potential errors from get_manager for clarity
2198
+ raise AttributeError(f"Failed to get ClassificationManager from PDF: {e}") from e
2199
+
2200
+ def _get_classification_content(self, model_type: str, **kwargs) -> Union[str, "Image"]: # Use "Image" for lazy import
2201
+ if model_type == 'text':
2202
+ text_content = self.extract_text(layout=False, use_exclusions=False) # Simple join, ignore exclusions for classification
2203
+ if not text_content or text_content.isspace():
2204
+ raise ValueError("Cannot classify page with 'text' model: No text content found.")
2205
+ return text_content
2206
+ elif model_type == 'vision':
2207
+ # Get resolution from manager/kwargs if possible, else default
2208
+ manager = self._get_classification_manager()
2209
+ default_resolution = 150
2210
+ # Access kwargs passed to classify method if needed
2211
+ resolution = kwargs.get('resolution', default_resolution) if 'kwargs' in locals() else default_resolution
2212
+
2213
+ # Use to_image, ensuring no highlights interfere
2214
+ img = self.to_image(
2215
+ resolution=resolution,
2216
+ include_highlights=False,
2217
+ labels=False,
2218
+ exclusions=None # Don't mask exclusions for classification input image
2219
+ )
2220
+ if img is None:
2221
+ raise ValueError("Cannot classify page with 'vision' model: Failed to render image.")
2222
+ return img
2223
+ else:
2224
+ raise ValueError(f"Unsupported model_type for classification: {model_type}")
2225
+
2226
+ def _get_metadata_storage(self) -> Dict[str, Any]:
2227
+ # Ensure metadata exists
2228
+ if not hasattr(self, 'metadata') or self.metadata is None:
2229
+ self.metadata = {}
2230
+ return self.metadata
2231
+
2232
+ # --- Content Extraction ---