natural-pdf 0.1.11__py3-none-any.whl → 0.1.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. natural_pdf/__init__.py +7 -2
  2. natural_pdf/analyzers/shape_detection_mixin.py +1092 -0
  3. natural_pdf/analyzers/text_options.py +9 -1
  4. natural_pdf/analyzers/text_structure.py +371 -58
  5. natural_pdf/classification/manager.py +3 -4
  6. natural_pdf/collections/pdf_collection.py +19 -39
  7. natural_pdf/core/element_manager.py +11 -1
  8. natural_pdf/core/highlighting_service.py +146 -75
  9. natural_pdf/core/page.py +287 -188
  10. natural_pdf/core/pdf.py +57 -42
  11. natural_pdf/elements/base.py +51 -0
  12. natural_pdf/elements/collections.py +362 -67
  13. natural_pdf/elements/line.py +5 -0
  14. natural_pdf/elements/region.py +396 -23
  15. natural_pdf/exporters/data/__init__.py +0 -0
  16. natural_pdf/exporters/data/pdf.ttf +0 -0
  17. natural_pdf/exporters/data/sRGB.icc +0 -0
  18. natural_pdf/exporters/hocr.py +40 -61
  19. natural_pdf/exporters/hocr_font.py +7 -13
  20. natural_pdf/exporters/original_pdf.py +10 -13
  21. natural_pdf/exporters/paddleocr.py +51 -11
  22. natural_pdf/exporters/searchable_pdf.py +0 -10
  23. natural_pdf/flows/__init__.py +12 -0
  24. natural_pdf/flows/collections.py +533 -0
  25. natural_pdf/flows/element.py +382 -0
  26. natural_pdf/flows/flow.py +216 -0
  27. natural_pdf/flows/region.py +458 -0
  28. natural_pdf/search/__init__.py +65 -52
  29. natural_pdf/search/lancedb_search_service.py +325 -0
  30. natural_pdf/search/numpy_search_service.py +255 -0
  31. natural_pdf/search/searchable_mixin.py +25 -71
  32. natural_pdf/selectors/parser.py +163 -8
  33. natural_pdf/widgets/viewer.py +22 -31
  34. {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/METADATA +55 -49
  35. {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/RECORD +38 -30
  36. {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/WHEEL +1 -1
  37. natural_pdf/search/haystack_search_service.py +0 -687
  38. natural_pdf/search/haystack_utils.py +0 -474
  39. natural_pdf/utils/tqdm_utils.py +0 -51
  40. {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/licenses/LICENSE +0 -0
  41. {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/top_level.txt +0 -0
natural_pdf/core/page.py CHANGED
@@ -74,6 +74,11 @@ from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, SimpleInteractiveV
74
74
  # --- End Classification Imports --- #
75
75
 
76
76
 
77
+ # --- Shape Detection Mixin --- #
78
+ from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
79
+ # --- End Shape Detection Mixin --- #
80
+
81
+
77
82
  try:
78
83
  from deskew import determine_skew
79
84
 
@@ -86,7 +91,7 @@ except ImportError:
86
91
  logger = logging.getLogger(__name__)
87
92
 
88
93
 
89
- class Page(ClassificationMixin, ExtractionMixin):
94
+ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
90
95
  """
91
96
  Enhanced Page wrapper built on top of pdfplumber.Page.
92
97
 
@@ -161,6 +166,7 @@ class Page(ClassificationMixin, ExtractionMixin):
161
166
  self._layout_analyzer = None
162
167
 
163
168
  self._load_elements()
169
+ self._to_image_cache: Dict[tuple, Optional["Image.Image"]] = {}
164
170
 
165
171
  @property
166
172
  def pdf(self) -> "PDF":
@@ -649,7 +655,7 @@ class Page(ClassificationMixin, ExtractionMixin):
649
655
  Exclusions are now handled by the calling methods (find, find_all) if requested.
650
656
 
651
657
  Args:
652
- selector_obj: Parsed selector dictionary
658
+ selector_obj: Parsed selector dictionary (single or compound OR selector)
653
659
  **kwargs: Additional filter parameters including 'regex' and 'case'
654
660
 
655
661
  Returns:
@@ -657,6 +663,30 @@ class Page(ClassificationMixin, ExtractionMixin):
657
663
  """
658
664
  from natural_pdf.selectors.parser import selector_to_filter_func
659
665
 
666
+ # Handle compound OR selectors
667
+ if selector_obj.get("type") == "or":
668
+ # For OR selectors, search all elements and let the filter function decide
669
+ elements_to_search = self._element_mgr.get_all_elements()
670
+
671
+ # Create filter function from compound selector
672
+ filter_func = selector_to_filter_func(selector_obj, **kwargs)
673
+
674
+ # Apply the filter to all elements
675
+ matching_elements = [element for element in elements_to_search if filter_func(element)]
676
+
677
+ # Sort elements in reading order if requested
678
+ if kwargs.get("reading_order", True):
679
+ if all(hasattr(el, "top") and hasattr(el, "x0") for el in matching_elements):
680
+ matching_elements.sort(key=lambda el: (el.top, el.x0))
681
+ else:
682
+ logger.warning(
683
+ "Cannot sort elements in reading order: Missing required attributes (top, x0)."
684
+ )
685
+
686
+ # Return result collection
687
+ return ElementCollection(matching_elements)
688
+
689
+ # Handle single selectors (existing logic)
660
690
  # Get element type to filter
661
691
  element_type = selector_obj.get("type", "any").lower()
662
692
 
@@ -1349,7 +1379,9 @@ class Page(ClassificationMixin, ExtractionMixin):
1349
1379
  self._highlighter.clear_page(self.index)
1350
1380
  return self
1351
1381
 
1352
- def analyze_text_styles(self, options: Optional[TextStyleOptions] = None) -> "ElementCollection":
1382
+ def analyze_text_styles(
1383
+ self, options: Optional[TextStyleOptions] = None
1384
+ ) -> "ElementCollection":
1353
1385
  """
1354
1386
  Analyze text elements by style, adding attributes directly to elements.
1355
1387
 
@@ -1409,114 +1441,171 @@ class Page(ClassificationMixin, ExtractionMixin):
1409
1441
  Returns:
1410
1442
  PIL Image of the page, or None if rendering fails.
1411
1443
  """
1412
- image = None
1413
- render_resolution = resolution if resolution is not None else scale * 72
1414
- thread_id = threading.current_thread().name
1415
- logger.debug(
1416
- f"[{thread_id}] Page {self.index}: Attempting to acquire pdf_render_lock for to_image..."
1417
- )
1418
- lock_wait_start = time.monotonic()
1444
+ # 1. Create cache key (excluding path)
1445
+ cache_key_parts = [
1446
+ scale,
1447
+ width,
1448
+ labels,
1449
+ legend_position,
1450
+ render_ocr,
1451
+ resolution,
1452
+ include_highlights,
1453
+ exclusions,
1454
+ ]
1455
+ # Convert kwargs to a stable, hashable representation
1456
+ sorted_kwargs_list = []
1457
+ for k, v in sorted(kwargs.items()):
1458
+ if isinstance(v, list):
1459
+ try:
1460
+ v = tuple(v) # Convert lists to tuples
1461
+ except TypeError: # pragma: no cover
1462
+ # If list contains unhashable items, fall back to repr or skip
1463
+ # For simplicity, we'll try to proceed; hashing will fail if v remains unhashable
1464
+ logger.warning(f"Cache key generation: List item in kwargs['{k}'] could not be converted to tuple due to unhashable elements.")
1465
+ sorted_kwargs_list.append((k, v))
1466
+
1467
+ cache_key_parts.append(tuple(sorted_kwargs_list))
1468
+
1419
1469
  try:
1420
- # Acquire the global PDF rendering lock
1421
- with pdf_render_lock:
1422
- lock_acquired_time = time.monotonic()
1423
- logger.debug(
1424
- f"[{thread_id}] Page {self.index}: Acquired pdf_render_lock (waited {lock_acquired_time - lock_wait_start:.2f}s). Starting render..."
1425
- )
1426
- if include_highlights:
1427
- # Delegate rendering to the central service
1428
- image = self._highlighter.render_page(
1429
- page_index=self.index,
1430
- scale=scale,
1431
- labels=labels,
1432
- legend_position=legend_position,
1433
- render_ocr=render_ocr,
1434
- resolution=render_resolution, # Pass the calculated resolution
1435
- **kwargs,
1436
- )
1437
- else:
1438
- image = render_plain_page(self, render_resolution)
1439
- except Exception as e:
1440
- logger.error(f"Error rendering page {self.index}: {e}", exc_info=True)
1441
- return None # Return None on error
1442
- finally:
1443
- render_end_time = time.monotonic()
1444
- logger.debug(
1445
- f"[{thread_id}] Page {self.index}: Released pdf_render_lock. Total render time (incl. lock wait): {render_end_time - lock_wait_start:.2f}s"
1446
- )
1470
+ cache_key = tuple(cache_key_parts)
1471
+ except TypeError as e: # pragma: no cover
1472
+ logger.warning(f"Page {self.index}: Could not create cache key for to_image due to unhashable item: {e}. Proceeding without cache for this call.")
1473
+ cache_key = None # Fallback to not using cache for this call
1447
1474
 
1448
- if image is None:
1449
- return None
1475
+ image_to_return: Optional[Image.Image] = None
1450
1476
 
1451
- # --- Apply exclusion masking if requested ---
1452
- if exclusions == "mask" and self._exclusions:
1477
+ # 2. Check cache
1478
+ if cache_key is not None and cache_key in self._to_image_cache:
1479
+ image_to_return = self._to_image_cache[cache_key]
1480
+ logger.debug(f"Page {self.index}: Returning cached image for key: {cache_key}")
1481
+ else:
1482
+ # --- This is the original logic to generate the image ---
1483
+ rendered_image_component: Optional[Image.Image] = None # Renamed from 'image' in original
1484
+ render_resolution = resolution if resolution is not None else scale * 72
1485
+ thread_id = threading.current_thread().name
1486
+ logger.debug(
1487
+ f"[{thread_id}] Page {self.index}: Attempting to acquire pdf_render_lock for to_image..."
1488
+ )
1489
+ lock_wait_start = time.monotonic()
1453
1490
  try:
1454
- # Ensure image is mutable (RGB or RGBA)
1455
- if image.mode not in ("RGB", "RGBA"):
1456
- image = image.convert("RGB")
1457
-
1458
- exclusion_regions = self._get_exclusion_regions(include_callable=True, debug=False)
1459
- if exclusion_regions:
1460
- draw = ImageDraw.Draw(image)
1461
- # Calculate the scaling factor used for the image
1462
- # Base image was rendered at render_resolution (DPI)
1463
- # pdfplumber default is 72 DPI
1464
- # Scale factor = (pixels / inch) / (points / inch) = DPI / 72
1465
- img_scale = render_resolution / 72.0
1466
-
1467
- for region in exclusion_regions:
1468
- # Convert PDF points (x0, top, x1, bottom) to image pixels
1469
- img_x0 = region.x0 * img_scale
1470
- img_top = region.top * img_scale
1471
- img_x1 = region.x1 * img_scale
1472
- img_bottom = region.bottom * img_scale
1473
-
1474
- # Draw a white rectangle over the excluded area
1475
- # Ensure coordinates are within image bounds (though region should be)
1476
- img_coords = (
1477
- max(0, img_x0),
1478
- max(0, img_top),
1479
- min(image.width, img_x1),
1480
- min(image.height, img_bottom),
1491
+ # Acquire the global PDF rendering lock
1492
+ with pdf_render_lock:
1493
+ lock_acquired_time = time.monotonic()
1494
+ logger.debug(
1495
+ f"[{thread_id}] Page {self.index}: Acquired pdf_render_lock (waited {lock_acquired_time - lock_wait_start:.2f}s). Starting render..."
1496
+ )
1497
+ if include_highlights:
1498
+ # Delegate rendering to the central service
1499
+ rendered_image_component = self._highlighter.render_page(
1500
+ page_index=self.index,
1501
+ scale=scale,
1502
+ labels=labels,
1503
+ legend_position=legend_position,
1504
+ render_ocr=render_ocr,
1505
+ resolution=render_resolution, # Pass the calculated resolution
1506
+ **kwargs,
1481
1507
  )
1482
- if img_coords[0] < img_coords[2] and img_coords[1] < img_coords[3]:
1483
- draw.rectangle(img_coords, fill="white")
1484
- else:
1485
- logger.warning(
1486
- f"Skipping invalid exclusion rect for masking: {img_coords}"
1487
- )
1488
-
1489
- del draw # Release drawing context
1490
- except Exception as mask_error:
1491
- logger.error(
1492
- f"Error applying exclusion mask to page {self.index}: {mask_error}",
1493
- exc_info=True,
1508
+ else:
1509
+ rendered_image_component = render_plain_page(self, render_resolution)
1510
+ except Exception as e:
1511
+ logger.error(f"Error rendering page {self.index}: {e}", exc_info=True)
1512
+ # rendered_image_component remains None
1513
+ finally:
1514
+ render_end_time = time.monotonic()
1515
+ logger.debug(
1516
+ f"[{thread_id}] Page {self.index}: Released pdf_render_lock. Total render time (incl. lock wait): {render_end_time - lock_wait_start:.2f}s"
1494
1517
  )
1495
- # Decide if you want to return None or continue without mask
1496
- # For now, continue without mask
1497
1518
 
1498
- # Resize the final image if width is provided
1499
- if width is not None and width > 0 and image.width > 0:
1500
- aspect_ratio = image.height / image.width
1501
- height = int(width * aspect_ratio)
1502
- try:
1503
- image = image.resize(
1504
- (width, height), Image.Resampling.LANCZOS
1505
- ) # Use modern resampling
1506
- except Exception as resize_error:
1507
- logger.warning(f"Could not resize image: {resize_error}")
1508
-
1509
- # Save the image if path is provided
1510
- if path:
1519
+ if rendered_image_component is None:
1520
+ if cache_key is not None:
1521
+ self._to_image_cache[cache_key] = None # Cache the failure
1522
+ # Save the image if path is provided (will try to save None, handled by PIL/OS)
1523
+ if path:
1524
+ try:
1525
+ if os.path.dirname(path):
1526
+ os.makedirs(os.path.dirname(path), exist_ok=True)
1527
+ if rendered_image_component is not None: # Should be None here
1528
+ rendered_image_component.save(path) # This line won't be hit if None
1529
+ # else: logger.debug("Not saving None image") # Not strictly needed
1530
+ except Exception as save_error: # pragma: no cover
1531
+ logger.error(f"Failed to save image to {path}: {save_error}")
1532
+ return None
1533
+
1534
+ # --- Apply exclusion masking if requested ---
1535
+ # This modifies 'rendered_image_component'
1536
+ image_after_masking = rendered_image_component # Start with the rendered image
1537
+ if exclusions == "mask" and self._exclusions:
1538
+ try:
1539
+ # Ensure image is mutable (RGB or RGBA)
1540
+ if image_after_masking.mode not in ("RGB", "RGBA"):
1541
+ image_after_masking = image_after_masking.convert("RGB")
1542
+
1543
+ exclusion_regions = self._get_exclusion_regions(include_callable=True, debug=False)
1544
+ if exclusion_regions:
1545
+ draw = ImageDraw.Draw(image_after_masking)
1546
+ # Calculate the scaling factor used for the image
1547
+ img_scale = render_resolution / 72.0
1548
+
1549
+ for region in exclusion_regions:
1550
+ # Convert PDF points (x0, top, x1, bottom) to image pixels
1551
+ img_x0 = region.x0 * img_scale
1552
+ img_top = region.top * img_scale
1553
+ img_x1 = region.x1 * img_scale
1554
+ img_bottom = region.bottom * img_scale
1555
+
1556
+ # Draw a white rectangle over the excluded area
1557
+ img_coords = (
1558
+ max(0, img_x0),
1559
+ max(0, img_top),
1560
+ min(image_after_masking.width, img_x1),
1561
+ min(image_after_masking.height, img_bottom),
1562
+ )
1563
+ if img_coords[0] < img_coords[2] and img_coords[1] < img_coords[3]:
1564
+ draw.rectangle(img_coords, fill="white")
1565
+ else: # pragma: no cover
1566
+ logger.warning(
1567
+ f"Skipping invalid exclusion rect for masking: {img_coords}"
1568
+ )
1569
+ del draw # Release drawing context
1570
+ except Exception as mask_error: # pragma: no cover
1571
+ logger.error(
1572
+ f"Error applying exclusion mask to page {self.index}: {mask_error}",
1573
+ exc_info=True,
1574
+ )
1575
+ # Continue with potentially unmasked or partially masked image
1576
+
1577
+ # --- Resize the final image if width is provided ---
1578
+ image_final_content = image_after_masking # Start with image after masking
1579
+ if width is not None and width > 0 and image_final_content.width > 0:
1580
+ aspect_ratio = image_final_content.height / image_final_content.width
1581
+ height = int(width * aspect_ratio)
1582
+ try:
1583
+ image_final_content = image_final_content.resize(
1584
+ (width, height), Image.Resampling.LANCZOS
1585
+ )
1586
+ except Exception as resize_error: # pragma: no cover
1587
+ logger.warning(f"Could not resize image: {resize_error}")
1588
+ # image_final_content remains the un-resized version if resize fails
1589
+
1590
+ # Store in cache
1591
+ if cache_key is not None:
1592
+ self._to_image_cache[cache_key] = image_final_content
1593
+ logger.debug(f"Page {self.index}: Cached image for key: {cache_key}")
1594
+ image_to_return = image_final_content
1595
+ # --- End of cache miss block ---
1596
+
1597
+ # Save the image (either from cache or newly generated) if path is provided
1598
+ if path and image_to_return:
1511
1599
  try:
1512
1600
  # Ensure directory exists
1513
- os.makedirs(os.path.dirname(path), exist_ok=True)
1514
- image.save(path)
1601
+ if os.path.dirname(path): # Only call makedirs if there's a directory part
1602
+ os.makedirs(os.path.dirname(path), exist_ok=True)
1603
+ image_to_return.save(path)
1515
1604
  logger.debug(f"Saved page image to: {path}")
1516
- except Exception as save_error:
1605
+ except Exception as save_error: # pragma: no cover
1517
1606
  logger.error(f"Failed to save image to {path}: {save_error}")
1518
1607
 
1519
- return image
1608
+ return image_to_return
1520
1609
 
1521
1610
  def _create_text_elements_from_ocr(
1522
1611
  self, ocr_results: List[Dict[str, Any]], image_width=None, image_height=None
@@ -1984,7 +2073,6 @@ class Page(ClassificationMixin, ExtractionMixin):
1984
2073
  region.is_end_next_start = False
1985
2074
  regions.append(region)
1986
2075
 
1987
- # Return the list wrapped in an ElementCollection
1988
2076
  return ElementCollection(regions)
1989
2077
 
1990
2078
  def __repr__(self) -> str:
@@ -2130,7 +2218,7 @@ class Page(ClassificationMixin, ExtractionMixin):
2130
2218
  if not _IPYWIDGETS_AVAILABLE or SimpleInteractiveViewerWidget is None:
2131
2219
  logger.error(
2132
2220
  "Interactive viewer requires optional dependencies ('ipywidgets'). "
2133
- "Install with `pip install natural-pdf[interactive]`"
2221
+ "Install with `pip install natural-pdf[viewer]`"
2134
2222
  )
2135
2223
  # raise ImportError("ipywidgets not found.") # Option 1: Raise error
2136
2224
  return None # Option 2: Return None gracefully
@@ -2211,6 +2299,7 @@ class Page(ClassificationMixin, ExtractionMixin):
2211
2299
  def correct_ocr(
2212
2300
  self,
2213
2301
  correction_callback: Callable[[Any], Optional[str]],
2302
+ selector: Optional[str] = "text[source=ocr]",
2214
2303
  max_workers: Optional[int] = None,
2215
2304
  progress_callback: Optional[Callable[[], None]] = None, # Added progress callback
2216
2305
  ) -> "Page": # Return self for chaining
@@ -2238,7 +2327,7 @@ class Page(ClassificationMixin, ExtractionMixin):
2238
2327
  )
2239
2328
 
2240
2329
  target_elements_collection = self.find_all(
2241
- selector="text[source=ocr]", apply_exclusions=False
2330
+ selector=selector, apply_exclusions=False
2242
2331
  )
2243
2332
  target_elements = target_elements_collection.elements # Get the list
2244
2333
 
@@ -2246,102 +2335,112 @@ class Page(ClassificationMixin, ExtractionMixin):
2246
2335
  logger.info(f"Page {self.number}: No OCR elements found to correct.")
2247
2336
  return self
2248
2337
 
2249
- processed_count = 0
2250
- updated_count = 0
2251
- error_count = 0
2338
+ element_pbar = None
2339
+ try:
2340
+ element_pbar = tqdm(total=len(target_elements), desc=f"Correcting OCR Page {self.number}", unit="element", leave=False)
2252
2341
 
2253
- # Define the task to be run by the worker thread or sequentially
2254
- def _process_element_task(element):
2255
- try:
2256
- current_text = getattr(element, "text", None)
2257
- # Call the user-provided callback
2258
- corrected_text = correction_callback(element)
2342
+ processed_count = 0
2343
+ updated_count = 0
2344
+ error_count = 0
2259
2345
 
2260
- # Validate result type
2261
- if corrected_text is not None and not isinstance(corrected_text, str):
2262
- logger.warning(
2263
- f"Page {self.number}: Correction callback for element '{getattr(element, 'text', '')[:20]}...' returned non-string, non-None type: {type(corrected_text)}. Skipping update."
2264
- )
2265
- return element, None, None # Treat as no correction
2346
+ # Define the task to be run by the worker thread or sequentially
2347
+ def _process_element_task(element):
2348
+ try:
2349
+ current_text = getattr(element, "text", None)
2350
+ # Call the user-provided callback
2351
+ corrected_text = correction_callback(element)
2266
2352
 
2267
- return element, corrected_text, None # Return element, result, no error
2268
- except Exception as e:
2269
- logger.error(
2270
- f"Page {self.number}: Error applying correction callback to element '{getattr(element, 'text', '')[:30]}...' ({element.bbox}): {e}",
2271
- exc_info=False, # Keep log concise
2272
- )
2273
- return element, None, e # Return element, no result, error
2274
- finally:
2275
- # --- Call progress callback here --- #
2276
- if progress_callback:
2277
- try:
2278
- progress_callback()
2279
- except Exception as cb_e:
2280
- # Log error in callback itself, but don't stop processing
2281
- logger.error(
2282
- f"Page {self.number}: Error executing progress_callback: {cb_e}",
2283
- exc_info=False,
2353
+ # Validate result type
2354
+ if corrected_text is not None and not isinstance(corrected_text, str):
2355
+ logger.warning(
2356
+ f"Page {self.number}: Correction callback for element '{getattr(element, 'text', '')[:20]}...' returned non-string, non-None type: {type(corrected_text)}. Skipping update."
2284
2357
  )
2358
+ return element, None, None # Treat as no correction
2285
2359
 
2286
- # Choose execution strategy based on max_workers
2287
- if max_workers is not None and max_workers > 1:
2288
- # --- Parallel execution --- #
2289
- logger.info(
2290
- f"Page {self.number}: Running OCR correction in parallel with {max_workers} workers."
2291
- )
2292
- futures = []
2293
- with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
2294
- # Submit all tasks
2295
- future_to_element = {
2296
- executor.submit(_process_element_task, element): element
2297
- for element in target_elements
2298
- }
2360
+ return element, corrected_text, None # Return element, result, no error
2361
+ except Exception as e:
2362
+ logger.error(
2363
+ f"Page {self.number}: Error applying correction callback to element '{getattr(element, 'text', '')[:30]}...' ({element.bbox}): {e}",
2364
+ exc_info=False, # Keep log concise
2365
+ )
2366
+ return element, None, e # Return element, no result, error
2367
+ finally:
2368
+ # --- Update internal tqdm progress bar ---
2369
+ if element_pbar:
2370
+ element_pbar.update(1)
2371
+ # --- Call user's progress callback --- #
2372
+ if progress_callback:
2373
+ try:
2374
+ progress_callback()
2375
+ except Exception as cb_e:
2376
+ # Log error in callback itself, but don't stop processing
2377
+ logger.error(
2378
+ f"Page {self.number}: Error executing progress_callback: {cb_e}",
2379
+ exc_info=False,
2380
+ )
2299
2381
 
2300
- # Process results as they complete (progress_callback called by worker)
2301
- for future in concurrent.futures.as_completed(future_to_element):
2302
- processed_count += 1
2303
- try:
2304
- element, corrected_text, error = future.result()
2305
- if error:
2382
+ # Choose execution strategy based on max_workers
2383
+ if max_workers is not None and max_workers > 1:
2384
+ # --- Parallel execution --- #
2385
+ logger.info(
2386
+ f"Page {self.number}: Running OCR correction in parallel with {max_workers} workers."
2387
+ )
2388
+ futures = []
2389
+ with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
2390
+ # Submit all tasks
2391
+ future_to_element = {
2392
+ executor.submit(_process_element_task, element): element
2393
+ for element in target_elements
2394
+ }
2395
+
2396
+ # Process results as they complete (progress_callback called by worker)
2397
+ for future in concurrent.futures.as_completed(future_to_element):
2398
+ processed_count += 1
2399
+ try:
2400
+ element, corrected_text, error = future.result()
2401
+ if error:
2402
+ error_count += 1
2403
+ # Error already logged in worker
2404
+ elif corrected_text is not None:
2405
+ # Apply correction if text changed
2406
+ current_text = getattr(element, "text", None)
2407
+ if corrected_text != current_text:
2408
+ element.text = corrected_text
2409
+ updated_count += 1
2410
+ except Exception as exc:
2411
+ # Catch errors from future.result() itself
2412
+ element = future_to_element[future] # Find original element
2413
+ logger.error(
2414
+ f"Page {self.number}: Internal error retrieving correction result for element {element.bbox}: {exc}",
2415
+ exc_info=True,
2416
+ )
2306
2417
  error_count += 1
2307
- # Error already logged in worker
2308
- elif corrected_text is not None:
2309
- # Apply correction if text changed
2310
- current_text = getattr(element, "text", None)
2311
- if corrected_text != current_text:
2312
- element.text = corrected_text
2313
- updated_count += 1
2314
- except Exception as exc:
2315
- # Catch errors from future.result() itself
2316
- element = future_to_element[future] # Find original element
2317
- logger.error(
2318
- f"Page {self.number}: Internal error retrieving correction result for element {element.bbox}: {exc}",
2319
- exc_info=True,
2320
- )
2321
- error_count += 1
2322
- # Note: progress_callback was already called in the worker's finally block
2418
+ # Note: progress_callback was already called in the worker's finally block
2323
2419
 
2324
- else:
2325
- # --- Sequential execution --- #
2326
- logger.info(f"Page {self.number}: Running OCR correction sequentially.")
2327
- for element in target_elements:
2328
- # Call the task function directly (it handles progress_callback)
2329
- processed_count += 1
2330
- _element, corrected_text, error = _process_element_task(element)
2331
- if error:
2332
- error_count += 1
2333
- elif corrected_text is not None:
2334
- # Apply correction if text changed
2335
- current_text = getattr(_element, "text", None)
2336
- if corrected_text != current_text:
2337
- _element.text = corrected_text
2338
- updated_count += 1
2420
+ else:
2421
+ # --- Sequential execution --- #
2422
+ logger.info(f"Page {self.number}: Running OCR correction sequentially.")
2423
+ for element in target_elements:
2424
+ # Call the task function directly (it handles progress_callback)
2425
+ processed_count += 1
2426
+ _element, corrected_text, error = _process_element_task(element)
2427
+ if error:
2428
+ error_count += 1
2429
+ elif corrected_text is not None:
2430
+ # Apply correction if text changed
2431
+ current_text = getattr(_element, "text", None)
2432
+ if corrected_text != current_text:
2433
+ _element.text = corrected_text
2434
+ updated_count += 1
2339
2435
 
2340
- logger.info(
2341
- f"Page {self.number}: OCR correction finished. Processed: {processed_count}/{len(target_elements)}, Updated: {updated_count}, Errors: {error_count}."
2342
- )
2436
+ logger.info(
2437
+ f"Page {self.number}: OCR correction finished. Processed: {processed_count}/{len(target_elements)}, Updated: {updated_count}, Errors: {error_count}."
2438
+ )
2343
2439
 
2344
- return self # Return self for chaining
2440
+ return self # Return self for chaining
2441
+ finally:
2442
+ if element_pbar:
2443
+ element_pbar.close()
2345
2444
 
2346
2445
  # --- Classification Mixin Implementation --- #
2347
2446
  def _get_classification_manager(self) -> "ClassificationManager":