natural-pdf 0.1.11__py3-none-any.whl → 0.1.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +7 -2
- natural_pdf/analyzers/shape_detection_mixin.py +1092 -0
- natural_pdf/analyzers/text_options.py +9 -1
- natural_pdf/analyzers/text_structure.py +371 -58
- natural_pdf/classification/manager.py +3 -4
- natural_pdf/collections/pdf_collection.py +19 -39
- natural_pdf/core/element_manager.py +11 -1
- natural_pdf/core/highlighting_service.py +146 -75
- natural_pdf/core/page.py +287 -188
- natural_pdf/core/pdf.py +57 -42
- natural_pdf/elements/base.py +51 -0
- natural_pdf/elements/collections.py +362 -67
- natural_pdf/elements/line.py +5 -0
- natural_pdf/elements/region.py +396 -23
- natural_pdf/exporters/data/__init__.py +0 -0
- natural_pdf/exporters/data/pdf.ttf +0 -0
- natural_pdf/exporters/data/sRGB.icc +0 -0
- natural_pdf/exporters/hocr.py +40 -61
- natural_pdf/exporters/hocr_font.py +7 -13
- natural_pdf/exporters/original_pdf.py +10 -13
- natural_pdf/exporters/paddleocr.py +51 -11
- natural_pdf/exporters/searchable_pdf.py +0 -10
- natural_pdf/flows/__init__.py +12 -0
- natural_pdf/flows/collections.py +533 -0
- natural_pdf/flows/element.py +382 -0
- natural_pdf/flows/flow.py +216 -0
- natural_pdf/flows/region.py +458 -0
- natural_pdf/search/__init__.py +65 -52
- natural_pdf/search/lancedb_search_service.py +325 -0
- natural_pdf/search/numpy_search_service.py +255 -0
- natural_pdf/search/searchable_mixin.py +25 -71
- natural_pdf/selectors/parser.py +163 -8
- natural_pdf/widgets/viewer.py +22 -31
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/METADATA +55 -49
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/RECORD +38 -30
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/WHEEL +1 -1
- natural_pdf/search/haystack_search_service.py +0 -687
- natural_pdf/search/haystack_utils.py +0 -474
- natural_pdf/utils/tqdm_utils.py +0 -51
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/top_level.txt +0 -0
natural_pdf/core/page.py
CHANGED
@@ -74,6 +74,11 @@ from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, SimpleInteractiveViewerWidget
 # --- End Classification Imports --- #
 
 
+# --- Shape Detection Mixin --- #
+from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
+# --- End Shape Detection Mixin --- #
+
+
 try:
     from deskew import determine_skew
 
@@ -86,7 +91,7 @@ except ImportError:
 logger = logging.getLogger(__name__)
 
 
-class Page(ClassificationMixin, ExtractionMixin):
+class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
     """
     Enhanced Page wrapper built on top of pdfplumber.Page.
 
@@ -161,6 +166,7 @@ class Page(ClassificationMixin, ExtractionMixin):
         self._layout_analyzer = None
 
         self._load_elements()
+        self._to_image_cache: Dict[tuple, Optional["Image.Image"]] = {}
 
     @property
     def pdf(self) -> "PDF":
@@ -649,7 +655,7 @@ class Page(ClassificationMixin, ExtractionMixin):
         Exclusions are now handled by the calling methods (find, find_all) if requested.
 
         Args:
-            selector_obj: Parsed selector dictionary
+            selector_obj: Parsed selector dictionary (single or compound OR selector)
             **kwargs: Additional filter parameters including 'regex' and 'case'
 
         Returns:
@@ -657,6 +663,30 @@ class Page(ClassificationMixin, ExtractionMixin):
         """
         from natural_pdf.selectors.parser import selector_to_filter_func
 
+        # Handle compound OR selectors
+        if selector_obj.get("type") == "or":
+            # For OR selectors, search all elements and let the filter function decide
+            elements_to_search = self._element_mgr.get_all_elements()
+
+            # Create filter function from compound selector
+            filter_func = selector_to_filter_func(selector_obj, **kwargs)
+
+            # Apply the filter to all elements
+            matching_elements = [element for element in elements_to_search if filter_func(element)]
+
+            # Sort elements in reading order if requested
+            if kwargs.get("reading_order", True):
+                if all(hasattr(el, "top") and hasattr(el, "x0") for el in matching_elements):
+                    matching_elements.sort(key=lambda el: (el.top, el.x0))
+                else:
+                    logger.warning(
+                        "Cannot sort elements in reading order: Missing required attributes (top, x0)."
+                    )
+
+            # Return result collection
+            return ElementCollection(matching_elements)
+
+        # Handle single selectors (existing logic)
         # Get element type to filter
         element_type = selector_obj.get("type", "any").lower()
 
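
The new OR branch filters every element on the page with the compound predicate and then sorts the matches into reading order using the `(top, x0)` key. A minimal standalone sketch of that ordering step (plain Python; the `Hit` class and its sample values are illustrative, not part of natural-pdf):

    from dataclasses import dataclass

    @dataclass
    class Hit:
        text: str
        top: float  # distance from the top of the page, in points
        x0: float   # left edge, in points

    # Unordered matches, e.g. produced by an OR filter over all page elements
    hits = [Hit("Total", 700, 300), Hit("Invoice", 72, 50), Hit("Date", 72, 400)]

    # Reading order: top-to-bottom, then left-to-right, mirroring sort(key=lambda el: (el.top, el.x0))
    hits.sort(key=lambda el: (el.top, el.x0))
    print([h.text for h in hits])  # ['Invoice', 'Date', 'Total']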
@@ -1349,7 +1379,9 @@ class Page(ClassificationMixin, ExtractionMixin):
         self._highlighter.clear_page(self.index)
         return self
 
-    def analyze_text_styles(
+    def analyze_text_styles(
+        self, options: Optional[TextStyleOptions] = None
+    ) -> "ElementCollection":
         """
         Analyze text elements by style, adding attributes directly to elements.
 
@@ -1409,114 +1441,171 @@ class Page(ClassificationMixin, ExtractionMixin):
         Returns:
             PIL Image of the page, or None if rendering fails.
         """
+        # 1. Create cache key (excluding path)
+        cache_key_parts = [
+            scale,
+            width,
+            labels,
+            legend_position,
+            render_ocr,
+            resolution,
+            include_highlights,
+            exclusions,
+        ]
+        # Convert kwargs to a stable, hashable representation
+        sorted_kwargs_list = []
+        for k, v in sorted(kwargs.items()):
+            if isinstance(v, list):
+                try:
+                    v = tuple(v)  # Convert lists to tuples
+                except TypeError:  # pragma: no cover
+                    # If list contains unhashable items, fall back to repr or skip
+                    # For simplicity, we'll try to proceed; hashing will fail if v remains unhashable
+                    logger.warning(f"Cache key generation: List item in kwargs['{k}'] could not be converted to tuple due to unhashable elements.")
+            sorted_kwargs_list.append((k, v))
+
+        cache_key_parts.append(tuple(sorted_kwargs_list))
+
         try:
-                    f"[{thread_id}] Page {self.index}: Acquired pdf_render_lock (waited {lock_acquired_time - lock_wait_start:.2f}s). Starting render..."
-                )
-                if include_highlights:
-                    # Delegate rendering to the central service
-                    image = self._highlighter.render_page(
-                        page_index=self.index,
-                        scale=scale,
-                        labels=labels,
-                        legend_position=legend_position,
-                        render_ocr=render_ocr,
-                        resolution=render_resolution,  # Pass the calculated resolution
-                        **kwargs,
-                    )
-                else:
-                    image = render_plain_page(self, render_resolution)
-        except Exception as e:
-            logger.error(f"Error rendering page {self.index}: {e}", exc_info=True)
-            return None  # Return None on error
-        finally:
-            render_end_time = time.monotonic()
-            logger.debug(
-                f"[{thread_id}] Page {self.index}: Released pdf_render_lock. Total render time (incl. lock wait): {render_end_time - lock_wait_start:.2f}s"
-            )
+            cache_key = tuple(cache_key_parts)
+        except TypeError as e:  # pragma: no cover
+            logger.warning(f"Page {self.index}: Could not create cache key for to_image due to unhashable item: {e}. Proceeding without cache for this call.")
+            cache_key = None  # Fallback to not using cache for this call
 
-        return None
+        image_to_return: Optional[Image.Image] = None
 
+        # 2. Check cache
+        if cache_key is not None and cache_key in self._to_image_cache:
+            image_to_return = self._to_image_cache[cache_key]
+            logger.debug(f"Page {self.index}: Returning cached image for key: {cache_key}")
+        else:
+            # --- This is the original logic to generate the image ---
+            rendered_image_component: Optional[Image.Image] = None  # Renamed from 'image' in original
+            render_resolution = resolution if resolution is not None else scale * 72
+            thread_id = threading.current_thread().name
+            logger.debug(
+                f"[{thread_id}] Page {self.index}: Attempting to acquire pdf_render_lock for to_image..."
+            )
+            lock_wait_start = time.monotonic()
             try:
-                    img_top = region.top * img_scale
-                    img_x1 = region.x1 * img_scale
-                    img_bottom = region.bottom * img_scale
-
-                    # Draw a white rectangle over the excluded area
-                    # Ensure coordinates are within image bounds (though region should be)
-                    img_coords = (
-                        max(0, img_x0),
-                        max(0, img_top),
-                        min(image.width, img_x1),
-                        min(image.height, img_bottom),
-                    )
+                # Acquire the global PDF rendering lock
+                with pdf_render_lock:
+                    lock_acquired_time = time.monotonic()
+                    logger.debug(
+                        f"[{thread_id}] Page {self.index}: Acquired pdf_render_lock (waited {lock_acquired_time - lock_wait_start:.2f}s). Starting render..."
+                    )
+                    if include_highlights:
+                        # Delegate rendering to the central service
+                        rendered_image_component = self._highlighter.render_page(
+                            page_index=self.index,
+                            scale=scale,
+                            labels=labels,
+                            legend_position=legend_position,
+                            render_ocr=render_ocr,
+                            resolution=render_resolution,  # Pass the calculated resolution
+                            **kwargs,
                         )
-                logger.error(
-                    f"Error applying exclusion mask to page {self.index}: {mask_error}",
-                    exc_info=True,
+                    else:
+                        rendered_image_component = render_plain_page(self, render_resolution)
+            except Exception as e:
+                logger.error(f"Error rendering page {self.index}: {e}", exc_info=True)
+                # rendered_image_component remains None
+            finally:
+                render_end_time = time.monotonic()
+                logger.debug(
+                    f"[{thread_id}] Page {self.index}: Released pdf_render_lock. Total render time (incl. lock wait): {render_end_time - lock_wait_start:.2f}s"
                 )
-            # Decide if you want to return None or continue without mask
-            # For now, continue without mask
 
+            if rendered_image_component is None:
+                if cache_key is not None:
+                    self._to_image_cache[cache_key] = None  # Cache the failure
+                # Save the image if path is provided (will try to save None, handled by PIL/OS)
+                if path:
+                    try:
+                        if os.path.dirname(path):
+                            os.makedirs(os.path.dirname(path), exist_ok=True)
+                        if rendered_image_component is not None:  # Should be None here
+                            rendered_image_component.save(path)  # This line won't be hit if None
+                        # else: logger.debug("Not saving None image")  # Not strictly needed
+                    except Exception as save_error:  # pragma: no cover
+                        logger.error(f"Failed to save image to {path}: {save_error}")
+                return None
+
+            # --- Apply exclusion masking if requested ---
+            # This modifies 'rendered_image_component'
+            image_after_masking = rendered_image_component  # Start with the rendered image
+            if exclusions == "mask" and self._exclusions:
+                try:
+                    # Ensure image is mutable (RGB or RGBA)
+                    if image_after_masking.mode not in ("RGB", "RGBA"):
+                        image_after_masking = image_after_masking.convert("RGB")
+
+                    exclusion_regions = self._get_exclusion_regions(include_callable=True, debug=False)
+                    if exclusion_regions:
+                        draw = ImageDraw.Draw(image_after_masking)
+                        # Calculate the scaling factor used for the image
+                        img_scale = render_resolution / 72.0
+
+                        for region in exclusion_regions:
+                            # Convert PDF points (x0, top, x1, bottom) to image pixels
+                            img_x0 = region.x0 * img_scale
+                            img_top = region.top * img_scale
+                            img_x1 = region.x1 * img_scale
+                            img_bottom = region.bottom * img_scale
+
+                            # Draw a white rectangle over the excluded area
+                            img_coords = (
+                                max(0, img_x0),
+                                max(0, img_top),
+                                min(image_after_masking.width, img_x1),
+                                min(image_after_masking.height, img_bottom),
+                            )
+                            if img_coords[0] < img_coords[2] and img_coords[1] < img_coords[3]:
+                                draw.rectangle(img_coords, fill="white")
+                            else:  # pragma: no cover
+                                logger.warning(
+                                    f"Skipping invalid exclusion rect for masking: {img_coords}"
+                                )
+                        del draw  # Release drawing context
+                except Exception as mask_error:  # pragma: no cover
+                    logger.error(
+                        f"Error applying exclusion mask to page {self.index}: {mask_error}",
+                        exc_info=True,
+                    )
+                    # Continue with potentially unmasked or partially masked image
+
+            # --- Resize the final image if width is provided ---
+            image_final_content = image_after_masking  # Start with image after masking
+            if width is not None and width > 0 and image_final_content.width > 0:
+                aspect_ratio = image_final_content.height / image_final_content.width
+                height = int(width * aspect_ratio)
+                try:
+                    image_final_content = image_final_content.resize(
+                        (width, height), Image.Resampling.LANCZOS
+                    )
+                except Exception as resize_error:  # pragma: no cover
+                    logger.warning(f"Could not resize image: {resize_error}")
+                    # image_final_content remains the un-resized version if resize fails
+
+            # Store in cache
+            if cache_key is not None:
+                self._to_image_cache[cache_key] = image_final_content
+                logger.debug(f"Page {self.index}: Cached image for key: {cache_key}")
+            image_to_return = image_final_content
+            # --- End of cache miss block ---
+
+        # Save the image (either from cache or newly generated) if path is provided
+        if path and image_to_return:
             try:
                 # Ensure directory exists
+                if os.path.dirname(path):  # Only call makedirs if there's a directory part
+                    os.makedirs(os.path.dirname(path), exist_ok=True)
+                image_to_return.save(path)
                 logger.debug(f"Saved page image to: {path}")
-            except Exception as save_error:
+            except Exception as save_error:  # pragma: no cover
                 logger.error(f"Failed to save image to {path}: {save_error}")
 
-        return
+        return image_to_return
 
     def _create_text_elements_from_ocr(
         self, ocr_results: List[Dict[str, Any]], image_width=None, image_height=None
@@ -1984,7 +2073,6 @@ class Page(ClassificationMixin, ExtractionMixin):
             region.is_end_next_start = False
             regions.append(region)
 
-        # Return the list wrapped in an ElementCollection
         return ElementCollection(regions)
 
     def __repr__(self) -> str:
@@ -2130,7 +2218,7 @@ class Page(ClassificationMixin, ExtractionMixin):
         if not _IPYWIDGETS_AVAILABLE or SimpleInteractiveViewerWidget is None:
             logger.error(
                 "Interactive viewer requires optional dependencies ('ipywidgets'). "
-                "Install with `pip install natural-pdf[
+                "Install with `pip install natural-pdf[viewer]`"
             )
             # raise ImportError("ipywidgets not found.") # Option 1: Raise error
             return None  # Option 2: Return None gracefully
@@ -2211,6 +2299,7 @@ class Page(ClassificationMixin, ExtractionMixin):
     def correct_ocr(
         self,
         correction_callback: Callable[[Any], Optional[str]],
+        selector: Optional[str] = "text[source=ocr]",
         max_workers: Optional[int] = None,
         progress_callback: Optional[Callable[[], None]] = None,  # Added progress callback
     ) -> "Page":  # Return self for chaining
@@ -2238,7 +2327,7 @@ class Page(ClassificationMixin, ExtractionMixin):
         )
 
         target_elements_collection = self.find_all(
-            selector=
+            selector=selector, apply_exclusions=False
         )
         target_elements = target_elements_collection.elements  # Get the list
 
@@ -2246,102 +2335,112 @@ class Page(ClassificationMixin, ExtractionMixin):
             logger.info(f"Page {self.number}: No OCR elements found to correct.")
             return self
 
+        element_pbar = None
+        try:
+            element_pbar = tqdm(total=len(target_elements), desc=f"Correcting OCR Page {self.number}", unit="element", leave=False)
 
-                current_text = getattr(element, "text", None)
-                # Call the user-provided callback
-                corrected_text = correction_callback(element)
+            processed_count = 0
+            updated_count = 0
+            error_count = 0
 
+            # Define the task to be run by the worker thread or sequentially
+            def _process_element_task(element):
+                try:
+                    current_text = getattr(element, "text", None)
+                    # Call the user-provided callback
+                    corrected_text = correction_callback(element)
 
-                        exc_info=False,  # Keep log concise
-                    )
-                    return element, None, e  # Return element, no result, error
-                finally:
-                    # --- Call progress callback here --- #
-                    if progress_callback:
-                        try:
-                            progress_callback()
-                        except Exception as cb_e:
-                            # Log error in callback itself, but don't stop processing
-                            logger.error(
-                                f"Page {self.number}: Error executing progress_callback: {cb_e}",
-                                exc_info=False,
+                    # Validate result type
+                    if corrected_text is not None and not isinstance(corrected_text, str):
+                        logger.warning(
+                            f"Page {self.number}: Correction callback for element '{getattr(element, 'text', '')[:20]}...' returned non-string, non-None type: {type(corrected_text)}. Skipping update."
                         )
+                        return element, None, None  # Treat as no correction
 
+                    return element, corrected_text, None  # Return element, result, no error
+                except Exception as e:
+                    logger.error(
+                        f"Page {self.number}: Error applying correction callback to element '{getattr(element, 'text', '')[:30]}...' ({element.bbox}): {e}",
+                        exc_info=False,  # Keep log concise
+                    )
+                    return element, None, e  # Return element, no result, error
+                finally:
+                    # --- Update internal tqdm progress bar ---
+                    if element_pbar:
+                        element_pbar.update(1)
+                    # --- Call user's progress callback --- #
+                    if progress_callback:
+                        try:
+                            progress_callback()
+                        except Exception as cb_e:
+                            # Log error in callback itself, but don't stop processing
+                            logger.error(
+                                f"Page {self.number}: Error executing progress_callback: {cb_e}",
+                                exc_info=False,
+                            )
 
+            # Choose execution strategy based on max_workers
+            if max_workers is not None and max_workers > 1:
+                # --- Parallel execution --- #
+                logger.info(
+                    f"Page {self.number}: Running OCR correction in parallel with {max_workers} workers."
+                )
+                futures = []
+                with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+                    # Submit all tasks
+                    future_to_element = {
+                        executor.submit(_process_element_task, element): element
+                        for element in target_elements
+                    }
+
+                    # Process results as they complete (progress_callback called by worker)
+                    for future in concurrent.futures.as_completed(future_to_element):
+                        processed_count += 1
+                        try:
+                            element, corrected_text, error = future.result()
+                            if error:
+                                error_count += 1
+                                # Error already logged in worker
+                            elif corrected_text is not None:
+                                # Apply correction if text changed
+                                current_text = getattr(element, "text", None)
+                                if corrected_text != current_text:
+                                    element.text = corrected_text
+                                    updated_count += 1
+                        except Exception as exc:
+                            # Catch errors from future.result() itself
+                            element = future_to_element[future]  # Find original element
+                            logger.error(
+                                f"Page {self.number}: Internal error retrieving correction result for element {element.bbox}: {exc}",
+                                exc_info=True,
+                            )
                             error_count += 1
-                elif corrected_text is not None:
-                    # Apply correction if text changed
-                    current_text = getattr(element, "text", None)
-                    if corrected_text != current_text:
-                        element.text = corrected_text
-                        updated_count += 1
-            except Exception as exc:
-                # Catch errors from future.result() itself
-                element = future_to_element[future]  # Find original element
-                logger.error(
-                    f"Page {self.number}: Internal error retrieving correction result for element {element.bbox}: {exc}",
-                    exc_info=True,
-                )
-                error_count += 1
-            # Note: progress_callback was already called in the worker's finally block
+                        # Note: progress_callback was already called in the worker's finally block
 
+            else:
+                # --- Sequential execution --- #
+                logger.info(f"Page {self.number}: Running OCR correction sequentially.")
+                for element in target_elements:
+                    # Call the task function directly (it handles progress_callback)
+                    processed_count += 1
+                    _element, corrected_text, error = _process_element_task(element)
+                    if error:
+                        error_count += 1
+                    elif corrected_text is not None:
+                        # Apply correction if text changed
+                        current_text = getattr(_element, "text", None)
+                        if corrected_text != current_text:
+                            _element.text = corrected_text
+                            updated_count += 1
 
+            logger.info(
+                f"Page {self.number}: OCR correction finished. Processed: {processed_count}/{len(target_elements)}, Updated: {updated_count}, Errors: {error_count}."
+            )
 
+            return self  # Return self for chaining
+        finally:
+            if element_pbar:
+                element_pbar.close()
 
     # --- Classification Mixin Implementation --- #
     def _get_classification_manager(self) -> "ClassificationManager":