natural-pdf 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/categorizing-documents/index.md +168 -0
- docs/data-extraction/index.md +87 -0
- docs/element-selection/index.ipynb +218 -164
- docs/element-selection/index.md +20 -0
- docs/finetuning/index.md +176 -0
- docs/index.md +19 -0
- docs/ocr/index.md +63 -16
- docs/tutorials/01-loading-and-extraction.ipynb +411 -248
- docs/tutorials/02-finding-elements.ipynb +123 -46
- docs/tutorials/03-extracting-blocks.ipynb +24 -19
- docs/tutorials/04-table-extraction.ipynb +17 -12
- docs/tutorials/05-excluding-content.ipynb +37 -32
- docs/tutorials/06-document-qa.ipynb +36 -31
- docs/tutorials/07-layout-analysis.ipynb +45 -40
- docs/tutorials/07-working-with-regions.ipynb +61 -60
- docs/tutorials/08-spatial-navigation.ipynb +76 -71
- docs/tutorials/09-section-extraction.ipynb +160 -155
- docs/tutorials/10-form-field-extraction.ipynb +71 -66
- docs/tutorials/11-enhanced-table-processing.ipynb +11 -6
- docs/tutorials/12-ocr-integration.ipynb +3420 -312
- docs/tutorials/12-ocr-integration.md +68 -106
- docs/tutorials/13-semantic-search.ipynb +641 -251
- natural_pdf/__init__.py +3 -0
- natural_pdf/analyzers/layout/gemini.py +63 -47
- natural_pdf/classification/manager.py +343 -0
- natural_pdf/classification/mixin.py +149 -0
- natural_pdf/classification/results.py +62 -0
- natural_pdf/collections/mixins.py +63 -0
- natural_pdf/collections/pdf_collection.py +326 -17
- natural_pdf/core/element_manager.py +73 -4
- natural_pdf/core/page.py +255 -83
- natural_pdf/core/pdf.py +385 -367
- natural_pdf/elements/base.py +1 -3
- natural_pdf/elements/collections.py +279 -49
- natural_pdf/elements/region.py +106 -21
- natural_pdf/elements/text.py +5 -2
- natural_pdf/exporters/__init__.py +4 -0
- natural_pdf/exporters/base.py +61 -0
- natural_pdf/exporters/paddleocr.py +345 -0
- natural_pdf/extraction/manager.py +134 -0
- natural_pdf/extraction/mixin.py +246 -0
- natural_pdf/extraction/result.py +37 -0
- natural_pdf/ocr/__init__.py +16 -8
- natural_pdf/ocr/engine.py +46 -30
- natural_pdf/ocr/engine_easyocr.py +86 -42
- natural_pdf/ocr/engine_paddle.py +39 -28
- natural_pdf/ocr/engine_surya.py +32 -16
- natural_pdf/ocr/ocr_factory.py +34 -23
- natural_pdf/ocr/ocr_manager.py +98 -34
- natural_pdf/ocr/ocr_options.py +38 -10
- natural_pdf/ocr/utils.py +59 -33
- natural_pdf/qa/document_qa.py +0 -4
- natural_pdf/selectors/parser.py +363 -238
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +420 -0
- natural_pdf/utils/debug.py +4 -2
- natural_pdf/utils/identifiers.py +9 -5
- natural_pdf/utils/locks.py +8 -0
- natural_pdf/utils/packaging.py +172 -105
- natural_pdf/utils/text_extraction.py +96 -65
- natural_pdf/utils/tqdm_utils.py +43 -0
- natural_pdf/utils/visualization.py +1 -1
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/METADATA +10 -3
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/RECORD +66 -51
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/top_level.txt +0 -0
@@ -359,8 +359,10 @@ class ElementManager:
|
|
359
359
|
|
360
360
|
# Handle potential None confidence
|
361
361
|
raw_confidence = result.get("confidence")
|
362
|
-
confidence_value =
|
363
|
-
|
362
|
+
confidence_value = (
|
363
|
+
float(raw_confidence) if raw_confidence is not None else None
|
364
|
+
) # Keep None if it was None
|
365
|
+
ocr_text = result.get("text") # Get text, will be None if detect_only
|
364
366
|
|
365
367
|
# Create the TextElement for the word
|
366
368
|
word_element_data = {
|
@@ -373,7 +375,7 @@ class ElementManager:
|
|
373
375
|
"height": pdf_height,
|
374
376
|
"object_type": "word", # Treat OCR results as whole words
|
375
377
|
"source": "ocr",
|
376
|
-
"confidence": confidence_value,
|
378
|
+
"confidence": confidence_value, # Use the handled confidence
|
377
379
|
"fontname": "OCR", # Use consistent OCR fontname
|
378
380
|
"size": (
|
379
381
|
round(pdf_height) if pdf_height > 0 else 10.0
|
@@ -391,7 +393,7 @@ class ElementManager:
|
|
391
393
|
ocr_char_dict.setdefault("adv", ocr_char_dict.get("width", 0))
|
392
394
|
|
393
395
|
# Add the char dict list to the word data before creating TextElement
|
394
|
-
word_element_data["_char_dicts"] = [ocr_char_dict]
|
396
|
+
word_element_data["_char_dicts"] = [ocr_char_dict] # Store itself as its only char
|
395
397
|
|
396
398
|
word_elem = TextElement(word_element_data, self._page)
|
397
399
|
added_word_elements.append(word_elem)
|
@@ -537,3 +539,70 @@ class ElementManager:
|
|
537
539
|
"""Get all region elements."""
|
538
540
|
self.load_elements()
|
539
541
|
return self._elements.get("regions", [])
|
542
|
+
|
543
|
+
def remove_ocr_elements(self):
    """
    Remove all elements with source="ocr" from the elements dictionary.

    Only the "words" and "chars" collections are filtered; other collections
    are left untouched. This should be called before adding new OCR elements
    if replacement is desired.

    Returns:
        int: Number of OCR elements removed
    """

    def _is_ocr(item):
        # Entries may be raw dicts or element objects. Read the source the
        # same way for every collection so a dict-shaped entry is never
        # skipped just because it lives in "words" rather than "chars".
        if isinstance(item, dict):
            return item.get("source") == "ocr"
        return getattr(item, "source", None) == "ocr"

    # Load elements if not already loaded
    self.load_elements()

    removed_count = 0
    for element_type in ("words", "chars"):
        if element_type not in self._elements:
            continue
        existing = self._elements[element_type]
        kept = [item for item in existing if not _is_ocr(item)]
        removed_count += len(existing) - len(kept)
        # Rebind to a new list rather than mutating in place (same as before).
        self._elements[element_type] = kept

    logger.info(f"Page {self._page.number}: Removed {removed_count} OCR elements.")
    return removed_count
|
577
|
+
|
578
|
+
def remove_element(self, element, element_type="words"):
    """
    Remove a specific element from the managed elements.

    Only the first entry equal to ``element`` is removed.

    Args:
        element: The element to remove
        element_type: The type of element ('words', 'chars', etc.)

    Returns:
        bool: True if removed successfully, False otherwise
    """
    # Load elements if not already loaded
    self.load_elements()

    # Check if the collection exists
    if element_type not in self._elements:
        logger.warning(f"Cannot remove element: collection '{element_type}' does not exist")
        return False

    try:
        # list.remove() performs the membership scan itself, so a separate
        # `element in ...` check would walk the collection a second time.
        self._elements[element_type].remove(element)
        logger.debug(f"Removed element from {element_type}: {element}")
        return True
    except ValueError:
        # Raised by list.remove() when no entry compares equal.
        logger.debug(f"Element not found in {element_type}: {element}")
        return False
    except Exception as e:
        # Best-effort API: unexpected failures are reported as "not removed"
        # instead of propagating.
        logger.error(f"Error removing element from {element_type}: {e}", exc_info=True)
        return False
|
natural_pdf/core/page.py
CHANGED
@@ -6,14 +6,19 @@ import logging
|
|
6
6
|
import os
|
7
7
|
import re
|
8
8
|
import tempfile
|
9
|
+
import time # Import time
|
9
10
|
from pathlib import Path
|
10
11
|
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
|
12
|
+
import concurrent.futures # Added import
|
13
|
+
from tqdm.auto import tqdm # Added tqdm import
|
14
|
+
import threading
|
11
15
|
|
12
16
|
import pdfplumber
|
13
17
|
from PIL import Image, ImageDraw
|
14
18
|
|
15
19
|
from natural_pdf.elements.collections import ElementCollection
|
16
20
|
from natural_pdf.elements.region import Region
|
21
|
+
from natural_pdf.utils.locks import pdf_render_lock # Import from utils instead
|
17
22
|
|
18
23
|
if TYPE_CHECKING:
|
19
24
|
import pdfplumber
|
@@ -46,10 +51,20 @@ from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, SimpleInteractiveV
|
|
46
51
|
from natural_pdf.qa import DocumentQA, get_qa_engine
|
47
52
|
from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
|
48
53
|
|
54
|
+
# --- Classification Imports --- #
|
55
|
+
from natural_pdf.classification.mixin import ClassificationMixin
|
56
|
+
from natural_pdf.classification.manager import ClassificationManager # For type hint
|
57
|
+
# --- End Classification Imports --- #
|
58
|
+
|
59
|
+
from natural_pdf.utils.locks import pdf_render_lock # Import the lock
|
60
|
+
from natural_pdf.elements.base import Element # Import base element
|
61
|
+
from natural_pdf.classification.mixin import ClassificationMixin # Import classification mixin
|
62
|
+
from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
|
63
|
+
|
49
64
|
logger = logging.getLogger(__name__)
|
50
65
|
|
51
66
|
|
52
|
-
class Page:
|
67
|
+
class Page(ClassificationMixin, ExtractionMixin):
|
53
68
|
"""
|
54
69
|
Enhanced Page wrapper built on top of pdfplumber.Page.
|
55
70
|
|
@@ -73,14 +88,21 @@ class Page:
|
|
73
88
|
self._text_styles = None # Lazy-loaded text style analyzer results
|
74
89
|
self._exclusions = [] # List to store exclusion functions/regions
|
75
90
|
|
91
|
+
# --- ADDED --- Metadata store for mixins
|
92
|
+
self.metadata: Dict[str, Any] = {}
|
93
|
+
# --- END ADDED ---
|
94
|
+
|
76
95
|
# Region management
|
77
96
|
self._regions = {
|
78
97
|
"detected": [], # Layout detection results
|
79
98
|
"named": {}, # Named regions (name -> region)
|
80
99
|
}
|
81
100
|
|
82
|
-
# Initialize ElementManager
|
83
|
-
self._element_mgr = ElementManager(self, font_attrs)
|
101
|
+
# Initialize ElementManager, passing font_attrs
|
102
|
+
self._element_mgr = ElementManager(self, font_attrs=font_attrs)
|
103
|
+
# self._highlighter = HighlightingService(self) # REMOVED - Use property accessor
|
104
|
+
# --- NEW --- Central registry for analysis results
|
105
|
+
self.analyses: Dict[str, Any] = {}
|
84
106
|
|
85
107
|
# --- Get OCR Manager Instance ---
|
86
108
|
if (
|
@@ -115,6 +137,8 @@ class Page:
|
|
115
137
|
# Initialize the internal variable with a single underscore
|
116
138
|
self._layout_analyzer = None
|
117
139
|
|
140
|
+
self._load_elements()
|
141
|
+
|
118
142
|
@property
|
119
143
|
def pdf(self) -> "PDF":
|
120
144
|
"""Provides public access to the parent PDF object."""
|
@@ -1233,7 +1257,7 @@ class Page:
|
|
1233
1257
|
render_ocr: bool = False,
|
1234
1258
|
resolution: Optional[float] = None,
|
1235
1259
|
include_highlights: bool = True,
|
1236
|
-
exclusions: Optional[str] = None,
|
1260
|
+
exclusions: Optional[str] = None, # New parameter
|
1237
1261
|
**kwargs,
|
1238
1262
|
) -> Optional[Image.Image]:
|
1239
1263
|
"""
|
@@ -1257,38 +1281,48 @@ class Page:
|
|
1257
1281
|
"""
|
1258
1282
|
image = None
|
1259
1283
|
render_resolution = resolution if resolution is not None else scale * 72
|
1284
|
+
thread_id = threading.current_thread().name
|
1285
|
+
logger.debug(f"[{thread_id}] Page {self.index}: Attempting to acquire pdf_render_lock for to_image...")
|
1286
|
+
lock_wait_start = time.monotonic()
|
1260
1287
|
try:
|
1261
|
-
|
1262
|
-
|
1263
|
-
|
1264
|
-
|
1265
|
-
|
1266
|
-
|
1267
|
-
|
1268
|
-
|
1269
|
-
|
1270
|
-
|
1271
|
-
|
1272
|
-
|
1273
|
-
|
1274
|
-
|
1275
|
-
|
1276
|
-
|
1277
|
-
|
1278
|
-
|
1279
|
-
|
1280
|
-
|
1281
|
-
|
1282
|
-
|
1283
|
-
|
1288
|
+
# Acquire the global PDF rendering lock
|
1289
|
+
with pdf_render_lock:
|
1290
|
+
lock_acquired_time = time.monotonic()
|
1291
|
+
logger.debug(f"[{thread_id}] Page {self.index}: Acquired pdf_render_lock (waited {lock_acquired_time - lock_wait_start:.2f}s). Starting render...")
|
1292
|
+
if include_highlights:
|
1293
|
+
# Delegate rendering to the central service
|
1294
|
+
image = self._highlighter.render_page(
|
1295
|
+
page_index=self.index,
|
1296
|
+
scale=scale, # Note: scale is used by highlighter internally for drawing
|
1297
|
+
labels=labels,
|
1298
|
+
legend_position=legend_position,
|
1299
|
+
render_ocr=render_ocr,
|
1300
|
+
resolution=render_resolution, # Pass the calculated resolution
|
1301
|
+
**kwargs,
|
1302
|
+
)
|
1303
|
+
else:
|
1304
|
+
# Get the base page image directly from pdfplumber if no highlights needed
|
1305
|
+
# Use the underlying pdfplumber page object
|
1306
|
+
img_object = self._page.to_image(resolution=render_resolution, **kwargs)
|
1307
|
+
# Access the PIL image directly (assuming pdfplumber structure)
|
1308
|
+
image = (
|
1309
|
+
img_object.annotated
|
1310
|
+
if hasattr(img_object, "annotated")
|
1311
|
+
else img_object._repr_png_()
|
1312
|
+
)
|
1313
|
+
if isinstance(image, bytes): # Handle cases where it returns bytes
|
1314
|
+
from io import BytesIO
|
1284
1315
|
|
1285
|
-
|
1286
|
-
|
1287
|
-
|
1316
|
+
image = Image.open(BytesIO(image)).convert(
|
1317
|
+
"RGB"
|
1318
|
+
) # Convert to RGB for consistency
|
1288
1319
|
|
1289
1320
|
except Exception as e:
|
1290
1321
|
logger.error(f"Error rendering page {self.index}: {e}", exc_info=True)
|
1291
1322
|
return None # Return None on error
|
1323
|
+
finally:
|
1324
|
+
render_end_time = time.monotonic()
|
1325
|
+
logger.debug(f"[{thread_id}] Page {self.index}: Released pdf_render_lock. Total render time (incl. lock wait): {render_end_time - lock_wait_start:.2f}s")
|
1292
1326
|
|
1293
1327
|
if image is None:
|
1294
1328
|
return None
|
@@ -1322,16 +1356,21 @@ class Page:
|
|
1322
1356
|
max(0, img_x0),
|
1323
1357
|
max(0, img_top),
|
1324
1358
|
min(image.width, img_x1),
|
1325
|
-
min(image.height, img_bottom)
|
1359
|
+
min(image.height, img_bottom),
|
1326
1360
|
)
|
1327
1361
|
if img_coords[0] < img_coords[2] and img_coords[1] < img_coords[3]:
|
1328
|
-
|
1362
|
+
draw.rectangle(img_coords, fill="white")
|
1329
1363
|
else:
|
1330
|
-
|
1364
|
+
logger.warning(
|
1365
|
+
f"Skipping invalid exclusion rect for masking: {img_coords}"
|
1366
|
+
)
|
1331
1367
|
|
1332
|
-
del draw
|
1368
|
+
del draw # Release drawing context
|
1333
1369
|
except Exception as mask_error:
|
1334
|
-
logger.error(
|
1370
|
+
logger.error(
|
1371
|
+
f"Error applying exclusion mask to page {self.index}: {mask_error}",
|
1372
|
+
exc_info=True,
|
1373
|
+
)
|
1335
1374
|
# Decide if you want to return None or continue without mask
|
1336
1375
|
# For now, continue without mask
|
1337
1376
|
|
@@ -1379,6 +1418,7 @@ class Page:
|
|
1379
1418
|
resolution: Optional[int] = None,
|
1380
1419
|
detect_only: bool = False,
|
1381
1420
|
apply_exclusions: bool = True,
|
1421
|
+
replace: bool = True,
|
1382
1422
|
) -> "Page":
|
1383
1423
|
"""
|
1384
1424
|
Apply OCR to THIS page and add results to page elements via PDF.apply_ocr.
|
@@ -1392,13 +1432,21 @@ class Page:
|
|
1392
1432
|
resolution: DPI resolution for rendering page image before OCR.
|
1393
1433
|
apply_exclusions: If True (default), render page image for OCR
|
1394
1434
|
with excluded areas masked (whited out).
|
1435
|
+
detect_only: If True, only detect text bounding boxes, don't perform OCR.
|
1436
|
+
replace: If True (default), remove any existing OCR elements before
|
1437
|
+
adding new ones. If False, add new OCR elements to existing ones.
|
1395
1438
|
|
1396
1439
|
Returns:
|
1397
|
-
|
1440
|
+
Self for method chaining.
|
1398
1441
|
"""
|
1399
1442
|
if not hasattr(self._parent, "apply_ocr"):
|
1400
1443
|
logger.error(f"Page {self.number}: Parent PDF missing 'apply_ocr'. Cannot apply OCR.")
|
1401
|
-
return
|
1444
|
+
return self # Return self for chaining
|
1445
|
+
|
1446
|
+
# Remove existing OCR elements if replace is True
|
1447
|
+
if replace and hasattr(self, "_element_mgr"):
|
1448
|
+
logger.info(f"Page {self.number}: Removing existing OCR elements before applying new OCR.")
|
1449
|
+
self._element_mgr.remove_ocr_elements()
|
1402
1450
|
|
1403
1451
|
logger.info(f"Page {self.number}: Delegating apply_ocr to PDF.apply_ocr.")
|
1404
1452
|
try:
|
@@ -1414,18 +1462,13 @@ class Page:
|
|
1414
1462
|
resolution=resolution,
|
1415
1463
|
detect_only=detect_only,
|
1416
1464
|
apply_exclusions=apply_exclusions,
|
1465
|
+
replace=replace, # Pass the replace parameter to PDF.apply_ocr
|
1417
1466
|
)
|
1418
1467
|
except Exception as e:
|
1419
1468
|
logger.error(f"Page {self.number}: Error during delegated OCR call: {e}", exc_info=True)
|
1420
|
-
return
|
1469
|
+
return self # Return self for chaining
|
1421
1470
|
|
1422
|
-
# Return
|
1423
|
-
ocr_elements = [el for el in self.words if getattr(el, "source", None) == "ocr"]
|
1424
|
-
logger.debug(
|
1425
|
-
f"Page {self.number}: apply_ocr completed. Found {len(ocr_elements)} OCR elements."
|
1426
|
-
)
|
1427
|
-
# Note: The method is typed to return Page for chaining, but the log indicates
|
1428
|
-
# finding elements. Let's stick to returning self for chaining consistency.
|
1471
|
+
# Return self for chaining
|
1429
1472
|
return self
|
1430
1473
|
|
1431
1474
|
def extract_ocr_elements(
|
@@ -1459,30 +1502,32 @@ class Page:
|
|
1459
1502
|
return []
|
1460
1503
|
|
1461
1504
|
logger.info(f"Page {self.number}: Extracting OCR elements (extract only)...")
|
1462
|
-
|
1505
|
+
|
1463
1506
|
# Determine rendering resolution
|
1464
|
-
final_resolution = resolution if resolution is not None else 150
|
1507
|
+
final_resolution = resolution if resolution is not None else 150 # Default to 150 DPI
|
1465
1508
|
logger.debug(f" Using rendering resolution: {final_resolution} DPI")
|
1466
|
-
|
1509
|
+
|
1467
1510
|
try:
|
1468
1511
|
# Get base image without highlights using the determined resolution
|
1469
|
-
|
1470
|
-
|
1471
|
-
|
1472
|
-
|
1473
|
-
|
1512
|
+
# Use the global PDF rendering lock
|
1513
|
+
with pdf_render_lock:
|
1514
|
+
image = self.to_image(resolution=final_resolution, include_highlights=False)
|
1515
|
+
if not image:
|
1516
|
+
logger.error(f" Failed to render page {self.number} to image for OCR extraction.")
|
1517
|
+
return []
|
1518
|
+
logger.debug(f" Rendered image size: {image.width}x{image.height}")
|
1474
1519
|
except Exception as e:
|
1475
1520
|
logger.error(f" Failed to render page {self.number} to image: {e}", exc_info=True)
|
1476
1521
|
return []
|
1477
1522
|
|
1478
1523
|
# Prepare arguments for the OCR Manager call
|
1479
1524
|
manager_args = {
|
1480
|
-
|
1481
|
-
|
1482
|
-
|
1483
|
-
|
1484
|
-
|
1485
|
-
|
1525
|
+
"images": image,
|
1526
|
+
"engine": engine,
|
1527
|
+
"languages": languages,
|
1528
|
+
"min_confidence": min_confidence,
|
1529
|
+
"device": device,
|
1530
|
+
"options": options,
|
1486
1531
|
}
|
1487
1532
|
manager_args = {k: v for k, v in manager_args.items() if v is not None}
|
1488
1533
|
|
@@ -1514,7 +1559,7 @@ class Page:
|
|
1514
1559
|
scale_x = self.width / image.width if image.width else 1
|
1515
1560
|
scale_y = self.height / image.height if image.height else 1
|
1516
1561
|
for result in results:
|
1517
|
-
try:
|
1562
|
+
try: # Added try-except around result processing
|
1518
1563
|
x0, top, x1, bottom = [float(c) for c in result["bbox"]]
|
1519
1564
|
elem_data = {
|
1520
1565
|
"text": result["text"],
|
@@ -1525,15 +1570,17 @@ class Page:
|
|
1525
1570
|
"bottom": bottom * scale_y,
|
1526
1571
|
"width": (x1 - x0) * scale_x,
|
1527
1572
|
"height": (bottom - top) * scale_y,
|
1528
|
-
"object_type": "text",
|
1573
|
+
"object_type": "text", # Using text for temporary elements
|
1529
1574
|
"source": "ocr",
|
1530
|
-
"fontname": "OCR-extract",
|
1575
|
+
"fontname": "OCR-extract", # Different name for clarity
|
1531
1576
|
"size": 10.0,
|
1532
1577
|
"page_number": self.number,
|
1533
1578
|
}
|
1534
1579
|
temp_elements.append(TextElement(elem_data, self))
|
1535
1580
|
except (KeyError, ValueError, TypeError) as convert_err:
|
1536
|
-
|
1581
|
+
logger.warning(
|
1582
|
+
f" Skipping invalid OCR result during conversion: {result}. Error: {convert_err}"
|
1583
|
+
)
|
1537
1584
|
|
1538
1585
|
logger.info(f" Created {len(temp_elements)} TextElements from OCR (extract only).")
|
1539
1586
|
return temp_elements
|
@@ -2020,41 +2067,166 @@ class Page:
|
|
2020
2067
|
def correct_ocr(
    self,
    correction_callback: Callable[[Any], Optional[str]],
    max_workers: Optional[int] = None,
    progress_callback: Optional[Callable[[], None]] = None,
) -> "Page":
    """
    Applies corrections to OCR-generated text elements on this page
    using a user-provided callback function, potentially in parallel.

    Finds the elements matched by the selector ``text[source=ocr]``
    (exclusions are not applied) and calls the `correction_callback` for
    each, passing the element itself. Updates the element's text if the
    callback returns a string that differs from the current text.

    Exceptions raised by `correction_callback` are logged and counted;
    they do not stop processing of the remaining elements.

    Args:
        correction_callback: A function accepting an element and returning
                             `Optional[str]` (new text or None). Any callable
                             works, including `functools.partial` objects.
        max_workers: The maximum number of threads to use for parallel execution.
                     If None or 0 or 1, runs sequentially.
        progress_callback: Optional callback function to call after processing each element.
                           In parallel mode it is invoked from the worker threads.

    Returns:
        Self for method chaining.
    """
    # functools.partial objects and callable instances have no __name__,
    # so fall back to repr() instead of failing before any work is done.
    callback_name = getattr(correction_callback, "__name__", repr(correction_callback))
    logger.info(
        f"Page {self.number}: Starting OCR correction with callback '{callback_name}' (max_workers={max_workers})"
    )

    target_elements = self.find_all(
        selector="text[source=ocr]", apply_exclusions=False
    ).elements

    if not target_elements:
        logger.info(f"Page {self.number}: No OCR elements found to correct.")
        return self

    processed_count = 0
    updated_count = 0
    error_count = 0

    def _text_preview(element, limit):
        # The text may be None (e.g. detect-only OCR); slicing None would
        # raise inside the logging paths below and mask the real problem.
        return str(getattr(element, "text", None) or "")[:limit]

    def _process_element_task(element):
        """Run the callback for one element; return (element, new_text, error)."""
        try:
            corrected_text = correction_callback(element)

            # Validate result type
            if corrected_text is not None and not isinstance(corrected_text, str):
                logger.warning(
                    f"Page {self.number}: Correction callback for element '{_text_preview(element, 20)}...' returned non-string, non-None type: {type(corrected_text)}. Skipping update."
                )
                return element, None, None  # Treat as no correction

            return element, corrected_text, None
        except Exception as e:
            logger.error(
                f"Page {self.number}: Error applying correction callback to element '{_text_preview(element, 30)}...' ({getattr(element, 'bbox', None)}): {e}",
                exc_info=False,  # Keep log concise
            )
            return element, None, e
        finally:
            # Report progress for every element, successful or not.
            if progress_callback:
                try:
                    progress_callback()
                except Exception as cb_e:
                    # A broken progress callback must not stop processing.
                    logger.error(
                        f"Page {self.number}: Error executing progress_callback: {cb_e}",
                        exc_info=False,
                    )

    def _apply_correction(element, corrected_text):
        """Write corrected_text to the element if it differs; return True on change."""
        if corrected_text is None or corrected_text == getattr(element, "text", None):
            return False
        element.text = corrected_text
        return True

    if max_workers is not None and max_workers > 1:
        # --- Parallel execution --- #
        logger.info(
            f"Page {self.number}: Running OCR correction in parallel with {max_workers} workers."
        )
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_element = {
                executor.submit(_process_element_task, element): element
                for element in target_elements
            }

            # Corrections are applied here, in the calling thread, as the
            # workers finish; the workers only compute the new text.
            for future in concurrent.futures.as_completed(future_to_element):
                processed_count += 1
                try:
                    element, corrected_text, error = future.result()
                    if error is not None:
                        error_count += 1  # Already logged in the worker
                    elif _apply_correction(element, corrected_text):
                        updated_count += 1
                except Exception as exc:
                    # Failures from future.result() itself or from assigning the text
                    element = future_to_element[future]
                    logger.error(
                        f"Page {self.number}: Internal error retrieving correction result for element {getattr(element, 'bbox', None)}: {exc}",
                        exc_info=True,
                    )
                    error_count += 1
    else:
        # --- Sequential execution --- #
        logger.info(f"Page {self.number}: Running OCR correction sequentially.")
        for element in target_elements:
            processed_count += 1
            _element, corrected_text, error = _process_element_task(element)
            if error is not None:
                error_count += 1
            elif _apply_correction(_element, corrected_text):
                updated_count += 1

    logger.info(
        f"Page {self.number}: OCR correction finished. Processed: {processed_count}/{len(target_elements)}, Updated: {updated_count}, Errors: {error_count}."
    )

    return self  # Return self for chaining
|
2188
|
+
|
2189
|
+
# --- Classification Mixin Implementation --- #
|
2190
|
+
def _get_classification_manager(self) -> "ClassificationManager":
|
2191
|
+
if not hasattr(self, 'pdf') or not hasattr(self.pdf, 'get_manager'):
|
2192
|
+
raise AttributeError("ClassificationManager cannot be accessed: Parent PDF or get_manager method missing.")
|
2193
|
+
try:
|
2194
|
+
# Use the PDF's manager registry accessor
|
2195
|
+
return self.pdf.get_manager('classification')
|
2196
|
+
except (ValueError, RuntimeError, AttributeError) as e:
|
2197
|
+
# Wrap potential errors from get_manager for clarity
|
2198
|
+
raise AttributeError(f"Failed to get ClassificationManager from PDF: {e}") from e
|
2199
|
+
|
2200
|
+
def _get_classification_content(self, model_type: str, **kwargs) -> Union[str, "Image"]: # Use "Image" for lazy import
|
2201
|
+
if model_type == 'text':
|
2202
|
+
text_content = self.extract_text(layout=False, use_exclusions=False) # Simple join, ignore exclusions for classification
|
2203
|
+
if not text_content or text_content.isspace():
|
2204
|
+
raise ValueError("Cannot classify page with 'text' model: No text content found.")
|
2205
|
+
return text_content
|
2206
|
+
elif model_type == 'vision':
|
2207
|
+
# Get resolution from manager/kwargs if possible, else default
|
2208
|
+
manager = self._get_classification_manager()
|
2209
|
+
default_resolution = 150
|
2210
|
+
# Access kwargs passed to classify method if needed
|
2211
|
+
resolution = kwargs.get('resolution', default_resolution) if 'kwargs' in locals() else default_resolution
|
2212
|
+
|
2213
|
+
# Use to_image, ensuring no highlights interfere
|
2214
|
+
img = self.to_image(
|
2215
|
+
resolution=resolution,
|
2216
|
+
include_highlights=False,
|
2217
|
+
labels=False,
|
2218
|
+
exclusions=None # Don't mask exclusions for classification input image
|
2219
|
+
)
|
2220
|
+
if img is None:
|
2221
|
+
raise ValueError("Cannot classify page with 'vision' model: Failed to render image.")
|
2222
|
+
return img
|
2223
|
+
else:
|
2224
|
+
raise ValueError(f"Unsupported model_type for classification: {model_type}")
|
2225
|
+
|
2226
|
+
def _get_metadata_storage(self) -> Dict[str, Any]:
|
2227
|
+
# Ensure metadata exists
|
2228
|
+
if not hasattr(self, 'metadata') or self.metadata is None:
|
2229
|
+
self.metadata = {}
|
2230
|
+
return self.metadata
|
2231
|
+
|
2232
|
+
# --- Content Extraction ---
|