natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +3 -0
- natural_pdf/analyzers/layout/base.py +1 -5
- natural_pdf/analyzers/layout/gemini.py +61 -51
- natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
- natural_pdf/analyzers/layout/layout_manager.py +26 -84
- natural_pdf/analyzers/layout/layout_options.py +7 -0
- natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
- natural_pdf/analyzers/layout/surya.py +46 -123
- natural_pdf/analyzers/layout/tatr.py +51 -4
- natural_pdf/analyzers/text_structure.py +3 -5
- natural_pdf/analyzers/utils.py +3 -3
- natural_pdf/classification/manager.py +422 -0
- natural_pdf/classification/mixin.py +163 -0
- natural_pdf/classification/results.py +80 -0
- natural_pdf/collections/mixins.py +111 -0
- natural_pdf/collections/pdf_collection.py +434 -15
- natural_pdf/core/element_manager.py +83 -0
- natural_pdf/core/highlighting_service.py +13 -22
- natural_pdf/core/page.py +578 -93
- natural_pdf/core/pdf.py +912 -460
- natural_pdf/elements/base.py +134 -40
- natural_pdf/elements/collections.py +712 -109
- natural_pdf/elements/region.py +722 -69
- natural_pdf/elements/text.py +4 -1
- natural_pdf/export/mixin.py +137 -0
- natural_pdf/exporters/base.py +3 -3
- natural_pdf/exporters/paddleocr.py +5 -4
- natural_pdf/extraction/manager.py +135 -0
- natural_pdf/extraction/mixin.py +279 -0
- natural_pdf/extraction/result.py +23 -0
- natural_pdf/ocr/__init__.py +5 -5
- natural_pdf/ocr/engine_doctr.py +346 -0
- natural_pdf/ocr/engine_easyocr.py +6 -3
- natural_pdf/ocr/ocr_factory.py +24 -4
- natural_pdf/ocr/ocr_manager.py +122 -26
- natural_pdf/ocr/ocr_options.py +94 -11
- natural_pdf/ocr/utils.py +19 -6
- natural_pdf/qa/document_qa.py +0 -4
- natural_pdf/search/__init__.py +20 -34
- natural_pdf/search/haystack_search_service.py +309 -265
- natural_pdf/search/haystack_utils.py +99 -75
- natural_pdf/search/search_service_protocol.py +11 -12
- natural_pdf/selectors/parser.py +431 -230
- natural_pdf/utils/debug.py +3 -3
- natural_pdf/utils/identifiers.py +1 -1
- natural_pdf/utils/locks.py +8 -0
- natural_pdf/utils/packaging.py +8 -6
- natural_pdf/utils/text_extraction.py +60 -1
- natural_pdf/utils/tqdm_utils.py +51 -0
- natural_pdf/utils/visualization.py +18 -0
- natural_pdf/widgets/viewer.py +4 -25
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
- natural_pdf-0.1.9.dist-info/RECORD +80 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
- docs/api/index.md +0 -386
- docs/assets/favicon.png +0 -3
- docs/assets/favicon.svg +0 -3
- docs/assets/javascripts/custom.js +0 -17
- docs/assets/logo.svg +0 -3
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +0 -17
- docs/assets/social-preview.svg +0 -17
- docs/assets/stylesheets/custom.css +0 -65
- docs/document-qa/index.ipynb +0 -435
- docs/document-qa/index.md +0 -79
- docs/element-selection/index.ipynb +0 -915
- docs/element-selection/index.md +0 -229
- docs/finetuning/index.md +0 -176
- docs/index.md +0 -170
- docs/installation/index.md +0 -69
- docs/interactive-widget/index.ipynb +0 -962
- docs/interactive-widget/index.md +0 -12
- docs/layout-analysis/index.ipynb +0 -818
- docs/layout-analysis/index.md +0 -185
- docs/ocr/index.md +0 -209
- docs/pdf-navigation/index.ipynb +0 -314
- docs/pdf-navigation/index.md +0 -97
- docs/regions/index.ipynb +0 -816
- docs/regions/index.md +0 -294
- docs/tables/index.ipynb +0 -658
- docs/tables/index.md +0 -144
- docs/text-analysis/index.ipynb +0 -370
- docs/text-analysis/index.md +0 -105
- docs/text-extraction/index.ipynb +0 -1478
- docs/text-extraction/index.md +0 -292
- docs/tutorials/01-loading-and-extraction.ipynb +0 -194
- docs/tutorials/01-loading-and-extraction.md +0 -95
- docs/tutorials/02-finding-elements.ipynb +0 -340
- docs/tutorials/02-finding-elements.md +0 -149
- docs/tutorials/03-extracting-blocks.ipynb +0 -147
- docs/tutorials/03-extracting-blocks.md +0 -48
- docs/tutorials/04-table-extraction.ipynb +0 -114
- docs/tutorials/04-table-extraction.md +0 -50
- docs/tutorials/05-excluding-content.ipynb +0 -270
- docs/tutorials/05-excluding-content.md +0 -109
- docs/tutorials/06-document-qa.ipynb +0 -332
- docs/tutorials/06-document-qa.md +0 -91
- docs/tutorials/07-layout-analysis.ipynb +0 -288
- docs/tutorials/07-layout-analysis.md +0 -66
- docs/tutorials/07-working-with-regions.ipynb +0 -413
- docs/tutorials/07-working-with-regions.md +0 -151
- docs/tutorials/08-spatial-navigation.ipynb +0 -508
- docs/tutorials/08-spatial-navigation.md +0 -190
- docs/tutorials/09-section-extraction.ipynb +0 -2434
- docs/tutorials/09-section-extraction.md +0 -256
- docs/tutorials/10-form-field-extraction.ipynb +0 -512
- docs/tutorials/10-form-field-extraction.md +0 -201
- docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
- docs/tutorials/11-enhanced-table-processing.md +0 -9
- docs/tutorials/12-ocr-integration.ipynb +0 -604
- docs/tutorials/12-ocr-integration.md +0 -175
- docs/tutorials/13-semantic-search.ipynb +0 -1328
- docs/tutorials/13-semantic-search.md +0 -77
- docs/visual-debugging/index.ipynb +0 -2970
- docs/visual-debugging/index.md +0 -157
- docs/visual-debugging/region.png +0 -0
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
- natural_pdf/templates/spa/css/style.css +0 -334
- natural_pdf/templates/spa/index.html +0 -31
- natural_pdf/templates/spa/js/app.js +0 -472
- natural_pdf/templates/spa/words.txt +0 -235976
- natural_pdf/widgets/frontend/viewer.js +0 -88
- natural_pdf-0.1.7.dist-info/RECORD +0 -145
- notebooks/Examples.ipynb +0 -1293
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +0 -543
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
natural_pdf/core/page.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
import base64
|
2
|
+
import concurrent.futures # Added import
|
2
3
|
import hashlib
|
3
4
|
import io
|
4
5
|
import json
|
@@ -6,14 +7,30 @@ import logging
|
|
6
7
|
import os
|
7
8
|
import re
|
8
9
|
import tempfile
|
10
|
+
import threading
|
11
|
+
import time # Import time
|
9
12
|
from pathlib import Path
|
10
|
-
from typing import
|
13
|
+
from typing import ( # Added overload
|
14
|
+
TYPE_CHECKING,
|
15
|
+
Any,
|
16
|
+
Callable,
|
17
|
+
Dict,
|
18
|
+
List,
|
19
|
+
Optional,
|
20
|
+
Tuple,
|
21
|
+
Union,
|
22
|
+
overload,
|
23
|
+
)
|
11
24
|
|
12
25
|
import pdfplumber
|
13
26
|
from PIL import Image, ImageDraw
|
27
|
+
from tqdm.auto import tqdm # Added tqdm import
|
14
28
|
|
15
29
|
from natural_pdf.elements.collections import ElementCollection
|
16
30
|
from natural_pdf.elements.region import Region
|
31
|
+
from natural_pdf.selectors.parser import parse_selector
|
32
|
+
from natural_pdf.utils.locks import pdf_render_lock # Import from utils instead
|
33
|
+
from natural_pdf.utils.visualization import render_plain_page
|
17
34
|
|
18
35
|
if TYPE_CHECKING:
|
19
36
|
import pdfplumber
|
@@ -26,6 +43,8 @@ if TYPE_CHECKING:
|
|
26
43
|
# New Imports
|
27
44
|
import itertools
|
28
45
|
|
46
|
+
# Deskew Imports (Conditional)
|
47
|
+
import numpy as np
|
29
48
|
from pdfplumber.utils.geometry import get_bbox_overlap, merge_bboxes, objects_to_bbox
|
30
49
|
from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
|
31
50
|
|
@@ -34,22 +53,40 @@ from natural_pdf.analyzers.layout.layout_manager import LayoutManager
|
|
34
53
|
from natural_pdf.analyzers.layout.layout_options import LayoutOptions
|
35
54
|
from natural_pdf.analyzers.text_options import TextStyleOptions
|
36
55
|
from natural_pdf.analyzers.text_structure import TextStyleAnalyzer
|
56
|
+
from natural_pdf.classification.manager import ClassificationManager # For type hint
|
57
|
+
|
58
|
+
# --- Classification Imports --- #
|
59
|
+
from natural_pdf.classification.mixin import ClassificationMixin # Import classification mixin
|
37
60
|
from natural_pdf.core.element_manager import ElementManager
|
61
|
+
from natural_pdf.elements.base import Element # Import base element
|
38
62
|
from natural_pdf.elements.text import TextElement
|
63
|
+
from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
|
39
64
|
from natural_pdf.ocr import OCRManager, OCROptions
|
65
|
+
from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
|
66
|
+
from natural_pdf.qa import DocumentQA, get_qa_engine
|
67
|
+
from natural_pdf.utils.locks import pdf_render_lock # Import the lock
|
40
68
|
|
41
69
|
# Import new utils
|
42
70
|
from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
|
43
71
|
from natural_pdf.widgets import InteractiveViewerWidget
|
44
72
|
from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, SimpleInteractiveViewerWidget
|
45
73
|
|
46
|
-
|
47
|
-
|
74
|
+
# --- End Classification Imports --- #
|
75
|
+
|
76
|
+
|
77
|
+
try:
|
78
|
+
from deskew import determine_skew
|
79
|
+
|
80
|
+
DESKEW_AVAILABLE = True
|
81
|
+
except ImportError:
|
82
|
+
DESKEW_AVAILABLE = False
|
83
|
+
determine_skew = None
|
84
|
+
# End Deskew Imports
|
48
85
|
|
49
86
|
logger = logging.getLogger(__name__)
|
50
87
|
|
51
88
|
|
52
|
-
class Page:
|
89
|
+
class Page(ClassificationMixin, ExtractionMixin):
|
53
90
|
"""
|
54
91
|
Enhanced Page wrapper built on top of pdfplumber.Page.
|
55
92
|
|
@@ -72,6 +109,11 @@ class Page:
|
|
72
109
|
self._index = index
|
73
110
|
self._text_styles = None # Lazy-loaded text style analyzer results
|
74
111
|
self._exclusions = [] # List to store exclusion functions/regions
|
112
|
+
self._skew_angle: Optional[float] = None # Stores detected skew angle
|
113
|
+
|
114
|
+
# --- ADDED --- Metadata store for mixins
|
115
|
+
self.metadata: Dict[str, Any] = {}
|
116
|
+
# --- END ADDED ---
|
75
117
|
|
76
118
|
# Region management
|
77
119
|
self._regions = {
|
@@ -79,8 +121,11 @@ class Page:
|
|
79
121
|
"named": {}, # Named regions (name -> region)
|
80
122
|
}
|
81
123
|
|
82
|
-
# Initialize ElementManager
|
83
|
-
self._element_mgr = ElementManager(self, font_attrs)
|
124
|
+
# Initialize ElementManager, passing font_attrs
|
125
|
+
self._element_mgr = ElementManager(self, font_attrs=font_attrs)
|
126
|
+
# self._highlighter = HighlightingService(self) # REMOVED - Use property accessor
|
127
|
+
# --- NEW --- Central registry for analysis results
|
128
|
+
self.analyses: Dict[str, Any] = {}
|
84
129
|
|
85
130
|
# --- Get OCR Manager Instance ---
|
86
131
|
if (
|
@@ -115,6 +160,8 @@ class Page:
|
|
115
160
|
# Initialize the internal variable with a single underscore
|
116
161
|
self._layout_analyzer = None
|
117
162
|
|
163
|
+
self._load_elements()
|
164
|
+
|
118
165
|
@property
|
119
166
|
def pdf(self) -> "PDF":
|
120
167
|
"""Provides public access to the parent PDF object."""
|
@@ -412,25 +459,79 @@ class Page:
|
|
412
459
|
|
413
460
|
return filtered_elements
|
414
461
|
|
415
|
-
|
462
|
+
@overload
|
463
|
+
def find(
|
464
|
+
self,
|
465
|
+
*,
|
466
|
+
text: str,
|
467
|
+
apply_exclusions: bool = True,
|
468
|
+
regex: bool = False,
|
469
|
+
case: bool = True,
|
470
|
+
**kwargs,
|
471
|
+
) -> Optional[Any]: ...
|
472
|
+
|
473
|
+
@overload
|
474
|
+
def find(
|
475
|
+
self,
|
476
|
+
selector: str,
|
477
|
+
*,
|
478
|
+
apply_exclusions: bool = True,
|
479
|
+
regex: bool = False,
|
480
|
+
case: bool = True,
|
481
|
+
**kwargs,
|
482
|
+
) -> Optional[Any]: ...
|
483
|
+
|
484
|
+
def find(
|
485
|
+
self,
|
486
|
+
selector: Optional[str] = None, # Now optional
|
487
|
+
*, # Force subsequent args to be keyword-only
|
488
|
+
text: Optional[str] = None, # New text parameter
|
489
|
+
apply_exclusions: bool = True,
|
490
|
+
regex: bool = False,
|
491
|
+
case: bool = True,
|
492
|
+
**kwargs,
|
493
|
+
) -> Optional[Any]:
|
416
494
|
"""
|
417
|
-
Find first element on this page matching selector.
|
495
|
+
Find first element on this page matching selector OR text content.
|
496
|
+
|
497
|
+
Provide EITHER `selector` OR `text`, but not both.
|
418
498
|
|
419
499
|
Args:
|
420
|
-
selector: CSS-like selector string
|
421
|
-
|
422
|
-
|
423
|
-
|
424
|
-
|
500
|
+
selector: CSS-like selector string.
|
501
|
+
text: Text content to search for (equivalent to 'text:contains(...)').
|
502
|
+
apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
|
503
|
+
regex: Whether to use regex for text search (`selector` or `text`) (default: False).
|
504
|
+
case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
|
505
|
+
**kwargs: Additional filter parameters.
|
425
506
|
|
426
507
|
Returns:
|
427
|
-
Element object or None if not found
|
428
|
-
"""
|
429
|
-
|
508
|
+
Element object or None if not found.
|
509
|
+
"""
|
510
|
+
if selector is not None and text is not None:
|
511
|
+
raise ValueError("Provide either 'selector' or 'text', not both.")
|
512
|
+
if selector is None and text is None:
|
513
|
+
raise ValueError("Provide either 'selector' or 'text'.")
|
514
|
+
|
515
|
+
# Construct selector if 'text' is provided
|
516
|
+
effective_selector = ""
|
517
|
+
if text is not None:
|
518
|
+
# Escape quotes within the text for the selector string
|
519
|
+
escaped_text = text.replace('"', '\\"').replace("'", "\\'")
|
520
|
+
# Default to 'text:contains(...)'
|
521
|
+
effective_selector = f'text:contains("{escaped_text}")'
|
522
|
+
# Note: regex/case handled by kwargs passed down
|
523
|
+
logger.debug(
|
524
|
+
f"Using text shortcut: find(text='{text}') -> find('{effective_selector}')"
|
525
|
+
)
|
526
|
+
elif selector is not None:
|
527
|
+
effective_selector = selector
|
528
|
+
else:
|
529
|
+
# Should be unreachable due to checks above
|
530
|
+
raise ValueError("Internal error: No selector or text provided.")
|
430
531
|
|
431
|
-
selector_obj = parse_selector(
|
532
|
+
selector_obj = parse_selector(effective_selector)
|
432
533
|
|
433
|
-
# Pass regex and case flags to selector function
|
534
|
+
# Pass regex and case flags to selector function via kwargs
|
434
535
|
kwargs["regex"] = regex
|
435
536
|
kwargs["case"] = case
|
436
537
|
|
@@ -450,27 +551,80 @@ class Page:
|
|
450
551
|
else:
|
451
552
|
return None
|
452
553
|
|
554
|
+
@overload
|
555
|
+
def find_all(
|
556
|
+
self,
|
557
|
+
*,
|
558
|
+
text: str,
|
559
|
+
apply_exclusions: bool = True,
|
560
|
+
regex: bool = False,
|
561
|
+
case: bool = True,
|
562
|
+
**kwargs,
|
563
|
+
) -> "ElementCollection": ...
|
564
|
+
|
565
|
+
@overload
|
566
|
+
def find_all(
|
567
|
+
self,
|
568
|
+
selector: str,
|
569
|
+
*,
|
570
|
+
apply_exclusions: bool = True,
|
571
|
+
regex: bool = False,
|
572
|
+
case: bool = True,
|
573
|
+
**kwargs,
|
574
|
+
) -> "ElementCollection": ...
|
575
|
+
|
453
576
|
def find_all(
|
454
|
-
self,
|
577
|
+
self,
|
578
|
+
selector: Optional[str] = None, # Now optional
|
579
|
+
*, # Force subsequent args to be keyword-only
|
580
|
+
text: Optional[str] = None, # New text parameter
|
581
|
+
apply_exclusions: bool = True,
|
582
|
+
regex: bool = False,
|
583
|
+
case: bool = True,
|
584
|
+
**kwargs,
|
455
585
|
) -> "ElementCollection":
|
456
586
|
"""
|
457
|
-
Find all elements on this page matching selector.
|
587
|
+
Find all elements on this page matching selector OR text content.
|
588
|
+
|
589
|
+
Provide EITHER `selector` OR `text`, but not both.
|
458
590
|
|
459
591
|
Args:
|
460
|
-
selector: CSS-like selector string
|
461
|
-
|
462
|
-
|
463
|
-
|
464
|
-
|
592
|
+
selector: CSS-like selector string.
|
593
|
+
text: Text content to search for (equivalent to 'text:contains(...)').
|
594
|
+
apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
|
595
|
+
regex: Whether to use regex for text search (`selector` or `text`) (default: False).
|
596
|
+
case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
|
597
|
+
**kwargs: Additional filter parameters.
|
465
598
|
|
466
599
|
Returns:
|
467
|
-
ElementCollection with matching elements
|
468
|
-
"""
|
469
|
-
from natural_pdf.
|
600
|
+
ElementCollection with matching elements.
|
601
|
+
"""
|
602
|
+
from natural_pdf.elements.collections import ElementCollection # Import here for type hint
|
603
|
+
|
604
|
+
if selector is not None and text is not None:
|
605
|
+
raise ValueError("Provide either 'selector' or 'text', not both.")
|
606
|
+
if selector is None and text is None:
|
607
|
+
raise ValueError("Provide either 'selector' or 'text'.")
|
608
|
+
|
609
|
+
# Construct selector if 'text' is provided
|
610
|
+
effective_selector = ""
|
611
|
+
if text is not None:
|
612
|
+
# Escape quotes within the text for the selector string
|
613
|
+
escaped_text = text.replace('"', '\\"').replace("'", "\\'")
|
614
|
+
# Default to 'text:contains(...)'
|
615
|
+
effective_selector = f'text:contains("{escaped_text}")'
|
616
|
+
logger.debug(
|
617
|
+
f"Using text shortcut: find_all(text='{text}') -> find_all('{effective_selector}')"
|
618
|
+
)
|
619
|
+
elif selector is not None:
|
620
|
+
effective_selector = selector
|
621
|
+
else:
|
622
|
+
# Should be unreachable due to checks above
|
623
|
+
raise ValueError("Internal error: No selector or text provided.")
|
470
624
|
|
471
|
-
selector_obj = parse_selector(
|
625
|
+
selector_obj = parse_selector(effective_selector)
|
472
626
|
|
473
|
-
# Pass regex and case flags to selector function
|
627
|
+
# Pass regex and case flags to selector function via kwargs
|
474
628
|
kwargs["regex"] = regex
|
475
629
|
kwargs["case"] = case
|
476
630
|
|
@@ -1257,38 +1411,39 @@ class Page:
|
|
1257
1411
|
"""
|
1258
1412
|
image = None
|
1259
1413
|
render_resolution = resolution if resolution is not None else scale * 72
|
1414
|
+
thread_id = threading.current_thread().name
|
1415
|
+
logger.debug(
|
1416
|
+
f"[{thread_id}] Page {self.index}: Attempting to acquire pdf_render_lock for to_image..."
|
1417
|
+
)
|
1418
|
+
lock_wait_start = time.monotonic()
|
1260
1419
|
try:
|
1261
|
-
|
1262
|
-
|
1263
|
-
|
1264
|
-
|
1265
|
-
|
1266
|
-
labels=labels,
|
1267
|
-
legend_position=legend_position,
|
1268
|
-
render_ocr=render_ocr,
|
1269
|
-
resolution=render_resolution, # Pass the calculated resolution
|
1270
|
-
**kwargs,
|
1271
|
-
)
|
1272
|
-
else:
|
1273
|
-
# Get the base page image directly from pdfplumber if no highlights needed
|
1274
|
-
# Use the underlying pdfplumber page object
|
1275
|
-
img_object = self._page.to_image(resolution=render_resolution, **kwargs)
|
1276
|
-
# Access the PIL image directly (assuming pdfplumber structure)
|
1277
|
-
image = (
|
1278
|
-
img_object.annotated
|
1279
|
-
if hasattr(img_object, "annotated")
|
1280
|
-
else img_object._repr_png_()
|
1420
|
+
# Acquire the global PDF rendering lock
|
1421
|
+
with pdf_render_lock:
|
1422
|
+
lock_acquired_time = time.monotonic()
|
1423
|
+
logger.debug(
|
1424
|
+
f"[{thread_id}] Page {self.index}: Acquired pdf_render_lock (waited {lock_acquired_time - lock_wait_start:.2f}s). Starting render..."
|
1281
1425
|
)
|
1282
|
-
if
|
1283
|
-
|
1284
|
-
|
1285
|
-
|
1286
|
-
|
1287
|
-
|
1288
|
-
|
1426
|
+
if include_highlights:
|
1427
|
+
# Delegate rendering to the central service
|
1428
|
+
image = self._highlighter.render_page(
|
1429
|
+
page_index=self.index,
|
1430
|
+
scale=scale,
|
1431
|
+
labels=labels,
|
1432
|
+
legend_position=legend_position,
|
1433
|
+
render_ocr=render_ocr,
|
1434
|
+
resolution=render_resolution, # Pass the calculated resolution
|
1435
|
+
**kwargs,
|
1436
|
+
)
|
1437
|
+
else:
|
1438
|
+
image = render_plain_page(self, render_resolution)
|
1289
1439
|
except Exception as e:
|
1290
1440
|
logger.error(f"Error rendering page {self.index}: {e}", exc_info=True)
|
1291
1441
|
return None # Return None on error
|
1442
|
+
finally:
|
1443
|
+
render_end_time = time.monotonic()
|
1444
|
+
logger.debug(
|
1445
|
+
f"[{thread_id}] Page {self.index}: Released pdf_render_lock. Total render time (incl. lock wait): {render_end_time - lock_wait_start:.2f}s"
|
1446
|
+
)
|
1292
1447
|
|
1293
1448
|
if image is None:
|
1294
1449
|
return None
|
@@ -1384,6 +1539,7 @@ class Page:
|
|
1384
1539
|
resolution: Optional[int] = None,
|
1385
1540
|
detect_only: bool = False,
|
1386
1541
|
apply_exclusions: bool = True,
|
1542
|
+
replace: bool = True,
|
1387
1543
|
) -> "Page":
|
1388
1544
|
"""
|
1389
1545
|
Apply OCR to THIS page and add results to page elements via PDF.apply_ocr.
|
@@ -1397,13 +1553,23 @@ class Page:
|
|
1397
1553
|
resolution: DPI resolution for rendering page image before OCR.
|
1398
1554
|
apply_exclusions: If True (default), render page image for OCR
|
1399
1555
|
with excluded areas masked (whited out).
|
1556
|
+
detect_only: If True, only detect text bounding boxes, don't perform OCR.
|
1557
|
+
replace: If True (default), remove any existing OCR elements before
|
1558
|
+
adding new ones. If False, add new OCR elements to existing ones.
|
1400
1559
|
|
1401
1560
|
Returns:
|
1402
|
-
|
1561
|
+
Self for method chaining.
|
1403
1562
|
"""
|
1404
1563
|
if not hasattr(self._parent, "apply_ocr"):
|
1405
1564
|
logger.error(f"Page {self.number}: Parent PDF missing 'apply_ocr'. Cannot apply OCR.")
|
1406
|
-
return
|
1565
|
+
return self # Return self for chaining
|
1566
|
+
|
1567
|
+
# Remove existing OCR elements if replace is True
|
1568
|
+
if replace and hasattr(self, "_element_mgr"):
|
1569
|
+
logger.info(
|
1570
|
+
f"Page {self.number}: Removing existing OCR elements before applying new OCR."
|
1571
|
+
)
|
1572
|
+
self._element_mgr.remove_ocr_elements()
|
1407
1573
|
|
1408
1574
|
logger.info(f"Page {self.number}: Delegating apply_ocr to PDF.apply_ocr.")
|
1409
1575
|
try:
|
@@ -1419,18 +1585,13 @@ class Page:
|
|
1419
1585
|
resolution=resolution,
|
1420
1586
|
detect_only=detect_only,
|
1421
1587
|
apply_exclusions=apply_exclusions,
|
1588
|
+
replace=replace, # Pass the replace parameter to PDF.apply_ocr
|
1422
1589
|
)
|
1423
1590
|
except Exception as e:
|
1424
1591
|
logger.error(f"Page {self.number}: Error during delegated OCR call: {e}", exc_info=True)
|
1425
|
-
return
|
1592
|
+
return self # Return self for chaining
|
1426
1593
|
|
1427
|
-
# Return
|
1428
|
-
ocr_elements = [el for el in self.words if getattr(el, "source", None) == "ocr"]
|
1429
|
-
logger.debug(
|
1430
|
-
f"Page {self.number}: apply_ocr completed. Found {len(ocr_elements)} OCR elements."
|
1431
|
-
)
|
1432
|
-
# Note: The method is typed to return Page for chaining, but the log indicates
|
1433
|
-
# finding elements. Let's stick to returning self for chaining consistency.
|
1594
|
+
# Return self for chaining
|
1434
1595
|
return self
|
1435
1596
|
|
1436
1597
|
def extract_ocr_elements(
|
@@ -1471,11 +1632,15 @@ class Page:
|
|
1471
1632
|
|
1472
1633
|
try:
|
1473
1634
|
# Get base image without highlights using the determined resolution
|
1474
|
-
|
1475
|
-
|
1476
|
-
|
1477
|
-
|
1478
|
-
|
1635
|
+
# Use the global PDF rendering lock
|
1636
|
+
with pdf_render_lock:
|
1637
|
+
image = self.to_image(resolution=final_resolution, include_highlights=False)
|
1638
|
+
if not image:
|
1639
|
+
logger.error(
|
1640
|
+
f" Failed to render page {self.number} to image for OCR extraction."
|
1641
|
+
)
|
1642
|
+
return []
|
1643
|
+
logger.debug(f" Rendered image size: {image.width}x{image.height}")
|
1479
1644
|
except Exception as e:
|
1480
1645
|
logger.error(f" Failed to render page {self.number} to image: {e}", exc_info=True)
|
1481
1646
|
return []
|
@@ -1545,6 +1710,11 @@ class Page:
|
|
1545
1710
|
logger.info(f" Created {len(temp_elements)} TextElements from OCR (extract only).")
|
1546
1711
|
return temp_elements
|
1547
1712
|
|
1713
|
+
@property
|
1714
|
+
def size(self) -> Tuple[float, float]:
|
1715
|
+
"""Get the size of the page in points."""
|
1716
|
+
return (self._page.width, self._page.height)
|
1717
|
+
|
1548
1718
|
@property
|
1549
1719
|
def layout_analyzer(self) -> LayoutAnalyzer:
|
1550
1720
|
"""Get or create the layout analyzer for this page."""
|
@@ -1564,6 +1734,8 @@ class Page:
|
|
1564
1734
|
exclude_classes: Optional[List[str]] = None,
|
1565
1735
|
device: Optional[str] = None,
|
1566
1736
|
existing: str = "replace",
|
1737
|
+
model_name: Optional[str] = None,
|
1738
|
+
client: Optional[Any] = None, # Add client parameter
|
1567
1739
|
) -> ElementCollection[Region]:
|
1568
1740
|
"""
|
1569
1741
|
Analyze the page layout using the configured LayoutManager.
|
@@ -1589,6 +1761,8 @@ class Page:
|
|
1589
1761
|
exclude_classes=exclude_classes,
|
1590
1762
|
device=device,
|
1591
1763
|
existing=existing,
|
1764
|
+
model_name=model_name,
|
1765
|
+
client=client, # Pass client down
|
1592
1766
|
)
|
1593
1767
|
|
1594
1768
|
# Retrieve the detected regions from the element manager
|
@@ -1659,14 +1833,24 @@ class Page:
|
|
1659
1833
|
)
|
1660
1834
|
return None
|
1661
1835
|
|
1836
|
+
def split(self, divider, **kwargs) -> "ElementCollection[Region]":
|
1837
|
+
"""
|
1838
|
+
Divides the page into sections based on the provided divider elements.
|
1839
|
+
"""
|
1840
|
+
sections = self.get_sections(start_elements=divider, **kwargs)
|
1841
|
+
top = self.region(0, 0, self.width, sections[0].top)
|
1842
|
+
sections.append(top)
|
1843
|
+
|
1844
|
+
return sections
|
1845
|
+
|
1662
1846
|
def get_sections(
|
1663
1847
|
self,
|
1664
1848
|
start_elements=None,
|
1665
1849
|
end_elements=None,
|
1666
|
-
boundary_inclusion="
|
1850
|
+
boundary_inclusion="start",
|
1667
1851
|
y_threshold=5.0,
|
1668
1852
|
bounding_box=None,
|
1669
|
-
) -> "ElementCollection[Region]":
|
1853
|
+
) -> "ElementCollection[Region]":
|
1670
1854
|
"""
|
1671
1855
|
Get sections of a page defined by start/end elements.
|
1672
1856
|
Uses the page-level implementation.
|
@@ -2027,43 +2211,344 @@ class Page:
|
|
2027
2211
|
def correct_ocr(
|
2028
2212
|
self,
|
2029
2213
|
correction_callback: Callable[[Any], Optional[str]],
|
2214
|
+
max_workers: Optional[int] = None,
|
2215
|
+
progress_callback: Optional[Callable[[], None]] = None, # Added progress callback
|
2030
2216
|
) -> "Page": # Return self for chaining
|
2031
2217
|
"""
|
2032
2218
|
Applies corrections to OCR-generated text elements on this page
|
2033
|
-
using a user-provided callback function.
|
2219
|
+
using a user-provided callback function, potentially in parallel.
|
2034
2220
|
|
2035
2221
|
Finds text elements on this page whose 'source' attribute starts
|
2036
2222
|
with 'ocr' and calls the `correction_callback` for each, passing the
|
2037
|
-
element itself.
|
2038
|
-
|
2039
|
-
The `correction_callback` should contain the logic to:
|
2040
|
-
1. Determine if the element needs correction.
|
2041
|
-
2. Perform the correction (e.g., call an LLM).
|
2042
|
-
3. Return the new text (`str`) or `None`.
|
2043
|
-
|
2044
|
-
If the callback returns a string, the element's `.text` is updated.
|
2045
|
-
Metadata updates (source, confidence, etc.) should happen within the callback.
|
2223
|
+
element itself. Updates the element's text if the callback returns
|
2224
|
+
a new string.
|
2046
2225
|
|
2047
2226
|
Args:
|
2048
2227
|
correction_callback: A function accepting an element and returning
|
2049
2228
|
`Optional[str]` (new text or None).
|
2229
|
+
max_workers: The maximum number of threads to use for parallel execution.
|
2230
|
+
If None or 0 or 1, runs sequentially.
|
2231
|
+
progress_callback: Optional callback function to call after processing each element.
|
2050
2232
|
|
2051
2233
|
Returns:
|
2052
2234
|
Self for method chaining.
|
2053
2235
|
"""
|
2054
2236
|
logger.info(
|
2055
|
-
f"Page {self.number}: Starting OCR correction
|
2237
|
+
f"Page {self.number}: Starting OCR correction with callback '{correction_callback.__name__}' (max_workers={max_workers})"
|
2238
|
+
)
|
2239
|
+
|
2240
|
+
target_elements_collection = self.find_all(
|
2241
|
+
selector="text[source=ocr]", apply_exclusions=False
|
2056
2242
|
)
|
2243
|
+
target_elements = target_elements_collection.elements # Get the list
|
2244
|
+
|
2245
|
+
if not target_elements:
|
2246
|
+
logger.info(f"Page {self.number}: No OCR elements found to correct.")
|
2247
|
+
return self
|
2248
|
+
|
2249
|
+
processed_count = 0
|
2250
|
+
updated_count = 0
|
2251
|
+
error_count = 0
|
2252
|
+
|
2253
|
+
# Define the task to be run by the worker thread or sequentially
|
2254
|
+
def _process_element_task(element):
|
2255
|
+
try:
|
2256
|
+
current_text = getattr(element, "text", None)
|
2257
|
+
# Call the user-provided callback
|
2258
|
+
corrected_text = correction_callback(element)
|
2259
|
+
|
2260
|
+
# Validate result type
|
2261
|
+
if corrected_text is not None and not isinstance(corrected_text, str):
|
2262
|
+
logger.warning(
|
2263
|
+
f"Page {self.number}: Correction callback for element '{getattr(element, 'text', '')[:20]}...' returned non-string, non-None type: {type(corrected_text)}. Skipping update."
|
2264
|
+
)
|
2265
|
+
return element, None, None # Treat as no correction
|
2057
2266
|
|
2058
|
-
|
2059
|
-
|
2060
|
-
|
2267
|
+
return element, corrected_text, None # Return element, result, no error
|
2268
|
+
except Exception as e:
|
2269
|
+
logger.error(
|
2270
|
+
f"Page {self.number}: Error applying correction callback to element '{getattr(element, 'text', '')[:30]}...' ({element.bbox}): {e}",
|
2271
|
+
exc_info=False, # Keep log concise
|
2272
|
+
)
|
2273
|
+
return element, None, e # Return element, no result, error
|
2274
|
+
finally:
|
2275
|
+
# --- Call progress callback here --- #
|
2276
|
+
if progress_callback:
|
2277
|
+
try:
|
2278
|
+
progress_callback()
|
2279
|
+
except Exception as cb_e:
|
2280
|
+
# Log error in callback itself, but don't stop processing
|
2281
|
+
logger.error(
|
2282
|
+
f"Page {self.number}: Error executing progress_callback: {cb_e}",
|
2283
|
+
exc_info=False,
|
2284
|
+
)
|
2061
2285
|
|
2062
|
-
#
|
2063
|
-
|
2064
|
-
|
2065
|
-
|
2066
|
-
|
2286
|
+
# Choose execution strategy based on max_workers
|
2287
|
+
if max_workers is not None and max_workers > 1:
|
2288
|
+
# --- Parallel execution --- #
|
2289
|
+
logger.info(
|
2290
|
+
f"Page {self.number}: Running OCR correction in parallel with {max_workers} workers."
|
2291
|
+
)
|
2292
|
+
futures = []
|
2293
|
+
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
|
2294
|
+
# Submit all tasks
|
2295
|
+
future_to_element = {
|
2296
|
+
executor.submit(_process_element_task, element): element
|
2297
|
+
for element in target_elements
|
2298
|
+
}
|
2299
|
+
|
2300
|
+
# Process results as they complete (progress_callback called by worker)
|
2301
|
+
for future in concurrent.futures.as_completed(future_to_element):
|
2302
|
+
processed_count += 1
|
2303
|
+
try:
|
2304
|
+
element, corrected_text, error = future.result()
|
2305
|
+
if error:
|
2306
|
+
error_count += 1
|
2307
|
+
# Error already logged in worker
|
2308
|
+
elif corrected_text is not None:
|
2309
|
+
# Apply correction if text changed
|
2310
|
+
current_text = getattr(element, "text", None)
|
2311
|
+
if corrected_text != current_text:
|
2312
|
+
element.text = corrected_text
|
2313
|
+
updated_count += 1
|
2314
|
+
except Exception as exc:
|
2315
|
+
# Catch errors from future.result() itself
|
2316
|
+
element = future_to_element[future] # Find original element
|
2317
|
+
logger.error(
|
2318
|
+
f"Page {self.number}: Internal error retrieving correction result for element {element.bbox}: {exc}",
|
2319
|
+
exc_info=True,
|
2320
|
+
)
|
2321
|
+
error_count += 1
|
2322
|
+
# Note: progress_callback was already called in the worker's finally block
|
2323
|
+
|
2324
|
+
else:
|
2325
|
+
# --- Sequential execution --- #
|
2326
|
+
logger.info(f"Page {self.number}: Running OCR correction sequentially.")
|
2327
|
+
for element in target_elements:
|
2328
|
+
# Call the task function directly (it handles progress_callback)
|
2329
|
+
processed_count += 1
|
2330
|
+
_element, corrected_text, error = _process_element_task(element)
|
2331
|
+
if error:
|
2332
|
+
error_count += 1
|
2333
|
+
elif corrected_text is not None:
|
2334
|
+
# Apply correction if text changed
|
2335
|
+
current_text = getattr(_element, "text", None)
|
2336
|
+
if corrected_text != current_text:
|
2337
|
+
_element.text = corrected_text
|
2338
|
+
updated_count += 1
|
2339
|
+
|
2340
|
+
logger.info(
|
2341
|
+
f"Page {self.number}: OCR correction finished. Processed: {processed_count}/{len(target_elements)}, Updated: {updated_count}, Errors: {error_count}."
|
2067
2342
|
)
|
2068
2343
|
|
2069
2344
|
return self # Return self for chaining
|
2345
|
+
|
2346
|
+
# --- Classification Mixin Implementation --- #
|
2347
|
+
def _get_classification_manager(self) -> "ClassificationManager":
    """
    Look up the classification manager through the parent PDF.

    Returns:
        Whatever ``self.pdf.get_manager("classification")`` returns.

    Raises:
        AttributeError: If this object has no ``pdf`` attribute, if that
            object has no ``get_manager`` attribute, or if the lookup itself
            raises ValueError, RuntimeError or AttributeError (the original
            error is chained as the cause).
    """
    # A missing ``pdf`` attribute and a ``pdf`` lacking ``get_manager`` are
    # reported identically, so one lookup with a None fallback covers both.
    parent_pdf = getattr(self, "pdf", None)
    if not hasattr(parent_pdf, "get_manager"):
        raise AttributeError(
            "ClassificationManager cannot be accessed: Parent PDF or get_manager method missing."
        )

    try:
        # Go through the PDF's manager registry accessor
        found = parent_pdf.get_manager("classification")
    except (ValueError, RuntimeError, AttributeError) as cause:
        # Re-raise under a single exception type with a clearer message
        raise AttributeError(f"Failed to get ClassificationManager from PDF: {cause}") from cause
    return found
|
2358
|
+
|
2359
|
+
def _get_classification_content(
    self, model_type: str, **kwargs
) -> Union[str, "Image"]:  # Use "Image" for lazy import
    """
    Produce the input that a classification model should receive for this page.

    Args:
        model_type: ``"text"`` to classify the page text, ``"vision"`` to
            classify a rendered image of the page.
        **kwargs: Only ``resolution`` is read, and only for ``"vision"``;
            it is passed to ``to_image`` and defaults to 150.

    Returns:
        The text from ``extract_text(layout=False, use_exclusions=False)``
        for ``"text"``, or the object returned by ``to_image`` for
        ``"vision"``.

    Raises:
        ValueError: If the text is empty or whitespace-only, if ``to_image``
            returns None, or if ``model_type`` is neither ``"text"`` nor
            ``"vision"``.
        AttributeError: Propagated from ``_get_classification_manager`` in
            the ``"vision"`` branch when the manager cannot be reached.
    """
    if model_type == "text":
        # Simple join, ignore exclusions for classification
        text_content = self.extract_text(layout=False, use_exclusions=False)
        if not text_content or text_content.isspace():
            raise ValueError("Cannot classify page with 'text' model: No text content found.")
        return text_content

    if model_type == "vision":
        # The manager itself is not used here. The call is kept so that a page
        # whose manager is unreachable still fails before any rendering work.
        self._get_classification_manager()

        default_resolution = 150
        # ``kwargs`` is a parameter and therefore always defined, so a plain
        # lookup with a default is all that is needed.
        resolution = kwargs.get("resolution", default_resolution)

        # Use to_image, ensuring no highlights interfere
        img = self.to_image(
            resolution=resolution,
            include_highlights=False,
            labels=False,
            exclusions=None,  # Don't mask exclusions for classification input image
        )
        if img is None:
            raise ValueError(
                "Cannot classify page with 'vision' model: Failed to render image."
            )
        return img

    raise ValueError(f"Unsupported model_type for classification: {model_type}")
|
2394
|
+
|
2395
|
+
def _get_metadata_storage(self) -> Dict[str, Any]:
    """
    Return the ``metadata`` dict of this object, creating it on first use.

    A missing ``metadata`` attribute and one set to None are both replaced
    by a new empty dict, which is stored on the instance before being
    returned. Any other existing value is returned untouched.
    """
    # A None fallback lets "attribute missing" and "attribute is None"
    # share one check.
    if getattr(self, "metadata", None) is None:
        self.metadata = {}
    return self.metadata
|
2400
|
+
|
2401
|
+
# --- Content Extraction ---
|
2402
|
+
|
2403
|
+
# --- Skew Detection and Correction --- #
|
2404
|
+
|
2405
|
+
@property
def skew_angle(self) -> Optional[float]:
    """
    Skew angle currently stored for this page.

    Read-only view of ``self._skew_angle``; nothing is computed here.
    """
    stored_angle = self._skew_angle
    return stored_angle
|
2409
|
+
|
2410
|
+
def detect_skew_angle(
    self,
    resolution: int = 72,
    grayscale: bool = True,
    force_recalculate: bool = False,
    **deskew_kwargs,
) -> Optional[float]:
    """
    Measure the skew of the rendered page and remember the result.

    The outcome is stored in ``self._skew_angle``. A stored non-None angle
    is returned directly unless ``force_recalculate`` is set.

    Args:
        resolution: DPI used to render the page image that is analysed.
        grayscale: When True, a 3-D image with at least three channels is
            reduced to 2-D by averaging its first three channels; other
            shapes are passed on unchanged.
        force_recalculate: If True, ignore a previously stored angle.
        **deskew_kwargs: Forwarded to ``determine_skew``
            (e.g., `max_angle`, `num_peaks`).

    Returns:
        The value returned by ``determine_skew``, or None when rendering
        fails or any exception occurs during detection.

    Raises:
        ImportError: If the 'deskew' library is not installed.
    """
    if not DESKEW_AVAILABLE:
        raise ImportError(
            "Deskew library not found. Install with: pip install natural-pdf[deskew]"
        )

    # Serve the stored value unless the caller asked for a fresh measurement
    cached_angle = self._skew_angle
    if not force_recalculate and cached_angle is not None:
        logger.debug(f"Page {self.number}: Returning cached skew angle: {cached_angle:.2f}")
        return cached_angle

    logger.debug(f"Page {self.number}: Detecting skew angle (resolution={resolution} DPI)...")
    try:
        # Render at the detection resolution
        page_image = self.to_image(resolution=resolution, include_highlights=False)
        if not page_image:
            logger.warning(f"Page {self.number}: Failed to render image for skew detection.")
            self._skew_angle = None
            return None

        pixels = np.array(page_image)

        # Start from the raw array and replace it only when a colour image
        # has to be collapsed to a single channel.
        detection_input = pixels
        if grayscale:
            rank = pixels.ndim
            if rank == 3 and pixels.shape[2] >= 3:
                detection_input = np.mean(pixels[:, :, :3], axis=2).astype(np.uint8)
            elif rank != 2:
                # Neither colour nor already 2-D: warn, then try it as-is
                logger.warning(
                    f"Page {self.number}: Unexpected image shape {pixels.shape} for grayscale conversion."
                )

        # Hand over to the deskew library and store whatever it reports
        detected = determine_skew(detection_input, **deskew_kwargs)
        self._skew_angle = detected
        logger.debug(f"Page {self.number}: Detected skew angle = {detected}")
        return detected

    except Exception as exc:
        # Best effort: a failed detection clears the stored angle instead of
        # propagating to the caller.
        logger.warning(f"Page {self.number}: Failed during skew detection: {exc}", exc_info=True)
        self._skew_angle = None
        return None
|
2478
|
+
|
2479
|
+
def deskew(
    self,
    resolution: int = 300,
    angle: Optional[float] = None,
    detection_resolution: int = 72,
    **deskew_kwargs,
) -> Optional[Image.Image]:
    """
    Render the page and return the image rotated by the skew angle.

    When `angle` is omitted, the angle comes from `detect_skew_angle`,
    which may return a previously stored value.

    Args:
        resolution: DPI resolution of the rendered output image.
        angle: Rotation in degrees to apply. If None, it is detected.
        detection_resolution: DPI passed to `detect_skew_angle` when
            `angle` is None.
        **deskew_kwargs: Forwarded to `detect_skew_angle` when detection
            is performed.

    Returns:
        The rotated image when the angle's magnitude exceeds 0.05 degrees,
        the unrotated render when it does not (or when no angle is
        available), or None if rendering or rotation fails.

    Raises:
        ImportError: If the 'deskew' library is not installed.
    """
    if not DESKEW_AVAILABLE:
        raise ImportError(
            "Deskew library not found. Install with: pip install natural-pdf[deskew]"
        )

    # An explicit angle takes precedence; otherwise detect (or reuse) one
    if angle is None:
        rotation_angle = self.detect_skew_angle(
            resolution=detection_resolution, **deskew_kwargs
        )
    else:
        rotation_angle = angle

    logger.debug(
        f"Page {self.number}: Preparing to deskew (output resolution={resolution} DPI). Using angle: {rotation_angle}"
    )

    try:
        # Render the untouched page at the requested output resolution
        rendered = self.to_image(resolution=resolution, include_highlights=False)
        if not rendered:
            logger.error(f"Page {self.number}: Failed to render image for deskewing.")
            return None

        # Angles at or below 0.05 degrees (or no angle at all) are left alone
        needs_rotation = rotation_angle is not None and abs(rotation_angle) > 0.05
        if not needs_rotation:
            logger.debug(
                f"Page {self.number}: No significant rotation needed (angle={rotation_angle}). Returning original render."
            )
            return rendered

        logger.debug(f"Page {self.number}: Rotating by {rotation_angle:.2f} degrees.")

        # White background for the corners exposed by the rotation.
        # NOTE(review): every mode other than "RGB" (including multi-band
        # modes such as RGBA) gets the scalar 255 — confirm that is intended.
        if rendered.mode == "RGB":
            background = (255, 255, 255)
        else:
            background = 255

        return rendered.rotate(
            rotation_angle,  # deskew provides angle, PIL rotates counter-clockwise
            resample=Image.Resampling.BILINEAR,
            expand=True,  # Grow the canvas so rotated content is not cropped
            fillcolor=background,
        )

    except Exception as exc:
        logger.error(
            f"Page {self.number}: Error during deskewing image generation: {exc}", exc_info=True
        )
        return None
|
2553
|
+
|
2554
|
+
# --- End Skew Detection and Correction --- #
|