natural-pdf 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +1 -0
- natural_pdf/analyzers/layout/base.py +1 -5
- natural_pdf/analyzers/layout/gemini.py +61 -51
- natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
- natural_pdf/analyzers/layout/layout_manager.py +26 -84
- natural_pdf/analyzers/layout/layout_options.py +7 -0
- natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
- natural_pdf/analyzers/layout/surya.py +46 -123
- natural_pdf/analyzers/layout/tatr.py +51 -4
- natural_pdf/analyzers/text_structure.py +3 -5
- natural_pdf/analyzers/utils.py +3 -3
- natural_pdf/classification/manager.py +241 -158
- natural_pdf/classification/mixin.py +52 -38
- natural_pdf/classification/results.py +71 -45
- natural_pdf/collections/mixins.py +85 -20
- natural_pdf/collections/pdf_collection.py +245 -100
- natural_pdf/core/element_manager.py +30 -14
- natural_pdf/core/highlighting_service.py +13 -22
- natural_pdf/core/page.py +423 -101
- natural_pdf/core/pdf.py +694 -195
- natural_pdf/elements/base.py +134 -40
- natural_pdf/elements/collections.py +610 -134
- natural_pdf/elements/region.py +659 -90
- natural_pdf/elements/text.py +1 -1
- natural_pdf/export/mixin.py +137 -0
- natural_pdf/exporters/base.py +3 -3
- natural_pdf/exporters/paddleocr.py +4 -3
- natural_pdf/extraction/manager.py +50 -49
- natural_pdf/extraction/mixin.py +90 -57
- natural_pdf/extraction/result.py +9 -23
- natural_pdf/ocr/__init__.py +5 -5
- natural_pdf/ocr/engine_doctr.py +346 -0
- natural_pdf/ocr/ocr_factory.py +24 -4
- natural_pdf/ocr/ocr_manager.py +61 -25
- natural_pdf/ocr/ocr_options.py +70 -10
- natural_pdf/ocr/utils.py +6 -4
- natural_pdf/search/__init__.py +20 -34
- natural_pdf/search/haystack_search_service.py +309 -265
- natural_pdf/search/haystack_utils.py +99 -75
- natural_pdf/search/search_service_protocol.py +11 -12
- natural_pdf/selectors/parser.py +219 -143
- natural_pdf/utils/debug.py +3 -3
- natural_pdf/utils/identifiers.py +1 -1
- natural_pdf/utils/locks.py +1 -1
- natural_pdf/utils/packaging.py +8 -6
- natural_pdf/utils/text_extraction.py +24 -16
- natural_pdf/utils/tqdm_utils.py +18 -10
- natural_pdf/utils/visualization.py +18 -0
- natural_pdf/widgets/viewer.py +4 -25
- {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/METADATA +12 -3
- natural_pdf-0.1.10.dist-info/RECORD +80 -0
- {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/top_level.txt +0 -2
- docs/api/index.md +0 -386
- docs/assets/favicon.png +0 -3
- docs/assets/favicon.svg +0 -3
- docs/assets/javascripts/custom.js +0 -17
- docs/assets/logo.svg +0 -3
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +0 -17
- docs/assets/social-preview.svg +0 -17
- docs/assets/stylesheets/custom.css +0 -65
- docs/categorizing-documents/index.md +0 -168
- docs/data-extraction/index.md +0 -87
- docs/document-qa/index.ipynb +0 -435
- docs/document-qa/index.md +0 -79
- docs/element-selection/index.ipynb +0 -969
- docs/element-selection/index.md +0 -249
- docs/finetuning/index.md +0 -176
- docs/index.md +0 -189
- docs/installation/index.md +0 -69
- docs/interactive-widget/index.ipynb +0 -962
- docs/interactive-widget/index.md +0 -12
- docs/layout-analysis/index.ipynb +0 -818
- docs/layout-analysis/index.md +0 -185
- docs/ocr/index.md +0 -256
- docs/pdf-navigation/index.ipynb +0 -314
- docs/pdf-navigation/index.md +0 -97
- docs/regions/index.ipynb +0 -816
- docs/regions/index.md +0 -294
- docs/tables/index.ipynb +0 -658
- docs/tables/index.md +0 -144
- docs/text-analysis/index.ipynb +0 -370
- docs/text-analysis/index.md +0 -105
- docs/text-extraction/index.ipynb +0 -1478
- docs/text-extraction/index.md +0 -292
- docs/tutorials/01-loading-and-extraction.ipynb +0 -1873
- docs/tutorials/01-loading-and-extraction.md +0 -95
- docs/tutorials/02-finding-elements.ipynb +0 -417
- docs/tutorials/02-finding-elements.md +0 -149
- docs/tutorials/03-extracting-blocks.ipynb +0 -152
- docs/tutorials/03-extracting-blocks.md +0 -48
- docs/tutorials/04-table-extraction.ipynb +0 -119
- docs/tutorials/04-table-extraction.md +0 -50
- docs/tutorials/05-excluding-content.ipynb +0 -275
- docs/tutorials/05-excluding-content.md +0 -109
- docs/tutorials/06-document-qa.ipynb +0 -337
- docs/tutorials/06-document-qa.md +0 -91
- docs/tutorials/07-layout-analysis.ipynb +0 -293
- docs/tutorials/07-layout-analysis.md +0 -66
- docs/tutorials/07-working-with-regions.ipynb +0 -414
- docs/tutorials/07-working-with-regions.md +0 -151
- docs/tutorials/08-spatial-navigation.ipynb +0 -513
- docs/tutorials/08-spatial-navigation.md +0 -190
- docs/tutorials/09-section-extraction.ipynb +0 -2439
- docs/tutorials/09-section-extraction.md +0 -256
- docs/tutorials/10-form-field-extraction.ipynb +0 -517
- docs/tutorials/10-form-field-extraction.md +0 -201
- docs/tutorials/11-enhanced-table-processing.ipynb +0 -59
- docs/tutorials/11-enhanced-table-processing.md +0 -9
- docs/tutorials/12-ocr-integration.ipynb +0 -3712
- docs/tutorials/12-ocr-integration.md +0 -137
- docs/tutorials/13-semantic-search.ipynb +0 -1718
- docs/tutorials/13-semantic-search.md +0 -77
- docs/visual-debugging/index.ipynb +0 -2970
- docs/visual-debugging/index.md +0 -157
- docs/visual-debugging/region.png +0 -0
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -420
- natural_pdf/templates/spa/css/style.css +0 -334
- natural_pdf/templates/spa/index.html +0 -31
- natural_pdf/templates/spa/js/app.js +0 -472
- natural_pdf/templates/spa/words.txt +0 -235976
- natural_pdf/widgets/frontend/viewer.js +0 -88
- natural_pdf-0.1.8.dist-info/RECORD +0 -156
- notebooks/Examples.ipynb +0 -1293
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +0 -543
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/licenses/LICENSE +0 -0
natural_pdf/core/page.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
import base64
|
2
|
+
import concurrent.futures # Added import
|
2
3
|
import hashlib
|
3
4
|
import io
|
4
5
|
import json
|
@@ -6,19 +7,30 @@ import logging
|
|
6
7
|
import os
|
7
8
|
import re
|
8
9
|
import tempfile
|
9
|
-
import time # Import time
|
10
|
-
from pathlib import Path
|
11
|
-
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
|
12
|
-
import concurrent.futures # Added import
|
13
|
-
from tqdm.auto import tqdm # Added tqdm import
|
14
10
|
import threading
|
11
|
+
import time # Import time
|
12
|
+
from pathlib import Path
|
13
|
+
from typing import ( # Added overload
|
14
|
+
TYPE_CHECKING,
|
15
|
+
Any,
|
16
|
+
Callable,
|
17
|
+
Dict,
|
18
|
+
List,
|
19
|
+
Optional,
|
20
|
+
Tuple,
|
21
|
+
Union,
|
22
|
+
overload,
|
23
|
+
)
|
15
24
|
|
16
25
|
import pdfplumber
|
17
26
|
from PIL import Image, ImageDraw
|
27
|
+
from tqdm.auto import tqdm # Added tqdm import
|
18
28
|
|
19
29
|
from natural_pdf.elements.collections import ElementCollection
|
20
30
|
from natural_pdf.elements.region import Region
|
31
|
+
from natural_pdf.selectors.parser import parse_selector
|
21
32
|
from natural_pdf.utils.locks import pdf_render_lock # Import from utils instead
|
33
|
+
from natural_pdf.utils.visualization import render_plain_page
|
22
34
|
|
23
35
|
if TYPE_CHECKING:
|
24
36
|
import pdfplumber
|
@@ -31,6 +43,8 @@ if TYPE_CHECKING:
|
|
31
43
|
# New Imports
|
32
44
|
import itertools
|
33
45
|
|
46
|
+
# Deskew Imports (Conditional)
|
47
|
+
import numpy as np
|
34
48
|
from pdfplumber.utils.geometry import get_bbox_overlap, merge_bboxes, objects_to_bbox
|
35
49
|
from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
|
36
50
|
|
@@ -39,27 +53,35 @@ from natural_pdf.analyzers.layout.layout_manager import LayoutManager
|
|
39
53
|
from natural_pdf.analyzers.layout.layout_options import LayoutOptions
|
40
54
|
from natural_pdf.analyzers.text_options import TextStyleOptions
|
41
55
|
from natural_pdf.analyzers.text_structure import TextStyleAnalyzer
|
56
|
+
from natural_pdf.classification.manager import ClassificationManager # For type hint
|
57
|
+
|
58
|
+
# --- Classification Imports --- #
|
59
|
+
from natural_pdf.classification.mixin import ClassificationMixin # Import classification mixin
|
42
60
|
from natural_pdf.core.element_manager import ElementManager
|
61
|
+
from natural_pdf.elements.base import Element # Import base element
|
43
62
|
from natural_pdf.elements.text import TextElement
|
63
|
+
from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
|
44
64
|
from natural_pdf.ocr import OCRManager, OCROptions
|
65
|
+
from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
|
66
|
+
from natural_pdf.qa import DocumentQA, get_qa_engine
|
67
|
+
from natural_pdf.utils.locks import pdf_render_lock # Import the lock
|
45
68
|
|
46
69
|
# Import new utils
|
47
70
|
from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
|
48
71
|
from natural_pdf.widgets import InteractiveViewerWidget
|
49
72
|
from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, SimpleInteractiveViewerWidget
|
50
73
|
|
51
|
-
from natural_pdf.qa import DocumentQA, get_qa_engine
|
52
|
-
from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
|
53
|
-
|
54
|
-
# --- Classification Imports --- #
|
55
|
-
from natural_pdf.classification.mixin import ClassificationMixin
|
56
|
-
from natural_pdf.classification.manager import ClassificationManager # For type hint
|
57
74
|
# --- End Classification Imports --- #
|
58
75
|
|
59
|
-
|
60
|
-
|
61
|
-
from
|
62
|
-
|
76
|
+
|
77
|
+
try:
|
78
|
+
from deskew import determine_skew
|
79
|
+
|
80
|
+
DESKEW_AVAILABLE = True
|
81
|
+
except ImportError:
|
82
|
+
DESKEW_AVAILABLE = False
|
83
|
+
determine_skew = None
|
84
|
+
# End Deskew Imports
|
63
85
|
|
64
86
|
logger = logging.getLogger(__name__)
|
65
87
|
|
@@ -87,6 +109,7 @@ class Page(ClassificationMixin, ExtractionMixin):
|
|
87
109
|
self._index = index
|
88
110
|
self._text_styles = None # Lazy-loaded text style analyzer results
|
89
111
|
self._exclusions = [] # List to store exclusion functions/regions
|
112
|
+
self._skew_angle: Optional[float] = None # Stores detected skew angle
|
90
113
|
|
91
114
|
# --- ADDED --- Metadata store for mixins
|
92
115
|
self.metadata: Dict[str, Any] = {}
|
@@ -436,25 +459,79 @@ class Page(ClassificationMixin, ExtractionMixin):
|
|
436
459
|
|
437
460
|
return filtered_elements
|
438
461
|
|
439
|
-
|
462
|
+
@overload
|
463
|
+
def find(
|
464
|
+
self,
|
465
|
+
*,
|
466
|
+
text: str,
|
467
|
+
apply_exclusions: bool = True,
|
468
|
+
regex: bool = False,
|
469
|
+
case: bool = True,
|
470
|
+
**kwargs,
|
471
|
+
) -> Optional[Any]: ...
|
472
|
+
|
473
|
+
@overload
|
474
|
+
def find(
|
475
|
+
self,
|
476
|
+
selector: str,
|
477
|
+
*,
|
478
|
+
apply_exclusions: bool = True,
|
479
|
+
regex: bool = False,
|
480
|
+
case: bool = True,
|
481
|
+
**kwargs,
|
482
|
+
) -> Optional[Any]: ...
|
483
|
+
|
484
|
+
def find(
|
485
|
+
self,
|
486
|
+
selector: Optional[str] = None, # Now optional
|
487
|
+
*, # Force subsequent args to be keyword-only
|
488
|
+
text: Optional[str] = None, # New text parameter
|
489
|
+
apply_exclusions: bool = True,
|
490
|
+
regex: bool = False,
|
491
|
+
case: bool = True,
|
492
|
+
**kwargs,
|
493
|
+
) -> Optional[Any]:
|
440
494
|
"""
|
441
|
-
Find first element on this page matching selector.
|
495
|
+
Find first element on this page matching selector OR text content.
|
496
|
+
|
497
|
+
Provide EITHER `selector` OR `text`, but not both.
|
442
498
|
|
443
499
|
Args:
|
444
|
-
selector: CSS-like selector string
|
445
|
-
|
446
|
-
|
447
|
-
|
448
|
-
|
500
|
+
selector: CSS-like selector string.
|
501
|
+
text: Text content to search for (equivalent to 'text:contains(...)').
|
502
|
+
apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
|
503
|
+
regex: Whether to use regex for text search (`selector` or `text`) (default: False).
|
504
|
+
case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
|
505
|
+
**kwargs: Additional filter parameters.
|
449
506
|
|
450
507
|
Returns:
|
451
|
-
Element object or None if not found
|
452
|
-
"""
|
453
|
-
|
508
|
+
Element object or None if not found.
|
509
|
+
"""
|
510
|
+
if selector is not None and text is not None:
|
511
|
+
raise ValueError("Provide either 'selector' or 'text', not both.")
|
512
|
+
if selector is None and text is None:
|
513
|
+
raise ValueError("Provide either 'selector' or 'text'.")
|
514
|
+
|
515
|
+
# Construct selector if 'text' is provided
|
516
|
+
effective_selector = ""
|
517
|
+
if text is not None:
|
518
|
+
# Escape quotes within the text for the selector string
|
519
|
+
escaped_text = text.replace('"', '\\"').replace("'", "\\'")
|
520
|
+
# Default to 'text:contains(...)'
|
521
|
+
effective_selector = f'text:contains("{escaped_text}")'
|
522
|
+
# Note: regex/case handled by kwargs passed down
|
523
|
+
logger.debug(
|
524
|
+
f"Using text shortcut: find(text='{text}') -> find('{effective_selector}')"
|
525
|
+
)
|
526
|
+
elif selector is not None:
|
527
|
+
effective_selector = selector
|
528
|
+
else:
|
529
|
+
# Should be unreachable due to checks above
|
530
|
+
raise ValueError("Internal error: No selector or text provided.")
|
454
531
|
|
455
|
-
selector_obj = parse_selector(
|
532
|
+
selector_obj = parse_selector(effective_selector)
|
456
533
|
|
457
|
-
# Pass regex and case flags to selector function
|
534
|
+
# Pass regex and case flags to selector function via kwargs
|
458
535
|
kwargs["regex"] = regex
|
459
536
|
kwargs["case"] = case
|
460
537
|
|
@@ -474,27 +551,80 @@ class Page(ClassificationMixin, ExtractionMixin):
|
|
474
551
|
else:
|
475
552
|
return None
|
476
553
|
|
554
|
+
@overload
|
555
|
+
def find_all(
|
556
|
+
self,
|
557
|
+
*,
|
558
|
+
text: str,
|
559
|
+
apply_exclusions: bool = True,
|
560
|
+
regex: bool = False,
|
561
|
+
case: bool = True,
|
562
|
+
**kwargs,
|
563
|
+
) -> "ElementCollection": ...
|
564
|
+
|
565
|
+
@overload
|
477
566
|
def find_all(
|
478
|
-
self,
|
567
|
+
self,
|
568
|
+
selector: str,
|
569
|
+
*,
|
570
|
+
apply_exclusions: bool = True,
|
571
|
+
regex: bool = False,
|
572
|
+
case: bool = True,
|
573
|
+
**kwargs,
|
574
|
+
) -> "ElementCollection": ...
|
575
|
+
|
576
|
+
def find_all(
|
577
|
+
self,
|
578
|
+
selector: Optional[str] = None, # Now optional
|
579
|
+
*, # Force subsequent args to be keyword-only
|
580
|
+
text: Optional[str] = None, # New text parameter
|
581
|
+
apply_exclusions: bool = True,
|
582
|
+
regex: bool = False,
|
583
|
+
case: bool = True,
|
584
|
+
**kwargs,
|
479
585
|
) -> "ElementCollection":
|
480
586
|
"""
|
481
|
-
Find all elements on this page matching selector.
|
587
|
+
Find all elements on this page matching selector OR text content.
|
588
|
+
|
589
|
+
Provide EITHER `selector` OR `text`, but not both.
|
482
590
|
|
483
591
|
Args:
|
484
|
-
selector: CSS-like selector string
|
485
|
-
|
486
|
-
|
487
|
-
|
488
|
-
|
592
|
+
selector: CSS-like selector string.
|
593
|
+
text: Text content to search for (equivalent to 'text:contains(...)').
|
594
|
+
apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
|
595
|
+
regex: Whether to use regex for text search (`selector` or `text`) (default: False).
|
596
|
+
case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
|
597
|
+
**kwargs: Additional filter parameters.
|
489
598
|
|
490
599
|
Returns:
|
491
|
-
ElementCollection with matching elements
|
492
|
-
"""
|
493
|
-
from natural_pdf.
|
600
|
+
ElementCollection with matching elements.
|
601
|
+
"""
|
602
|
+
from natural_pdf.elements.collections import ElementCollection # Import here for type hint
|
603
|
+
|
604
|
+
if selector is not None and text is not None:
|
605
|
+
raise ValueError("Provide either 'selector' or 'text', not both.")
|
606
|
+
if selector is None and text is None:
|
607
|
+
raise ValueError("Provide either 'selector' or 'text'.")
|
608
|
+
|
609
|
+
# Construct selector if 'text' is provided
|
610
|
+
effective_selector = ""
|
611
|
+
if text is not None:
|
612
|
+
# Escape quotes within the text for the selector string
|
613
|
+
escaped_text = text.replace('"', '\\"').replace("'", "\\'")
|
614
|
+
# Default to 'text:contains(...)'
|
615
|
+
effective_selector = f'text:contains("{escaped_text}")'
|
616
|
+
logger.debug(
|
617
|
+
f"Using text shortcut: find_all(text='{text}') -> find_all('{effective_selector}')"
|
618
|
+
)
|
619
|
+
elif selector is not None:
|
620
|
+
effective_selector = selector
|
621
|
+
else:
|
622
|
+
# Should be unreachable due to checks above
|
623
|
+
raise ValueError("Internal error: No selector or text provided.")
|
494
624
|
|
495
|
-
selector_obj = parse_selector(
|
625
|
+
selector_obj = parse_selector(effective_selector)
|
496
626
|
|
497
|
-
# Pass regex and case flags to selector function
|
627
|
+
# Pass regex and case flags to selector function via kwargs
|
498
628
|
kwargs["regex"] = regex
|
499
629
|
kwargs["case"] = case
|
500
630
|
|
@@ -1282,18 +1412,22 @@ class Page(ClassificationMixin, ExtractionMixin):
|
|
1282
1412
|
image = None
|
1283
1413
|
render_resolution = resolution if resolution is not None else scale * 72
|
1284
1414
|
thread_id = threading.current_thread().name
|
1285
|
-
logger.debug(
|
1415
|
+
logger.debug(
|
1416
|
+
f"[{thread_id}] Page {self.index}: Attempting to acquire pdf_render_lock for to_image..."
|
1417
|
+
)
|
1286
1418
|
lock_wait_start = time.monotonic()
|
1287
1419
|
try:
|
1288
1420
|
# Acquire the global PDF rendering lock
|
1289
1421
|
with pdf_render_lock:
|
1290
1422
|
lock_acquired_time = time.monotonic()
|
1291
|
-
logger.debug(
|
1423
|
+
logger.debug(
|
1424
|
+
f"[{thread_id}] Page {self.index}: Acquired pdf_render_lock (waited {lock_acquired_time - lock_wait_start:.2f}s). Starting render..."
|
1425
|
+
)
|
1292
1426
|
if include_highlights:
|
1293
1427
|
# Delegate rendering to the central service
|
1294
1428
|
image = self._highlighter.render_page(
|
1295
1429
|
page_index=self.index,
|
1296
|
-
scale=scale,
|
1430
|
+
scale=scale,
|
1297
1431
|
labels=labels,
|
1298
1432
|
legend_position=legend_position,
|
1299
1433
|
render_ocr=render_ocr,
|
@@ -1301,28 +1435,15 @@ class Page(ClassificationMixin, ExtractionMixin):
|
|
1301
1435
|
**kwargs,
|
1302
1436
|
)
|
1303
1437
|
else:
|
1304
|
-
|
1305
|
-
# Use the underlying pdfplumber page object
|
1306
|
-
img_object = self._page.to_image(resolution=render_resolution, **kwargs)
|
1307
|
-
# Access the PIL image directly (assuming pdfplumber structure)
|
1308
|
-
image = (
|
1309
|
-
img_object.annotated
|
1310
|
-
if hasattr(img_object, "annotated")
|
1311
|
-
else img_object._repr_png_()
|
1312
|
-
)
|
1313
|
-
if isinstance(image, bytes): # Handle cases where it returns bytes
|
1314
|
-
from io import BytesIO
|
1315
|
-
|
1316
|
-
image = Image.open(BytesIO(image)).convert(
|
1317
|
-
"RGB"
|
1318
|
-
) # Convert to RGB for consistency
|
1319
|
-
|
1438
|
+
image = render_plain_page(self, render_resolution)
|
1320
1439
|
except Exception as e:
|
1321
1440
|
logger.error(f"Error rendering page {self.index}: {e}", exc_info=True)
|
1322
1441
|
return None # Return None on error
|
1323
1442
|
finally:
|
1324
1443
|
render_end_time = time.monotonic()
|
1325
|
-
logger.debug(
|
1444
|
+
logger.debug(
|
1445
|
+
f"[{thread_id}] Page {self.index}: Released pdf_render_lock. Total render time (incl. lock wait): {render_end_time - lock_wait_start:.2f}s"
|
1446
|
+
)
|
1326
1447
|
|
1327
1448
|
if image is None:
|
1328
1449
|
return None
|
@@ -1445,7 +1566,9 @@ class Page(ClassificationMixin, ExtractionMixin):
|
|
1445
1566
|
|
1446
1567
|
# Remove existing OCR elements if replace is True
|
1447
1568
|
if replace and hasattr(self, "_element_mgr"):
|
1448
|
-
logger.info(
|
1569
|
+
logger.info(
|
1570
|
+
f"Page {self.number}: Removing existing OCR elements before applying new OCR."
|
1571
|
+
)
|
1449
1572
|
self._element_mgr.remove_ocr_elements()
|
1450
1573
|
|
1451
1574
|
logger.info(f"Page {self.number}: Delegating apply_ocr to PDF.apply_ocr.")
|
@@ -1513,7 +1636,9 @@ class Page(ClassificationMixin, ExtractionMixin):
|
|
1513
1636
|
with pdf_render_lock:
|
1514
1637
|
image = self.to_image(resolution=final_resolution, include_highlights=False)
|
1515
1638
|
if not image:
|
1516
|
-
logger.error(
|
1639
|
+
logger.error(
|
1640
|
+
f" Failed to render page {self.number} to image for OCR extraction."
|
1641
|
+
)
|
1517
1642
|
return []
|
1518
1643
|
logger.debug(f" Rendered image size: {image.width}x{image.height}")
|
1519
1644
|
except Exception as e:
|
@@ -1585,6 +1710,11 @@ class Page(ClassificationMixin, ExtractionMixin):
|
|
1585
1710
|
logger.info(f" Created {len(temp_elements)} TextElements from OCR (extract only).")
|
1586
1711
|
return temp_elements
|
1587
1712
|
|
1713
|
+
@property
|
1714
|
+
def size(self) -> Tuple[float, float]:
|
1715
|
+
"""Get the size of the page in points."""
|
1716
|
+
return (self._page.width, self._page.height)
|
1717
|
+
|
1588
1718
|
@property
|
1589
1719
|
def layout_analyzer(self) -> LayoutAnalyzer:
|
1590
1720
|
"""Get or create the layout analyzer for this page."""
|
@@ -1604,6 +1734,8 @@ class Page(ClassificationMixin, ExtractionMixin):
|
|
1604
1734
|
exclude_classes: Optional[List[str]] = None,
|
1605
1735
|
device: Optional[str] = None,
|
1606
1736
|
existing: str = "replace",
|
1737
|
+
model_name: Optional[str] = None,
|
1738
|
+
client: Optional[Any] = None, # Add client parameter
|
1607
1739
|
) -> ElementCollection[Region]:
|
1608
1740
|
"""
|
1609
1741
|
Analyze the page layout using the configured LayoutManager.
|
@@ -1629,6 +1761,8 @@ class Page(ClassificationMixin, ExtractionMixin):
|
|
1629
1761
|
exclude_classes=exclude_classes,
|
1630
1762
|
device=device,
|
1631
1763
|
existing=existing,
|
1764
|
+
model_name=model_name,
|
1765
|
+
client=client, # Pass client down
|
1632
1766
|
)
|
1633
1767
|
|
1634
1768
|
# Retrieve the detected regions from the element manager
|
@@ -1699,14 +1833,24 @@ class Page(ClassificationMixin, ExtractionMixin):
|
|
1699
1833
|
)
|
1700
1834
|
return None
|
1701
1835
|
|
1836
|
+
def split(self, divider, **kwargs) -> "ElementCollection[Region]":
|
1837
|
+
"""
|
1838
|
+
Divides the page into sections based on the provided divider elements.
|
1839
|
+
"""
|
1840
|
+
sections = self.get_sections(start_elements=divider, **kwargs)
|
1841
|
+
top = self.region(0, 0, self.width, sections[0].top)
|
1842
|
+
sections.append(top)
|
1843
|
+
|
1844
|
+
return sections
|
1845
|
+
|
1702
1846
|
def get_sections(
|
1703
1847
|
self,
|
1704
1848
|
start_elements=None,
|
1705
1849
|
end_elements=None,
|
1706
|
-
boundary_inclusion="
|
1850
|
+
boundary_inclusion="start",
|
1707
1851
|
y_threshold=5.0,
|
1708
1852
|
bounding_box=None,
|
1709
|
-
) -> "ElementCollection[Region]":
|
1853
|
+
) -> "ElementCollection[Region]":
|
1710
1854
|
"""
|
1711
1855
|
Get sections of a page defined by start/end elements.
|
1712
1856
|
Uses the page-level implementation.
|
@@ -2068,7 +2212,7 @@ class Page(ClassificationMixin, ExtractionMixin):
|
|
2068
2212
|
self,
|
2069
2213
|
correction_callback: Callable[[Any], Optional[str]],
|
2070
2214
|
max_workers: Optional[int] = None,
|
2071
|
-
progress_callback: Optional[Callable[[], None]] = None,
|
2215
|
+
progress_callback: Optional[Callable[[], None]] = None, # Added progress callback
|
2072
2216
|
) -> "Page": # Return self for chaining
|
2073
2217
|
"""
|
2074
2218
|
Applies corrections to OCR-generated text elements on this page
|
@@ -2096,7 +2240,7 @@ class Page(ClassificationMixin, ExtractionMixin):
|
|
2096
2240
|
target_elements_collection = self.find_all(
|
2097
2241
|
selector="text[source=ocr]", apply_exclusions=False
|
2098
2242
|
)
|
2099
|
-
target_elements = target_elements_collection.elements
|
2243
|
+
target_elements = target_elements_collection.elements # Get the list
|
2100
2244
|
|
2101
2245
|
if not target_elements:
|
2102
2246
|
logger.info(f"Page {self.number}: No OCR elements found to correct.")
|
@@ -2109,22 +2253,24 @@ class Page(ClassificationMixin, ExtractionMixin):
|
|
2109
2253
|
# Define the task to be run by the worker thread or sequentially
|
2110
2254
|
def _process_element_task(element):
|
2111
2255
|
try:
|
2112
|
-
current_text = getattr(element,
|
2256
|
+
current_text = getattr(element, "text", None)
|
2113
2257
|
# Call the user-provided callback
|
2114
2258
|
corrected_text = correction_callback(element)
|
2115
2259
|
|
2116
2260
|
# Validate result type
|
2117
2261
|
if corrected_text is not None and not isinstance(corrected_text, str):
|
2118
|
-
logger.warning(
|
2119
|
-
|
2262
|
+
logger.warning(
|
2263
|
+
f"Page {self.number}: Correction callback for element '{getattr(element, 'text', '')[:20]}...' returned non-string, non-None type: {type(corrected_text)}. Skipping update."
|
2264
|
+
)
|
2265
|
+
return element, None, None # Treat as no correction
|
2120
2266
|
|
2121
2267
|
return element, corrected_text, None # Return element, result, no error
|
2122
2268
|
except Exception as e:
|
2123
2269
|
logger.error(
|
2124
2270
|
f"Page {self.number}: Error applying correction callback to element '{getattr(element, 'text', '')[:30]}...' ({element.bbox}): {e}",
|
2125
|
-
exc_info=False
|
2271
|
+
exc_info=False, # Keep log concise
|
2126
2272
|
)
|
2127
|
-
return element, None, e
|
2273
|
+
return element, None, e # Return element, no result, error
|
2128
2274
|
finally:
|
2129
2275
|
# --- Call progress callback here --- #
|
2130
2276
|
if progress_callback:
|
@@ -2132,16 +2278,24 @@ class Page(ClassificationMixin, ExtractionMixin):
|
|
2132
2278
|
progress_callback()
|
2133
2279
|
except Exception as cb_e:
|
2134
2280
|
# Log error in callback itself, but don't stop processing
|
2135
|
-
logger.error(
|
2281
|
+
logger.error(
|
2282
|
+
f"Page {self.number}: Error executing progress_callback: {cb_e}",
|
2283
|
+
exc_info=False,
|
2284
|
+
)
|
2136
2285
|
|
2137
2286
|
# Choose execution strategy based on max_workers
|
2138
2287
|
if max_workers is not None and max_workers > 1:
|
2139
2288
|
# --- Parallel execution --- #
|
2140
|
-
logger.info(
|
2289
|
+
logger.info(
|
2290
|
+
f"Page {self.number}: Running OCR correction in parallel with {max_workers} workers."
|
2291
|
+
)
|
2141
2292
|
futures = []
|
2142
2293
|
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
|
2143
2294
|
# Submit all tasks
|
2144
|
-
future_to_element = {
|
2295
|
+
future_to_element = {
|
2296
|
+
executor.submit(_process_element_task, element): element
|
2297
|
+
for element in target_elements
|
2298
|
+
}
|
2145
2299
|
|
2146
2300
|
# Process results as they complete (progress_callback called by worker)
|
2147
2301
|
for future in concurrent.futures.as_completed(future_to_element):
|
@@ -2153,14 +2307,17 @@ class Page(ClassificationMixin, ExtractionMixin):
|
|
2153
2307
|
# Error already logged in worker
|
2154
2308
|
elif corrected_text is not None:
|
2155
2309
|
# Apply correction if text changed
|
2156
|
-
current_text = getattr(element,
|
2310
|
+
current_text = getattr(element, "text", None)
|
2157
2311
|
if corrected_text != current_text:
|
2158
2312
|
element.text = corrected_text
|
2159
2313
|
updated_count += 1
|
2160
2314
|
except Exception as exc:
|
2161
2315
|
# Catch errors from future.result() itself
|
2162
|
-
element = future_to_element[future]
|
2163
|
-
logger.error(
|
2316
|
+
element = future_to_element[future] # Find original element
|
2317
|
+
logger.error(
|
2318
|
+
f"Page {self.number}: Internal error retrieving correction result for element {element.bbox}: {exc}",
|
2319
|
+
exc_info=True,
|
2320
|
+
)
|
2164
2321
|
error_count += 1
|
2165
2322
|
# Note: progress_callback was already called in the worker's finally block
|
2166
2323
|
|
@@ -2168,65 +2325,230 @@ class Page(ClassificationMixin, ExtractionMixin):
|
|
2168
2325
|
# --- Sequential execution --- #
|
2169
2326
|
logger.info(f"Page {self.number}: Running OCR correction sequentially.")
|
2170
2327
|
for element in target_elements:
|
2171
|
-
|
2172
|
-
|
2173
|
-
|
2174
|
-
|
2175
|
-
|
2176
|
-
|
2177
|
-
|
2178
|
-
|
2179
|
-
|
2180
|
-
|
2181
|
-
|
2328
|
+
# Call the task function directly (it handles progress_callback)
|
2329
|
+
processed_count += 1
|
2330
|
+
_element, corrected_text, error = _process_element_task(element)
|
2331
|
+
if error:
|
2332
|
+
error_count += 1
|
2333
|
+
elif corrected_text is not None:
|
2334
|
+
# Apply correction if text changed
|
2335
|
+
current_text = getattr(_element, "text", None)
|
2336
|
+
if corrected_text != current_text:
|
2337
|
+
_element.text = corrected_text
|
2338
|
+
updated_count += 1
|
2182
2339
|
|
2183
2340
|
logger.info(
|
2184
|
-
|
2341
|
+
f"Page {self.number}: OCR correction finished. Processed: {processed_count}/{len(target_elements)}, Updated: {updated_count}, Errors: {error_count}."
|
2185
2342
|
)
|
2186
2343
|
|
2187
|
-
return self
|
2344
|
+
return self # Return self for chaining
|
2188
2345
|
|
2189
2346
|
# --- Classification Mixin Implementation --- #
|
2190
2347
|
def _get_classification_manager(self) -> "ClassificationManager":
|
2191
|
-
if not hasattr(self,
|
2192
|
-
|
2348
|
+
if not hasattr(self, "pdf") or not hasattr(self.pdf, "get_manager"):
|
2349
|
+
raise AttributeError(
|
2350
|
+
"ClassificationManager cannot be accessed: Parent PDF or get_manager method missing."
|
2351
|
+
)
|
2193
2352
|
try:
|
2194
|
-
|
2195
|
-
|
2353
|
+
# Use the PDF's manager registry accessor
|
2354
|
+
return self.pdf.get_manager("classification")
|
2196
2355
|
except (ValueError, RuntimeError, AttributeError) as e:
|
2197
2356
|
# Wrap potential errors from get_manager for clarity
|
2198
2357
|
raise AttributeError(f"Failed to get ClassificationManager from PDF: {e}") from e
|
2199
2358
|
|
2200
|
-
def _get_classification_content(
|
2201
|
-
|
2202
|
-
|
2359
|
+
def _get_classification_content(
|
2360
|
+
self, model_type: str, **kwargs
|
2361
|
+
) -> Union[str, "Image"]: # Use "Image" for lazy import
|
2362
|
+
if model_type == "text":
|
2363
|
+
text_content = self.extract_text(
|
2364
|
+
layout=False, use_exclusions=False
|
2365
|
+
) # Simple join, ignore exclusions for classification
|
2203
2366
|
if not text_content or text_content.isspace():
|
2204
2367
|
raise ValueError("Cannot classify page with 'text' model: No text content found.")
|
2205
2368
|
return text_content
|
2206
|
-
elif model_type ==
|
2369
|
+
elif model_type == "vision":
|
2207
2370
|
# Get resolution from manager/kwargs if possible, else default
|
2208
2371
|
manager = self._get_classification_manager()
|
2209
2372
|
default_resolution = 150
|
2210
2373
|
# Access kwargs passed to classify method if needed
|
2211
|
-
resolution =
|
2374
|
+
resolution = (
|
2375
|
+
kwargs.get("resolution", default_resolution)
|
2376
|
+
if "kwargs" in locals()
|
2377
|
+
else default_resolution
|
2378
|
+
)
|
2212
2379
|
|
2213
2380
|
# Use to_image, ensuring no highlights interfere
|
2214
2381
|
img = self.to_image(
|
2215
2382
|
resolution=resolution,
|
2216
2383
|
include_highlights=False,
|
2217
2384
|
labels=False,
|
2218
|
-
exclusions=None
|
2385
|
+
exclusions=None, # Don't mask exclusions for classification input image
|
2219
2386
|
)
|
2220
2387
|
if img is None:
|
2221
|
-
raise ValueError(
|
2388
|
+
raise ValueError(
|
2389
|
+
"Cannot classify page with 'vision' model: Failed to render image."
|
2390
|
+
)
|
2222
2391
|
return img
|
2223
2392
|
else:
|
2224
2393
|
raise ValueError(f"Unsupported model_type for classification: {model_type}")
|
2225
2394
|
|
2226
2395
|
def _get_metadata_storage(self) -> Dict[str, Any]:
|
2227
2396
|
# Ensure metadata exists
|
2228
|
-
if not hasattr(self,
|
2397
|
+
if not hasattr(self, "metadata") or self.metadata is None:
|
2229
2398
|
self.metadata = {}
|
2230
2399
|
return self.metadata
|
2231
2400
|
|
2232
2401
|
# --- Content Extraction ---
|
2402
|
+
|
2403
|
+
# --- Skew Detection and Correction --- #
|
2404
|
+
|
2405
|
+
@property
|
2406
|
+
def skew_angle(self) -> Optional[float]:
|
2407
|
+
"""Get the detected skew angle for this page (if calculated)."""
|
2408
|
+
return self._skew_angle
|
2409
|
+
|
2410
|
+
def detect_skew_angle(
|
2411
|
+
self,
|
2412
|
+
resolution: int = 72,
|
2413
|
+
grayscale: bool = True,
|
2414
|
+
force_recalculate: bool = False,
|
2415
|
+
**deskew_kwargs,
|
2416
|
+
) -> Optional[float]:
|
2417
|
+
"""
|
2418
|
+
Detects the skew angle of the page image and stores it.
|
2419
|
+
|
2420
|
+
Args:
|
2421
|
+
resolution: DPI resolution for rendering the page image for detection.
|
2422
|
+
grayscale: Whether to convert the image to grayscale before detection.
|
2423
|
+
force_recalculate: If True, recalculate even if an angle exists.
|
2424
|
+
**deskew_kwargs: Additional keyword arguments passed to `deskew.determine_skew`
|
2425
|
+
(e.g., `max_angle`, `num_peaks`).
|
2426
|
+
|
2427
|
+
Returns:
|
2428
|
+
The detected skew angle in degrees, or None if detection failed.
|
2429
|
+
|
2430
|
+
Raises:
|
2431
|
+
ImportError: If the 'deskew' library is not installed.
|
2432
|
+
"""
|
2433
|
+
if not DESKEW_AVAILABLE:
|
2434
|
+
raise ImportError(
|
2435
|
+
"Deskew library not found. Install with: pip install natural-pdf[deskew]"
|
2436
|
+
)
|
2437
|
+
|
2438
|
+
if self._skew_angle is not None and not force_recalculate:
|
2439
|
+
logger.debug(f"Page {self.number}: Returning cached skew angle: {self._skew_angle:.2f}")
|
2440
|
+
return self._skew_angle
|
2441
|
+
|
2442
|
+
logger.debug(f"Page {self.number}: Detecting skew angle (resolution={resolution} DPI)...")
|
2443
|
+
try:
|
2444
|
+
# Render the page at the specified detection resolution
|
2445
|
+
img = self.to_image(resolution=resolution, include_highlights=False)
|
2446
|
+
if not img:
|
2447
|
+
logger.warning(f"Page {self.number}: Failed to render image for skew detection.")
|
2448
|
+
self._skew_angle = None
|
2449
|
+
return None
|
2450
|
+
|
2451
|
+
# Convert to numpy array
|
2452
|
+
img_np = np.array(img)
|
2453
|
+
|
2454
|
+
# Convert to grayscale if needed
|
2455
|
+
if grayscale:
|
2456
|
+
if len(img_np.shape) == 3 and img_np.shape[2] >= 3:
|
2457
|
+
gray_np = np.mean(img_np[:, :, :3], axis=2).astype(np.uint8)
|
2458
|
+
elif len(img_np.shape) == 2:
|
2459
|
+
gray_np = img_np # Already grayscale
|
2460
|
+
else:
|
2461
|
+
logger.warning(
|
2462
|
+
f"Page {self.number}: Unexpected image shape {img_np.shape} for grayscale conversion."
|
2463
|
+
)
|
2464
|
+
gray_np = img_np # Try using it anyway
|
2465
|
+
else:
|
2466
|
+
gray_np = img_np # Use original if grayscale=False
|
2467
|
+
|
2468
|
+
# Determine skew angle using the deskew library
|
2469
|
+
angle = determine_skew(gray_np, **deskew_kwargs)
|
2470
|
+
self._skew_angle = angle
|
2471
|
+
logger.debug(f"Page {self.number}: Detected skew angle = {angle}")
|
2472
|
+
return angle
|
2473
|
+
|
2474
|
+
except Exception as e:
|
2475
|
+
logger.warning(f"Page {self.number}: Failed during skew detection: {e}", exc_info=True)
|
2476
|
+
self._skew_angle = None
|
2477
|
+
return None
|
2478
|
+
|
2479
|
+
def deskew(
|
2480
|
+
self,
|
2481
|
+
resolution: int = 300,
|
2482
|
+
angle: Optional[float] = None,
|
2483
|
+
detection_resolution: int = 72,
|
2484
|
+
**deskew_kwargs,
|
2485
|
+
) -> Optional[Image.Image]:
|
2486
|
+
"""
|
2487
|
+
Creates and returns a deskewed PIL image of the page.
|
2488
|
+
|
2489
|
+
If `angle` is not provided, it will first try to detect the skew angle
|
2490
|
+
using `detect_skew_angle` (or use the cached angle if available).
|
2491
|
+
|
2492
|
+
Args:
|
2493
|
+
resolution: DPI resolution for the output deskewed image.
|
2494
|
+
angle: The specific angle (in degrees) to rotate by. If None, detects automatically.
|
2495
|
+
detection_resolution: DPI resolution used for detection if `angle` is None.
|
2496
|
+
**deskew_kwargs: Additional keyword arguments passed to `deskew.determine_skew`
|
2497
|
+
if automatic detection is performed.
|
2498
|
+
|
2499
|
+
Returns:
|
2500
|
+
A deskewed PIL.Image.Image object, or None if rendering/rotation fails.
|
2501
|
+
|
2502
|
+
Raises:
|
2503
|
+
ImportError: If the 'deskew' library is not installed.
|
2504
|
+
"""
|
2505
|
+
if not DESKEW_AVAILABLE:
|
2506
|
+
raise ImportError(
|
2507
|
+
"Deskew library not found. Install with: pip install natural-pdf[deskew]"
|
2508
|
+
)
|
2509
|
+
|
2510
|
+
# Determine the angle to use
|
2511
|
+
rotation_angle = angle
|
2512
|
+
if rotation_angle is None:
|
2513
|
+
# Detect angle (or use cached) if not explicitly provided
|
2514
|
+
rotation_angle = self.detect_skew_angle(
|
2515
|
+
resolution=detection_resolution, **deskew_kwargs
|
2516
|
+
)
|
2517
|
+
|
2518
|
+
logger.debug(
|
2519
|
+
f"Page {self.number}: Preparing to deskew (output resolution={resolution} DPI). Using angle: {rotation_angle}"
|
2520
|
+
)
|
2521
|
+
|
2522
|
+
try:
|
2523
|
+
# Render the original page at the desired output resolution
|
2524
|
+
img = self.to_image(resolution=resolution, include_highlights=False)
|
2525
|
+
if not img:
|
2526
|
+
logger.error(f"Page {self.number}: Failed to render image for deskewing.")
|
2527
|
+
return None
|
2528
|
+
|
2529
|
+
# Rotate if a significant angle was found/provided
|
2530
|
+
if rotation_angle is not None and abs(rotation_angle) > 0.05:
|
2531
|
+
logger.debug(f"Page {self.number}: Rotating by {rotation_angle:.2f} degrees.")
|
2532
|
+
# Determine fill color based on image mode
|
2533
|
+
fill = (255, 255, 255) if img.mode == "RGB" else 255 # White background
|
2534
|
+
# Rotate the image using PIL
|
2535
|
+
rotated_img = img.rotate(
|
2536
|
+
rotation_angle, # deskew provides angle, PIL rotates counter-clockwise
|
2537
|
+
resample=Image.Resampling.BILINEAR,
|
2538
|
+
expand=True, # Expand image to fit rotated content
|
2539
|
+
fillcolor=fill,
|
2540
|
+
)
|
2541
|
+
return rotated_img
|
2542
|
+
else:
|
2543
|
+
logger.debug(
|
2544
|
+
f"Page {self.number}: No significant rotation needed (angle={rotation_angle}). Returning original render."
|
2545
|
+
)
|
2546
|
+
return img # Return the original rendered image if no rotation needed
|
2547
|
+
|
2548
|
+
except Exception as e:
|
2549
|
+
logger.error(
|
2550
|
+
f"Page {self.number}: Error during deskewing image generation: {e}", exc_info=True
|
2551
|
+
)
|
2552
|
+
return None
|
2553
|
+
|
2554
|
+
# --- End Skew Detection and Correction --- #
|