natural-pdf 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/api/index.md +386 -0
- docs/assets/favicon.png +3 -0
- docs/assets/favicon.svg +3 -0
- docs/assets/javascripts/custom.js +17 -0
- docs/assets/logo.svg +3 -0
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +17 -0
- docs/assets/social-preview.svg +17 -0
- docs/assets/stylesheets/custom.css +65 -0
- docs/document-qa/index.ipynb +435 -0
- docs/document-qa/index.md +79 -0
- docs/element-selection/index.ipynb +915 -0
- docs/element-selection/index.md +229 -0
- docs/index.md +170 -0
- docs/installation/index.md +69 -0
- docs/interactive-widget/index.ipynb +962 -0
- docs/interactive-widget/index.md +12 -0
- docs/layout-analysis/index.ipynb +818 -0
- docs/layout-analysis/index.md +185 -0
- docs/ocr/index.md +209 -0
- docs/pdf-navigation/index.ipynb +314 -0
- docs/pdf-navigation/index.md +97 -0
- docs/regions/index.ipynb +816 -0
- docs/regions/index.md +294 -0
- docs/tables/index.ipynb +658 -0
- docs/tables/index.md +144 -0
- docs/text-analysis/index.ipynb +370 -0
- docs/text-analysis/index.md +105 -0
- docs/text-extraction/index.ipynb +1478 -0
- docs/text-extraction/index.md +292 -0
- docs/tutorials/01-loading-and-extraction.ipynb +1710 -0
- docs/tutorials/01-loading-and-extraction.md +95 -0
- docs/tutorials/02-finding-elements.ipynb +340 -0
- docs/tutorials/02-finding-elements.md +149 -0
- docs/tutorials/03-extracting-blocks.ipynb +147 -0
- docs/tutorials/03-extracting-blocks.md +48 -0
- docs/tutorials/04-table-extraction.ipynb +114 -0
- docs/tutorials/04-table-extraction.md +50 -0
- docs/tutorials/05-excluding-content.ipynb +270 -0
- docs/tutorials/05-excluding-content.md +109 -0
- docs/tutorials/06-document-qa.ipynb +332 -0
- docs/tutorials/06-document-qa.md +91 -0
- docs/tutorials/07-layout-analysis.ipynb +288 -0
- docs/tutorials/07-layout-analysis.md +66 -0
- docs/tutorials/07-working-with-regions.ipynb +413 -0
- docs/tutorials/07-working-with-regions.md +151 -0
- docs/tutorials/08-spatial-navigation.ipynb +508 -0
- docs/tutorials/08-spatial-navigation.md +190 -0
- docs/tutorials/09-section-extraction.ipynb +2434 -0
- docs/tutorials/09-section-extraction.md +256 -0
- docs/tutorials/10-form-field-extraction.ipynb +512 -0
- docs/tutorials/10-form-field-extraction.md +201 -0
- docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
- docs/tutorials/11-enhanced-table-processing.md +9 -0
- docs/tutorials/12-ocr-integration.ipynb +604 -0
- docs/tutorials/12-ocr-integration.md +175 -0
- docs/tutorials/13-semantic-search.ipynb +1328 -0
- docs/tutorials/13-semantic-search.md +77 -0
- docs/visual-debugging/index.ipynb +2970 -0
- docs/visual-debugging/index.md +157 -0
- docs/visual-debugging/region.png +0 -0
- natural_pdf/__init__.py +50 -33
- natural_pdf/analyzers/__init__.py +2 -1
- natural_pdf/analyzers/layout/base.py +32 -24
- natural_pdf/analyzers/layout/docling.py +131 -72
- natural_pdf/analyzers/layout/gemini.py +264 -0
- natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
- natural_pdf/analyzers/layout/layout_manager.py +125 -58
- natural_pdf/analyzers/layout/layout_options.py +43 -17
- natural_pdf/analyzers/layout/paddle.py +152 -95
- natural_pdf/analyzers/layout/surya.py +164 -92
- natural_pdf/analyzers/layout/tatr.py +149 -84
- natural_pdf/analyzers/layout/yolo.py +89 -45
- natural_pdf/analyzers/text_options.py +22 -15
- natural_pdf/analyzers/text_structure.py +131 -85
- natural_pdf/analyzers/utils.py +30 -23
- natural_pdf/collections/pdf_collection.py +146 -97
- natural_pdf/core/__init__.py +1 -1
- natural_pdf/core/element_manager.py +419 -337
- natural_pdf/core/highlighting_service.py +268 -196
- natural_pdf/core/page.py +1044 -521
- natural_pdf/core/pdf.py +516 -313
- natural_pdf/elements/__init__.py +1 -1
- natural_pdf/elements/base.py +307 -225
- natural_pdf/elements/collections.py +805 -543
- natural_pdf/elements/line.py +39 -36
- natural_pdf/elements/rect.py +32 -30
- natural_pdf/elements/region.py +889 -879
- natural_pdf/elements/text.py +127 -99
- natural_pdf/exporters/__init__.py +0 -1
- natural_pdf/exporters/searchable_pdf.py +261 -102
- natural_pdf/ocr/__init__.py +57 -35
- natural_pdf/ocr/engine.py +150 -46
- natural_pdf/ocr/engine_easyocr.py +146 -150
- natural_pdf/ocr/engine_paddle.py +118 -175
- natural_pdf/ocr/engine_surya.py +78 -141
- natural_pdf/ocr/ocr_factory.py +114 -0
- natural_pdf/ocr/ocr_manager.py +122 -124
- natural_pdf/ocr/ocr_options.py +16 -20
- natural_pdf/ocr/utils.py +98 -0
- natural_pdf/qa/__init__.py +1 -1
- natural_pdf/qa/document_qa.py +119 -111
- natural_pdf/search/__init__.py +37 -31
- natural_pdf/search/haystack_search_service.py +312 -189
- natural_pdf/search/haystack_utils.py +186 -122
- natural_pdf/search/search_options.py +25 -14
- natural_pdf/search/search_service_protocol.py +12 -6
- natural_pdf/search/searchable_mixin.py +261 -176
- natural_pdf/selectors/__init__.py +2 -1
- natural_pdf/selectors/parser.py +159 -316
- natural_pdf/templates/__init__.py +1 -1
- natural_pdf/templates/spa/css/style.css +334 -0
- natural_pdf/templates/spa/index.html +31 -0
- natural_pdf/templates/spa/js/app.js +472 -0
- natural_pdf/templates/spa/words.txt +235976 -0
- natural_pdf/utils/debug.py +32 -0
- natural_pdf/utils/highlighting.py +8 -2
- natural_pdf/utils/identifiers.py +29 -0
- natural_pdf/utils/packaging.py +418 -0
- natural_pdf/utils/reading_order.py +65 -63
- natural_pdf/utils/text_extraction.py +195 -0
- natural_pdf/utils/visualization.py +70 -61
- natural_pdf/widgets/__init__.py +2 -3
- natural_pdf/widgets/viewer.py +749 -718
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/METADATA +53 -17
- natural_pdf-0.1.6.dist-info/RECORD +141 -0
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/WHEEL +1 -1
- natural_pdf-0.1.6.dist-info/top_level.txt +4 -0
- notebooks/Examples.ipynb +1293 -0
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +543 -0
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- natural_pdf/templates/ocr_debug.html +0 -517
- natural_pdf-0.1.4.dist-info/RECORD +0 -61
- natural_pdf-0.1.4.dist-info/top_level.txt +0 -1
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,114 @@
|
|
1
|
+
import logging
|
2
|
+
import importlib.util
|
3
|
+
from typing import Dict, Any, Optional, Type, Union, List
|
4
|
+
|
5
|
+
from .engine import OCREngine
|
6
|
+
|
7
|
+
logger = logging.getLogger(__name__)
|
8
|
+
|
9
|
+
|
10
|
+
class OCRFactory:
    """Factory for creating and managing OCR engines with optional dependencies.

    Engine modules are imported lazily inside :meth:`create_engine`, so the
    engine modules are only loaded when an engine of that type is requested.
    """

    @staticmethod
    def _module_installed(module_name: str) -> bool:
        """Return True if ``module_name`` can be located without importing it.

        ``importlib.util.find_spec`` raises ``ImportError`` for some missing
        parents and ``ValueError`` when a module already in ``sys.modules``
        has ``__spec__`` set to ``None``; both are reported as "not installed".
        """
        try:
            return importlib.util.find_spec(module_name) is not None
        except (ImportError, ValueError):
            return False

    @staticmethod
    def create_engine(engine_type: str, **kwargs) -> "OCREngine":
        """Create and return an OCR engine instance.

        Args:
            engine_type: One of 'surya', 'easyocr', 'paddle'. Matching is
                case-insensitive.
            **kwargs: Arguments to pass to the engine constructor

        Returns:
            An initialized OCR engine

        Raises:
            ImportError: If the required dependencies aren't installed
            ValueError: If the engine_type is unknown
        """
        # Normalise the name so that e.g. "EasyOCR" selects the same engine as
        # "easyocr"; non-string values fall through to the ValueError below.
        engine_key = engine_type.lower() if isinstance(engine_type, str) else engine_type

        if engine_key == "surya":
            try:
                from .engine_surya import SuryaOCREngine

                return SuryaOCREngine(**kwargs)
            except ImportError as exc:
                # Chain the original error so the real cause stays visible.
                raise ImportError(
                    "Surya engine requires the 'surya' package. "
                    "Install with: pip install surya"
                ) from exc

        if engine_key == "easyocr":
            try:
                from .engine_easyocr import EasyOCREngine

                return EasyOCREngine(**kwargs)
            except ImportError as exc:
                raise ImportError(
                    "EasyOCR engine requires the 'easyocr' package. "
                    "Install with: pip install easyocr"
                ) from exc

        if engine_key == "paddle":
            try:
                from .engine_paddle import PaddleOCREngine

                return PaddleOCREngine(**kwargs)
            except ImportError as exc:
                raise ImportError(
                    "PaddleOCR engine requires 'paddleocr' and 'paddlepaddle'. "
                    "Install with: pip install paddleocr paddlepaddle"
                ) from exc

        raise ValueError(f"Unknown engine type: {engine_type}")

    @staticmethod
    def list_available_engines() -> Dict[str, bool]:
        """Returns a dictionary of engine names and their availability status.

        Availability is decided by whether the backing packages can be located
        with ``importlib.util.find_spec``; nothing is imported or instantiated.
        The result always contains the keys 'surya', 'easyocr' and 'paddle'.
        """
        installed = OCRFactory._module_installed

        # Paddle needs both the framework (looked up under either module name)
        # and the paddleocr package.
        has_paddle_framework = installed("paddle") or installed("paddlepaddle")

        return {
            "surya": installed("surya"),
            "easyocr": installed("easyocr"),
            "paddle": has_paddle_framework and installed("paddleocr"),
        }

    @staticmethod
    def get_recommended_engine(**kwargs) -> "OCREngine":
        """Returns the best available OCR engine based on what's installed.

        Engines are considered in order of preference: EasyOCR, Paddle, Surya.
        The first one reported as available is created and returned; errors
        raised while creating it propagate to the caller.

        Args:
            **kwargs: Arguments to pass to the engine constructor

        Returns:
            The best available OCR engine instance

        Raises:
            ImportError: If no engines are available
        """
        available = OCRFactory.list_available_engines()

        # (engine name, log message) pairs in order of recommendation.
        preference = (
            ("easyocr", "Using EasyOCR engine (recommended)"),
            ("paddle", "Using PaddleOCR engine"),
            ("surya", "Using Surya OCR engine"),
        )
        for engine_name, log_message in preference:
            if available.get(engine_name, False):
                logger.info(log_message)
                return OCRFactory.create_engine(engine_name, **kwargs)

        # If we get here, no engines are available
        raise ImportError(
            "No OCR engines available. Please install at least one of: \n"
            "- EasyOCR (recommended): pip install easyocr\n"
            "- PaddleOCR: pip install paddleocr paddlepaddle\n"
            "- Surya OCR: pip install surya"
        )
|
natural_pdf/ocr/ocr_manager.py
CHANGED
@@ -1,191 +1,189 @@
|
|
1
1
|
# ocr_manager.py
|
2
|
+
import copy # For deep copying options
|
2
3
|
import logging
|
3
|
-
from typing import Dict, List,
|
4
|
+
from typing import Any, Dict, List, Optional, Type, Union
|
5
|
+
|
4
6
|
from PIL import Image
|
5
|
-
import copy # For deep copying options
|
6
7
|
|
7
8
|
# Import engine classes and options
|
8
9
|
from .engine import OCREngine
|
9
10
|
from .engine_easyocr import EasyOCREngine
|
10
11
|
from .engine_paddle import PaddleOCREngine
|
11
|
-
from .engine_surya import SuryaOCREngine
|
12
|
-
from .ocr_options import
|
13
|
-
|
14
|
-
)
|
12
|
+
from .engine_surya import SuryaOCREngine
|
13
|
+
from .ocr_options import OCROptions
|
14
|
+
from .ocr_options import BaseOCROptions, EasyOCROptions, PaddleOCROptions, SuryaOCROptions
|
15
15
|
|
16
16
|
logger = logging.getLogger(__name__)
|
17
17
|
|
18
|
+
|
18
19
|
class OCRManager:
|
19
20
|
"""Manages OCR engine selection, configuration, and execution."""
|
20
21
|
|
21
22
|
# Registry mapping engine names to classes and default options
|
22
23
|
ENGINE_REGISTRY: Dict[str, Dict[str, Any]] = {
|
23
|
-
|
24
|
-
|
25
|
-
|
24
|
+
"easyocr": {"class": EasyOCREngine, "options_class": EasyOCROptions},
|
25
|
+
"paddle": {"class": PaddleOCREngine, "options_class": PaddleOCROptions},
|
26
|
+
"surya": {"class": SuryaOCREngine, "options_class": SuryaOCROptions}, # <-- Add Surya
|
26
27
|
# Add other engines here
|
27
28
|
}
|
28
29
|
|
29
|
-
# Define the limited set of kwargs allowed for the simple apply_ocr call
|
30
|
-
SIMPLE_MODE_ALLOWED_KWARGS = {
|
31
|
-
'engine', 'languages', 'min_confidence', 'device'
|
32
|
-
# Add image pre-processing args like 'resolution', 'width' if handled here
|
33
|
-
}
|
34
|
-
|
35
30
|
def __init__(self):
|
36
31
|
"""Initializes the OCR Manager."""
|
37
|
-
self._engine_instances: Dict[str, OCREngine] = {}
|
32
|
+
self._engine_instances: Dict[str, OCREngine] = {} # Cache for engine instances
|
38
33
|
logger.info("OCRManager initialized.")
|
39
34
|
|
40
35
|
def _get_engine_instance(self, engine_name: str) -> OCREngine:
|
41
36
|
"""Retrieves or creates an instance of the specified OCR engine."""
|
42
37
|
engine_name = engine_name.lower()
|
43
38
|
if engine_name not in self.ENGINE_REGISTRY:
|
44
|
-
raise ValueError(
|
39
|
+
raise ValueError(
|
40
|
+
f"Unknown OCR engine: '{engine_name}'. Available: {list(self.ENGINE_REGISTRY.keys())}"
|
41
|
+
)
|
45
42
|
|
46
|
-
# Surya engine might manage its own predictor state, consider if caching instance is always right
|
47
|
-
# For now, we cache the engine instance itself.
|
48
43
|
if engine_name not in self._engine_instances:
|
49
44
|
logger.info(f"Creating instance of engine: {engine_name}")
|
50
|
-
engine_class = self.ENGINE_REGISTRY[engine_name][
|
51
|
-
engine_instance = engine_class()
|
45
|
+
engine_class = self.ENGINE_REGISTRY[engine_name]["class"]
|
46
|
+
engine_instance = engine_class() # Instantiate first
|
52
47
|
if not engine_instance.is_available():
|
53
|
-
|
54
|
-
|
55
|
-
|
48
|
+
# Check availability before storing
|
49
|
+
# Construct helpful error message with install hint
|
50
|
+
install_hint = f"pip install 'natural-pdf[{engine_name}]'"
|
51
|
+
raise RuntimeError(
|
52
|
+
f"Engine '{engine_name}' is not available. Please install the required dependencies: {install_hint}"
|
53
|
+
)
|
54
|
+
self._engine_instances[engine_name] = engine_instance # Store if available
|
56
55
|
|
57
56
|
return self._engine_instances[engine_name]
|
58
57
|
|
59
58
|
def apply_ocr(
|
60
59
|
self,
|
61
|
-
images: Union[Image.Image, List[Image.Image]],
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
60
|
+
images: Union[Image.Image, List[Image.Image]],
|
61
|
+
# --- Explicit Common Parameters ---
|
62
|
+
engine: Optional[str] = None,
|
63
|
+
languages: Optional[List[str]] = None,
|
64
|
+
min_confidence: Optional[float] = None,
|
65
|
+
device: Optional[str] = None,
|
66
|
+
detect_only: bool = False,
|
67
|
+
# --- Engine-Specific Options ---
|
68
|
+
options: Optional[Any] = None, # e.g. EasyOCROptions(), PaddleOCROptions()
|
69
|
+
) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]:
|
66
70
|
"""
|
67
|
-
Applies OCR to a single image or a batch of images
|
68
|
-
keyword arguments or an options object.
|
71
|
+
Applies OCR to a single image or a batch of images.
|
69
72
|
|
70
73
|
Args:
|
71
74
|
images: A single PIL Image or a list of PIL Images to process.
|
72
|
-
engine: Name of the engine
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
75
|
+
engine: Name of the engine (e.g., 'easyocr', 'paddle', 'surya').
|
76
|
+
Defaults to 'easyocr' if not specified.
|
77
|
+
languages: List of language codes (e.g., ['en', 'fr'], ['en', 'german']).
|
78
|
+
**Passed directly to the engine.** Must be codes understood
|
79
|
+
by the specific engine. No mapping is performed by the manager.
|
80
|
+
min_confidence: Minimum confidence threshold (0.0-1.0).
|
81
|
+
Passed directly to the engine.
|
82
|
+
device: Device string (e.g., 'cpu', 'cuda').
|
83
|
+
Passed directly to the engine.
|
84
|
+
detect_only: If True, only detect text regions, do not perform OCR.
|
85
|
+
options: An engine-specific options object (e.g., EasyOCROptions) or dict
|
86
|
+
containing additional parameters specific to the chosen engine.
|
87
|
+
Passed directly to the engine.
|
79
88
|
|
80
89
|
Returns:
|
81
90
|
If input is a single image: List of result dictionaries.
|
82
|
-
If input is a list of images: List of lists of result dictionaries
|
83
|
-
corresponding to each input image.
|
91
|
+
If input is a list of images: List of lists of result dictionaries.
|
84
92
|
|
85
93
|
Raises:
|
86
94
|
ValueError: If the engine name is invalid.
|
87
|
-
TypeError: If
|
88
|
-
|
89
|
-
RuntimeError: If the selected engine is not available.
|
95
|
+
TypeError: If input 'images' is not valid or options type is incompatible.
|
96
|
+
RuntimeError: If the selected engine is not available or processing fails.
|
90
97
|
"""
|
91
|
-
final_options: BaseOCROptions
|
92
|
-
selected_engine_name: str
|
93
|
-
|
94
98
|
# --- Validate input type ---
|
95
99
|
is_batch = isinstance(images, list)
|
96
100
|
if not is_batch and not isinstance(images, Image.Image):
|
97
|
-
|
98
|
-
|
99
|
-
#
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
selected_engine_name = engine.lower() if engine else 'easyocr' # Fallback default
|
122
|
-
logger.debug(f"Using simple mode with engine: '{selected_engine_name}' and kwargs: {kwargs}")
|
123
|
-
|
124
|
-
if selected_engine_name not in self.ENGINE_REGISTRY:
|
125
|
-
raise ValueError(f"Unknown OCR engine: '{selected_engine_name}'. Available: {list(self.ENGINE_REGISTRY.keys())}")
|
126
|
-
|
127
|
-
unexpected_kwargs = set(kwargs.keys()) - self.SIMPLE_MODE_ALLOWED_KWARGS
|
128
|
-
if unexpected_kwargs:
|
129
|
-
raise TypeError(f"Got unexpected keyword arguments in simple mode: {list(unexpected_kwargs)}. Use the 'options' parameter for detailed configuration.")
|
130
|
-
|
131
|
-
# Get the *correct* options class for the selected engine
|
132
|
-
options_class = self.ENGINE_REGISTRY[selected_engine_name]['options_class']
|
133
|
-
|
134
|
-
# Create options instance using provided simple kwargs or defaults
|
135
|
-
simple_args = {
|
136
|
-
'languages': kwargs.get('languages', ['en']),
|
137
|
-
'min_confidence': kwargs.get('min_confidence', 0.5),
|
138
|
-
'device': kwargs.get('device', 'cpu')
|
139
|
-
# Note: 'extra_args' isn't populated in simple mode
|
140
|
-
}
|
141
|
-
final_options = options_class(**simple_args)
|
142
|
-
logger.debug(f"Constructed options for simple mode: {final_options}")
|
143
|
-
|
144
|
-
|
145
|
-
# --- Get Engine Instance and Process ---
|
101
|
+
raise TypeError("Input 'images' must be a PIL Image or a list of PIL Images.")
|
102
|
+
|
103
|
+
# --- Determine Engine ---
|
104
|
+
selected_engine_name = (engine or "easyocr").lower()
|
105
|
+
if selected_engine_name not in self.ENGINE_REGISTRY:
|
106
|
+
raise ValueError(
|
107
|
+
f"Unknown OCR engine: '{selected_engine_name}'. Available: {list(self.ENGINE_REGISTRY.keys())}"
|
108
|
+
)
|
109
|
+
logger.debug(f"Selected engine: '{selected_engine_name}'")
|
110
|
+
|
111
|
+
# --- Prepare Options ---
|
112
|
+
final_options = copy.deepcopy(options) if options is not None else None
|
113
|
+
|
114
|
+
# Type check options object if provided
|
115
|
+
if final_options is not None:
|
116
|
+
options_class = self.ENGINE_REGISTRY[selected_engine_name].get("options_class", BaseOCROptions)
|
117
|
+
if not isinstance(final_options, options_class):
|
118
|
+
# Allow dicts to be passed directly too, assuming engine handles them
|
119
|
+
if not isinstance(final_options, dict):
|
120
|
+
raise TypeError(
|
121
|
+
f"Provided options type '{type(final_options).__name__}' is not compatible with engine '{selected_engine_name}'. Expected '{options_class.__name__}' or dict."
|
122
|
+
)
|
123
|
+
|
124
|
+
# --- Get Engine Instance and Process ---
|
146
125
|
try:
|
147
126
|
engine_instance = self._get_engine_instance(selected_engine_name)
|
148
127
|
processing_mode = "batch" if is_batch else "single image"
|
149
128
|
logger.info(f"Processing {processing_mode} with engine '{selected_engine_name}'...")
|
150
|
-
|
151
|
-
|
152
|
-
|
129
|
+
logger.debug(f" Engine Args: languages={languages}, min_confidence={min_confidence}, device={device}, options={final_options}")
|
130
|
+
|
131
|
+
# Call the engine's process_image, passing common args and options object
|
132
|
+
# **ASSUMPTION**: Engine process_image signatures are updated to accept these common args.
|
133
|
+
results = engine_instance.process_image(
|
134
|
+
images=images,
|
135
|
+
languages=languages,
|
136
|
+
min_confidence=min_confidence,
|
137
|
+
device=device,
|
138
|
+
detect_only=detect_only,
|
139
|
+
options=final_options
|
140
|
+
)
|
153
141
|
|
154
142
|
# Log result summary based on mode
|
155
143
|
if is_batch:
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
144
|
+
# Ensure results is a list before trying to get lengths
|
145
|
+
if isinstance(results, list):
|
146
|
+
num_results_per_image = [
|
147
|
+
len(res_list) if isinstance(res_list, list) else -1 for res_list in results
|
148
|
+
] # Handle potential errors returning non-lists
|
149
|
+
logger.info(
|
150
|
+
f"Processing complete. Found results per image: {num_results_per_image}"
|
151
|
+
)
|
152
|
+
else:
|
153
|
+
logger.error(
|
154
|
+
f"Processing complete but received unexpected result type for batch: {type(results)}"
|
155
|
+
)
|
162
156
|
else:
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
157
|
+
# Ensure results is a list
|
158
|
+
if isinstance(results, list):
|
159
|
+
logger.info(f"Processing complete. Found {len(results)} results.")
|
160
|
+
else:
|
161
|
+
logger.error(
|
162
|
+
f"Processing complete but received unexpected result type for single image: {type(results)}"
|
163
|
+
)
|
164
|
+
return results # Return type matches input type due to engine logic
|
169
165
|
|
170
166
|
except (ImportError, RuntimeError, ValueError, TypeError) as e:
|
171
|
-
|
172
|
-
|
167
|
+
logger.error(
|
168
|
+
f"OCR processing failed for engine '{selected_engine_name}': {e}", exc_info=True
|
169
|
+
)
|
170
|
+
raise # Re-raise expected errors
|
173
171
|
except Exception as e:
|
174
|
-
|
175
|
-
|
176
|
-
|
172
|
+
logger.error(f"An unexpected error occurred during OCR processing: {e}", exc_info=True)
|
173
|
+
raise # Re-raise unexpected errors
|
177
174
|
|
178
175
|
def get_available_engines(self) -> List[str]:
|
179
176
|
"""Returns a list of registered engine names that are currently available."""
|
180
177
|
available = []
|
181
178
|
for name, registry_entry in self.ENGINE_REGISTRY.items():
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
179
|
+
try:
|
180
|
+
# Temporarily instantiate to check availability without caching
|
181
|
+
engine_class = registry_entry["class"]
|
182
|
+
if engine_class().is_available():
|
183
|
+
available.append(name)
|
184
|
+
except Exception as e:
|
185
|
+
logger.debug(
|
186
|
+
f"Engine '{name}' check failed: {e}"
|
187
|
+
) # Log check failures at debug level
|
188
|
+
pass # Ignore engines that fail to instantiate or check
|
190
189
|
return available
|
191
|
-
|
natural_pdf/ocr/ocr_options.py
CHANGED
@@ -1,30 +1,29 @@
|
|
1
1
|
# ocr_options.py
|
2
2
|
import logging
|
3
3
|
from dataclasses import dataclass, field
|
4
|
-
from typing import
|
4
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
5
5
|
|
6
6
|
# Configure logging
|
7
7
|
# logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
8
8
|
# logger = logging.getLogger(__name__)
|
9
9
|
# Assume logger is configured elsewhere or remove if not needed globally
|
10
10
|
|
11
|
+
|
11
12
|
# --- Base Options ---
|
12
13
|
@dataclass
|
13
14
|
class BaseOCROptions:
|
14
15
|
"""Base class for OCR engine options."""
|
15
|
-
languages: List[str] = field(default_factory=lambda: ['en'])
|
16
|
-
min_confidence: float = 0.5
|
17
|
-
device: Optional[str] = 'cpu' # Suggestion, actual device usage depends on engine impl.
|
18
16
|
extra_args: Dict[str, Any] = field(default_factory=dict)
|
19
17
|
|
18
|
+
|
20
19
|
# --- EasyOCR Specific Options ---
|
21
20
|
@dataclass
|
22
21
|
class EasyOCROptions(BaseOCROptions):
|
23
22
|
"""Specific options for the EasyOCR engine."""
|
24
23
|
model_storage_directory: Optional[str] = None
|
25
24
|
user_network_directory: Optional[str] = None
|
26
|
-
recog_network: str =
|
27
|
-
detect_network: str =
|
25
|
+
recog_network: str = "english_g2"
|
26
|
+
detect_network: str = "craft"
|
28
27
|
download_enabled: bool = True
|
29
28
|
detector: bool = True
|
30
29
|
recognizer: bool = True
|
@@ -32,7 +31,7 @@ class EasyOCROptions(BaseOCROptions):
|
|
32
31
|
quantize: bool = True
|
33
32
|
cudnn_benchmark: bool = False
|
34
33
|
detail: int = 1
|
35
|
-
decoder: str =
|
34
|
+
decoder: str = "greedy"
|
36
35
|
beamWidth: int = 5
|
37
36
|
batch_size: int = 1
|
38
37
|
workers: int = 0
|
@@ -55,7 +54,7 @@ class EasyOCROptions(BaseOCROptions):
|
|
55
54
|
y_ths: float = 0.5
|
56
55
|
x_ths: float = 1.0
|
57
56
|
add_margin: float = 0.1
|
58
|
-
output_format: str =
|
57
|
+
output_format: str = "standard"
|
59
58
|
|
60
59
|
# def __post_init__(self):
|
61
60
|
# logger.debug(f"Initialized EasyOCROptions: {self}")
|
@@ -71,7 +70,7 @@ class PaddleOCROptions(BaseOCROptions):
|
|
71
70
|
ir_optim: bool = True
|
72
71
|
use_tensorrt: bool = False
|
73
72
|
min_subgraph_size: int = 15
|
74
|
-
precision: str =
|
73
|
+
precision: str = "fp32"
|
75
74
|
enable_mkldnn: bool = False
|
76
75
|
cpu_threads: int = 10
|
77
76
|
use_fp16: bool = False
|
@@ -90,25 +89,22 @@ class PaddleOCROptions(BaseOCROptions):
|
|
90
89
|
cls: Optional[bool] = None
|
91
90
|
|
92
91
|
def __post_init__(self):
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
92
|
+
pass
|
93
|
+
# if self.use_gpu is None:
|
94
|
+
# if self.device and "cuda" in self.device.lower():
|
95
|
+
# self.use_gpu = True
|
96
|
+
# else:
|
97
|
+
# self.use_gpu = False
|
98
|
+
# # logger.debug(f"Initialized PaddleOCROptions: {self}")
|
99
|
+
|
99
100
|
|
100
101
|
# --- Surya Specific Options ---
|
101
102
|
@dataclass
|
102
103
|
class SuryaOCROptions(BaseOCROptions):
|
103
104
|
"""Specific options for the Surya OCR engine."""
|
104
105
|
# Currently, Surya example shows languages passed at prediction time.
|
105
|
-
# Add fields here if Surya's RecognitionPredictor or DetectionPredictor
|
106
|
-
# constructors accept relevant arguments (e.g., model paths, device settings).
|
107
|
-
# For now, it primarily uses the base options like 'languages' and 'min_confidence'.
|
108
|
-
# Configuration like batch sizes are often set via environment variables for Surya.
|
109
106
|
pass
|
110
107
|
|
111
108
|
|
112
109
|
# --- Union type for type hinting ---
|
113
110
|
OCROptions = Union[EasyOCROptions, PaddleOCROptions, SuryaOCROptions, BaseOCROptions]
|
114
|
-
|
natural_pdf/ocr/utils.py
ADDED
@@ -0,0 +1,98 @@
|
|
1
|
+
import io
|
2
|
+
import base64
|
3
|
+
import logging
|
4
|
+
from typing import TYPE_CHECKING, Callable, Iterable, Optional, Any
|
5
|
+
from natural_pdf.elements.text import TextElement
|
6
|
+
from tqdm.auto import tqdm
|
7
|
+
|
8
|
+
if TYPE_CHECKING:
|
9
|
+
from natural_pdf.elements.base import Element
|
10
|
+
|
11
|
+
logger = logging.getLogger(__name__)
|
12
|
+
|
13
|
+
def _apply_ocr_correction_to_elements(
    elements: Iterable["Element"],
    correction_callback: Callable[[Any], Optional[str]],
) -> None:
    """
    Run a correction callback over OCR-derived elements, updating them in place.

    Elements are wrapped in a tqdm progress bar. An element is processed only
    if its ``source`` attribute is a string starting with ``'ocr'`` and it has
    a ``text`` attribute. The callback's return value replaces ``element.text``
    when it is not None and differs from the current text.

    Args:
        elements: An iterable of Element objects.
        correction_callback: A function accepting an element and returning
                             Optional[str] (new text or None).
    """
    checked_count = 0
    applied_count = 0

    progress = tqdm(elements, desc="Correcting OCR", unit="element")

    for item in progress:
        source = getattr(item, "source", None)

        # Skip anything that is not an OCR-sourced element carrying text.
        if not isinstance(source, str):
            continue
        if not source.startswith("ocr"):
            continue
        if not hasattr(item, "text"):
            continue

        checked_count += 1
        previous_text = item.text

        replacement = correction_callback(item)

        # None means "no correction"; identical text is not counted as applied.
        if replacement is None or replacement == previous_text:
            continue

        item.text = replacement
        applied_count += 1

    logger.info(f"OCR correction finished. Checked: {checked_count}, Applied: {applied_count}")
|
50
|
+
|
51
|
+
|
52
|
+
def direct_ocr_llm(element,
                   client,
                   model="",
                   resolution=150,
                   prompt="OCR this image. Return only the exact text from the image. Include misspellings, punctuation, etc.",
                   padding=2) -> str:
    """Convenience method to directly OCR a region of the page.

    The element is rendered to a PNG, base64-encoded, and sent to
    ``client.chat.completions.create`` as an ``image_url`` data URL together
    with ``prompt``. The content of the first returned choice is returned.

    Args:
        element: The element or region to OCR. A ``TextElement`` is first
            expanded by ``padding`` on every side; anything else is used as-is
            and must provide ``to_image`` (and ``extract_text`` for debug
            logging).
        client: Object exposing ``chat.completions.create(model=..., messages=...)``.
            NOTE(review): looks like an OpenAI-compatible client - confirm.
        model: Model name passed through to the client.
        resolution: Resolution passed to ``to_image``.
        prompt: User instruction sent alongside the image.
        padding: Amount by which a ``TextElement`` is expanded on each side.

    Returns:
        ``response.choices[0].message.content`` as returned by the client.
    """

    if isinstance(element, TextElement):
        region = element.expand(left=padding, right=padding, top=padding, bottom=padding)
    else:
        region = element

    region_img = region.to_image(resolution=resolution, include_highlights=False)

    # Context manager releases the in-memory buffer as soon as it is encoded.
    with io.BytesIO() as buffered:
        region_img.save(buffered, format="PNG")
        base64_image = base64.b64encode(buffered.getvalue()).decode('utf-8')

    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": "You are an expert OCR engineer. You will be given an image of a region of a page. You will return the exact text from the image."
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/png;base64,{base64_image}"
                        }
                    }
                ]
            }
        ]
    )

    corrected = response.choices[0].message.content

    # Only extract the region's existing text when the message will actually
    # be emitted; previously this ran on every call regardless of log level.
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug("Corrected %s to %s", region.extract_text(), corrected)

    return corrected
|
natural_pdf/qa/__init__.py
CHANGED