natural-pdf 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +55 -0
- natural_pdf/analyzers/__init__.py +6 -0
- natural_pdf/analyzers/layout/__init__.py +1 -0
- natural_pdf/analyzers/layout/base.py +151 -0
- natural_pdf/analyzers/layout/docling.py +247 -0
- natural_pdf/analyzers/layout/layout_analyzer.py +166 -0
- natural_pdf/analyzers/layout/layout_manager.py +200 -0
- natural_pdf/analyzers/layout/layout_options.py +78 -0
- natural_pdf/analyzers/layout/paddle.py +240 -0
- natural_pdf/analyzers/layout/surya.py +151 -0
- natural_pdf/analyzers/layout/tatr.py +251 -0
- natural_pdf/analyzers/layout/yolo.py +165 -0
- natural_pdf/analyzers/text_options.py +60 -0
- natural_pdf/analyzers/text_structure.py +270 -0
- natural_pdf/analyzers/utils.py +57 -0
- natural_pdf/core/__init__.py +3 -0
- natural_pdf/core/element_manager.py +457 -0
- natural_pdf/core/highlighting_service.py +698 -0
- natural_pdf/core/page.py +1444 -0
- natural_pdf/core/pdf.py +653 -0
- natural_pdf/elements/__init__.py +3 -0
- natural_pdf/elements/base.py +761 -0
- natural_pdf/elements/collections.py +1345 -0
- natural_pdf/elements/line.py +140 -0
- natural_pdf/elements/rect.py +122 -0
- natural_pdf/elements/region.py +1793 -0
- natural_pdf/elements/text.py +304 -0
- natural_pdf/ocr/__init__.py +56 -0
- natural_pdf/ocr/engine.py +104 -0
- natural_pdf/ocr/engine_easyocr.py +179 -0
- natural_pdf/ocr/engine_paddle.py +204 -0
- natural_pdf/ocr/engine_surya.py +171 -0
- natural_pdf/ocr/ocr_manager.py +191 -0
- natural_pdf/ocr/ocr_options.py +114 -0
- natural_pdf/qa/__init__.py +3 -0
- natural_pdf/qa/document_qa.py +396 -0
- natural_pdf/selectors/__init__.py +4 -0
- natural_pdf/selectors/parser.py +354 -0
- natural_pdf/templates/__init__.py +1 -0
- natural_pdf/templates/ocr_debug.html +517 -0
- natural_pdf/utils/__init__.py +3 -0
- natural_pdf/utils/highlighting.py +12 -0
- natural_pdf/utils/reading_order.py +227 -0
- natural_pdf/utils/visualization.py +223 -0
- natural_pdf/widgets/__init__.py +4 -0
- natural_pdf/widgets/frontend/viewer.js +88 -0
- natural_pdf/widgets/viewer.py +765 -0
- natural_pdf-0.1.0.dist-info/METADATA +295 -0
- natural_pdf-0.1.0.dist-info/RECORD +52 -0
- natural_pdf-0.1.0.dist-info/WHEEL +5 -0
- natural_pdf-0.1.0.dist-info/licenses/LICENSE +21 -0
- natural_pdf-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,204 @@
|
|
1
|
+
# engine_paddle.py
|
2
|
+
import logging
|
3
|
+
import importlib.util
|
4
|
+
from typing import Dict, List, Any, Optional, Tuple, Union
|
5
|
+
import numpy as np
|
6
|
+
from PIL import Image
|
7
|
+
import inspect # Used for dynamic parameter passing
|
8
|
+
|
9
|
+
from .engine import OCREngine
|
10
|
+
from .ocr_options import PaddleOCROptions, BaseOCROptions
|
11
|
+
|
12
|
+
logger = logging.getLogger(__name__)
|
13
|
+
|
14
|
+
class PaddleOCREngine(OCREngine):
    """PaddleOCR engine implementation.

    The ``paddleocr`` package is imported lazily, reader objects are stored in
    ``self._reader_cache`` under a key derived from the options, and raw
    detections are converted into ``{'bbox', 'text', 'confidence', 'source'}``
    dictionaries.
    """

    # Maps caller-facing language codes to the ``lang`` value handed to
    # PaddleOCR. Codes missing from the map fall back to 'en' (see
    # ``_map_language``).
    LANGUAGE_MAP = {
        'en': 'en', 'zh': 'ch', 'zh-cn': 'ch', 'zh-tw': 'chinese_cht',
        'ja': 'japan', 'ko': 'korean', 'th': 'thai', 'fr': 'french',
        'de': 'german', 'ru': 'russian', 'ar': 'arabic', 'hi': 'hindi',
        # Persian ('fa') and Urdu ('ur') are written in Arabic script, so they
        # use the same model as 'ar' rather than the Cyrillic one.
        'vi': 'vietnam', 'fa': 'arabic', 'ur': 'arabic', 'rs': 'serbian',
        'oc': 'latin', 'rsc': 'cyrillic', 'bg': 'bulgarian', 'uk': 'cyrillic',
        'be': 'cyrillic', 'te': 'telugu', 'kn': 'kannada', 'ta': 'tamil',
        'latin': 'latin', 'cyrillic': 'cyrillic', 'devanagari': 'devanagari',
    }

    def __init__(self):
        super().__init__()
        # Holds the imported ``paddleocr`` module once it has been loaded.
        self._paddleocr = None

    def _lazy_import_paddleocr(self):
        """Import ``paddleocr`` on first use and return the module.

        Raises:
            ImportError: If ``is_available()`` is false or the import fails.
        """
        if self._paddleocr is None:
            if not self.is_available():
                raise ImportError("PaddleOCR or PaddlePaddle is not installed or available.")
            try:
                import paddle  # noqa: F401 -- imported only to confirm it loads
                import paddleocr
                self._paddleocr = paddleocr
                logger.info("PaddleOCR module imported successfully.")
            except ImportError as e:
                logger.error(f"Failed to import PaddleOCR/PaddlePaddle: {e}")
                raise
        return self._paddleocr

    def is_available(self) -> bool:
        """Check if PaddleOCR and paddlepaddle are installed."""
        paddle_installed = (
            importlib.util.find_spec("paddle") is not None
            or importlib.util.find_spec("paddlepaddle") is not None
        )
        paddleocr_installed = importlib.util.find_spec("paddleocr") is not None
        return paddle_installed and paddleocr_installed

    def _map_language(self, iso_lang: str) -> str:
        """Map a language code to its PaddleOCR name, defaulting to 'en'."""
        return self.LANGUAGE_MAP.get(iso_lang.lower(), 'en')

    def _get_cache_key(self, options: PaddleOCROptions) -> str:
        """Extend the base cache key with language, angle-cls and precision."""
        base_key = super()._get_cache_key(options)
        primary_lang = self._map_language(options.languages[0]) if options.languages else 'en'
        angle_cls_key = str(options.use_angle_cls)
        precision_key = options.precision
        return f"{base_key}_{primary_lang}_{angle_cls_key}_{precision_key}"

    def _get_reader(self, options: PaddleOCROptions):
        """Return a cached PaddleOCR reader for ``options`` or build a new one.

        Only the first entry of ``options.languages`` is used, because a reader
        is constructed with a single ``lang`` value.

        Raises:
            ImportError: If PaddleOCR cannot be imported.
            Exception: Whatever the PaddleOCR constructor raises (logged first).
        """
        cache_key = self._get_cache_key(options)
        if cache_key in self._reader_cache:
            logger.debug(f"Using cached PaddleOCR reader for key: {cache_key}")
            return self._reader_cache[cache_key]

        logger.info(f"Creating new PaddleOCR reader for key: {cache_key}")
        paddleocr = self._lazy_import_paddleocr()

        constructor_sig = inspect.signature(paddleocr.PaddleOCR.__init__)
        constructor_args = {}
        constructor_args['lang'] = self._map_language(options.languages[0]) if options.languages else 'en'

        # Forward every constructor parameter that has a same-named option
        # field or extra_args entry.
        # NOTE(review): matching is by parameter name only, so a constructor
        # that accepts its settings through a catch-all ``**kwargs`` receives
        # nothing but ``lang`` -- confirm against the installed PaddleOCR.
        for field_name in constructor_sig.parameters:
            if field_name in ['self', 'lang']:
                continue
            if field_name == 'use_gpu':
                constructor_args['use_gpu'] = options.use_gpu
                continue
            if hasattr(options, field_name):
                constructor_args[field_name] = getattr(options, field_name)
            elif field_name in options.extra_args:
                constructor_args[field_name] = options.extra_args[field_name]

        # 'device' is a generic option; GPU selection goes through 'use_gpu'.
        constructor_args.pop('device', None)
        logger.debug(f"PaddleOCR constructor args: {constructor_args}")

        try:
            show_log = constructor_args.get('show_log', False)
            ppocr_logger = logging.getLogger('ppocr')
            original_log_level = ppocr_logger.level
            if not show_log:
                ppocr_logger.setLevel(logging.ERROR)
            try:
                reader = paddleocr.PaddleOCR(**constructor_args)
            finally:
                # Restore the level even when construction fails, so a failed
                # attempt does not leave the 'ppocr' logger muted.
                if not show_log:
                    ppocr_logger.setLevel(original_log_level)

            self._reader_cache[cache_key] = reader
            logger.info("PaddleOCR reader created successfully.")
            return reader
        except Exception as e:
            logger.error(f"Failed to create PaddleOCR reader: {e}", exc_info=True)
            raise

    def _prepare_ocr_args(self, options: PaddleOCROptions) -> Dict[str, Any]:
        """Build the keyword arguments for ``reader.ocr`` (excluding the image)."""
        ocr_args = {
            # An explicit ``cls`` wins; otherwise follow ``use_angle_cls``.
            'cls': options.cls if options.cls is not None else options.use_angle_cls,
            'det': options.det,
            'rec': options.rec,
        }
        logger.debug(f"PaddleOCR ocr args (excluding image): {ocr_args}")
        return ocr_args

    def _standardize_results(self, raw_page_results: Optional[List[Any]], options: PaddleOCROptions) -> List[Dict[str, Any]]:
        """Convert one page of raw PaddleOCR output into result dictionaries.

        Each usable detection is ``[bbox, (text, confidence)]``. Entries with
        any other shape, a confidence below ``options.min_confidence`` or a
        bbox that ``_standardize_bbox`` rejects are skipped.
        """
        standardized_page = []
        if not raw_page_results:  # Handle None or empty list
            return standardized_page

        min_confidence = options.min_confidence
        for detection in raw_page_results:
            try:
                if not isinstance(detection, (list, tuple)) or len(detection) < 2:
                    continue
                bbox_raw = detection[0]
                text_confidence = detection[1]
                # Only a (text, confidence) tuple counts; the strict tuple
                # check keeps list-shaped entries (e.g. coordinate pairs) out.
                if not isinstance(text_confidence, tuple) or len(text_confidence) < 2:
                    continue

                text = str(text_confidence[0])
                confidence = float(text_confidence[1])
                if confidence < min_confidence:
                    continue

                bbox = self._standardize_bbox(bbox_raw)
                if bbox:
                    standardized_page.append({
                        'bbox': bbox, 'text': text, 'confidence': confidence, 'source': 'ocr'
                    })
                else:
                    logger.warning(f"Skipping result due to invalid bbox: {bbox_raw}")
            except (IndexError, ValueError, TypeError) as e:
                logger.warning(f"Skipping invalid detection format: {detection}. Error: {e}")
                continue
        return standardized_page

    def _pil_to_bgr(self, image: Image.Image) -> np.ndarray:
        """Convert a PIL image to a contiguous BGR numpy array."""
        if image.mode == 'BGR':  # Already BGR
            return np.array(image)
        img_array_rgb = np.array(image.convert('RGB'))
        # Reversing the channel axis yields a negative-stride view; copy it
        # into a contiguous array because some native consumers do not accept
        # negative strides.
        return np.ascontiguousarray(img_array_rgb[:, :, ::-1])

    def process_image(
        self,
        images: Union[Image.Image, List[Image.Image]],
        options: BaseOCROptions
    ) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]:
        """Run PaddleOCR on a single image or a list of images.

        Args:
            images: A PIL image, or a list of PIL images.
            options: Preferably a ``PaddleOCROptions``. Any other options
                object is converted using only its base fields.

        Returns:
            A list of result dicts for a single image, or one such list per
            image for a list input. An image whose processing fails yields an
            empty list (the error is logged).

        Raises:
            TypeError: If ``images`` is neither a PIL image nor a list.
        """
        if not isinstance(options, PaddleOCROptions):
            logger.warning("Received BaseOCROptions, expected PaddleOCROptions. Using defaults.")
            options = PaddleOCROptions(
                languages=options.languages,
                min_confidence=options.min_confidence,
                device=options.device,
                extra_args=options.extra_args
            )

        reader = self._get_reader(options)
        ocr_args = self._prepare_ocr_args(options)

        def process_one(img):
            """OCR one image; best-effort, returns [] on any failure."""
            try:
                img_array_bgr = self._pil_to_bgr(img)
                raw_results = reader.ocr(img_array_bgr, **ocr_args)

                # The reader wraps per-page results in an outer list; a single
                # image has exactly one page.
                page_results = []
                if raw_results and isinstance(raw_results, list) and len(raw_results) > 0:
                    page_results = raw_results[0]

                return self._standardize_results(page_results, options)
            except Exception as e:
                logger.error(f"Error processing image with PaddleOCR: {e}", exc_info=True)
                return []

        if isinstance(images, Image.Image):
            return process_one(images)
        elif isinstance(images, list):
            return [process_one(img) for img in images]
        else:
            raise TypeError("Input 'images' must be a PIL Image or a list of PIL Images.")
|
@@ -0,0 +1,171 @@
|
|
1
|
+
# engine_surya.py
|
2
|
+
import logging
|
3
|
+
import importlib.util
|
4
|
+
from typing import Dict, List, Any, Optional, Tuple, Union
|
5
|
+
import numpy as np
|
6
|
+
from PIL import Image
|
7
|
+
|
8
|
+
from .engine import OCREngine
|
9
|
+
from .ocr_options import SuryaOCROptions, BaseOCROptions
|
10
|
+
|
11
|
+
logger = logging.getLogger(__name__)
|
12
|
+
|
13
|
+
class SuryaOCREngine(OCREngine):
    """Surya OCR engine implementation.

    Predictors are created once per engine instance on first use and reused
    for every later call; the base class ``_reader_cache`` is not used here.
    If predictors ever become configurable per run, that caching would need
    rethinking.
    """

    def __init__(self):
        super().__init__()
        self._recognition_predictor = None
        self._detection_predictor = None
        # Predictor classes, bound when the surya modules are imported.
        self._surya_recognition = None
        self._surya_detection = None
        self._initialized = False

    def _lazy_load_predictors(self, options: SuryaOCROptions):
        """Import Surya and instantiate both predictors on first call.

        ``options`` is accepted for interface symmetry but is not consulted:
        the predictors are built without arguments.

        Raises:
            ImportError: If Surya is not installed or its modules fail to import.
            Exception: Whatever predictor construction raises (logged first).
        """
        if self._initialized:
            return

        if not self.is_available():
            raise ImportError("Surya OCR library is not installed or available.")

        try:
            from surya.recognition import RecognitionPredictor
            from surya.detection import DetectionPredictor
            self._surya_recognition = RecognitionPredictor
            self._surya_detection = DetectionPredictor
            logger.info("Surya modules imported successfully.")

            predictor_args = {}  # Assuming parameterless init based on example

            logger.info("Instantiating Surya DetectionPredictor...")
            self._detection_predictor = self._surya_detection(**predictor_args)
            logger.info("Instantiating Surya RecognitionPredictor...")
            self._recognition_predictor = self._surya_recognition(**predictor_args)

            self._initialized = True
            logger.info("Surya predictors initialized.")

        except ImportError as e:
            logger.error(f"Failed to import Surya modules: {e}")
            raise
        except Exception as e:
            logger.error(f"Failed to initialize Surya predictors: {e}", exc_info=True)
            raise

    def is_available(self) -> bool:
        """Check if the surya library is installed."""
        return importlib.util.find_spec("surya") is not None

    def _standardize_results(self, raw_ocr_result: Any, options: SuryaOCROptions) -> List[Dict[str, Any]]:
        """Convert one Surya result object into result dictionaries.

        ``raw_ocr_result`` must expose a ``text_lines`` list whose items carry
        ``text``, ``confidence``, ``bbox`` and ``polygon``. Lines below
        ``options.min_confidence`` are dropped. A line's ``bbox`` is used when
        ``_standardize_bbox`` accepts it, otherwise its ``polygon`` is tried.
        """
        standardized_page = []
        min_confidence = options.min_confidence

        if not hasattr(raw_ocr_result, 'text_lines') or not isinstance(raw_ocr_result.text_lines, list):
            logger.warning(f"Unexpected Surya result format: {type(raw_ocr_result)}. Skipping.")
            return standardized_page

        for line in raw_ocr_result.text_lines:
            try:
                text = line.text
                confidence = line.confidence
                bbox_raw = line.bbox

                if confidence < min_confidence:
                    continue

                # Prefer the bbox; fall back to the polygon if it is rejected.
                bbox = self._standardize_bbox(bbox_raw)
                if not bbox:
                    bbox = self._standardize_bbox(line.polygon)
                if not bbox:
                    logger.warning(f"Skipping Surya line due to invalid bbox/polygon: {line}")
                    continue

                standardized_page.append({
                    'bbox': bbox,
                    'text': text,
                    'confidence': confidence,
                    'source': 'ocr'
                })

            except (AttributeError, ValueError, TypeError) as e:
                logger.warning(f"Skipping invalid Surya TextLine format: {line}. Error: {e}")
                continue
        return standardized_page

    def process_image(
        self,
        images: Union[Image.Image, List[Image.Image]],
        options: BaseOCROptions
    ) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]:
        """Run Surya OCR on a single image or a list of images.

        Args:
            images: A PIL image, or a list of PIL images.
            options: Preferably a ``SuryaOCROptions``. Any other options
                object is converted using only its base fields.

        Returns:
            A list of result dicts for a single image, or one such list per
            image for a list input (so an empty list input yields ``[]``).
            If prediction fails or returns the wrong number of results, empty
            results of the matching shape are returned and the error is logged.

        Raises:
            ImportError: If Surya is not installed.
            RuntimeError: If the predictors are missing after initialization.
        """
        if not isinstance(options, SuryaOCROptions):
            logger.warning("Received BaseOCROptions, expected SuryaOCROptions. Using defaults.")
            options = SuryaOCROptions(
                languages=options.languages,
                min_confidence=options.min_confidence,
                device=options.device,
                extra_args=options.extra_args
            )

        self._lazy_load_predictors(options)
        if not self._recognition_predictor or not self._detection_predictor:
            raise RuntimeError("Surya predictors could not be initialized.")

        is_batch = isinstance(images, list)
        input_images: List[Image.Image] = images if is_batch else [images]
        # One language list per image is passed to the predictor.
        input_langs: List[List[str]] = [options.languages for _ in input_images]

        if not input_images:
            # Only reachable for an empty list input. Return one result per
            # input image -- i.e. none -- like every other return path.
            logger.warning("No images provided for Surya processing.")
            return []

        try:
            processing_mode = "batch" if is_batch else "single image"
            logger.info(f"Processing {processing_mode} ({len(input_images)} images) with Surya...")
            predictions = self._recognition_predictor(
                images=input_images,
                langs=input_langs,
                det_predictor=self._detection_predictor
            )
            logger.info(f"Surya prediction complete. Received {len(predictions)} results.")

            if len(predictions) != len(input_images):
                logger.error(f"Surya result count ({len(predictions)}) does not match input count ({len(input_images)}). Returning empty results.")
                return [[] for _ in input_images] if is_batch else []

            all_standardized_results = [self._standardize_results(res, options) for res in predictions]

            if is_batch:
                return all_standardized_results
            return all_standardized_results[0]

        except Exception as e:
            logger.error(f"Error during Surya OCR processing: {e}", exc_info=True)
            # Best-effort: return an empty structure matching the input type.
            return [[] for _ in input_images] if is_batch else []
|
171
|
+
|
@@ -0,0 +1,191 @@
|
|
1
|
+
# ocr_manager.py
|
2
|
+
import logging
|
3
|
+
from typing import Dict, List, Any, Optional, Union, Type
|
4
|
+
from PIL import Image
|
5
|
+
import copy # For deep copying options
|
6
|
+
|
7
|
+
# Import engine classes and options
|
8
|
+
from .engine import OCREngine
|
9
|
+
from .engine_easyocr import EasyOCREngine
|
10
|
+
from .engine_paddle import PaddleOCREngine
|
11
|
+
from .engine_surya import SuryaOCREngine # <-- Import Surya Engine
|
12
|
+
from .ocr_options import (
|
13
|
+
BaseOCROptions, EasyOCROptions, PaddleOCROptions, SuryaOCROptions, OCROptions # <-- Import Surya Options
|
14
|
+
)
|
15
|
+
|
16
|
+
logger = logging.getLogger(__name__)
|
17
|
+
|
18
|
+
class OCRManager:
    """Manages OCR engine selection, configuration, and execution."""

    # Registry mapping engine names to their engine class and options class.
    ENGINE_REGISTRY: Dict[str, Dict[str, Any]] = {
        'easyocr': {'class': EasyOCREngine, 'options_class': EasyOCROptions},
        'paddle': {'class': PaddleOCREngine, 'options_class': PaddleOCROptions},
        'surya': {'class': SuryaOCREngine, 'options_class': SuryaOCROptions},
        # Add other engines here
    }

    # The limited set of kwargs allowed for the simple apply_ocr call.
    SIMPLE_MODE_ALLOWED_KWARGS = {
        'engine', 'languages', 'min_confidence', 'device'
        # Add image pre-processing args like 'resolution', 'width' if handled here
    }

    def __init__(self):
        """Initializes the OCR Manager."""
        self._engine_instances: Dict[str, OCREngine] = {}  # Cache for engine instances
        logger.info("OCRManager initialized.")

    def _get_engine_instance(self, engine_name: str) -> OCREngine:
        """Return the cached engine for ``engine_name``, creating it if needed.

        Raises:
            ValueError: If ``engine_name`` is not in ``ENGINE_REGISTRY``.
            RuntimeError: If the engine reports that it is not available.
        """
        engine_name = engine_name.lower()
        if engine_name not in self.ENGINE_REGISTRY:
            raise ValueError(f"Unknown OCR engine: '{engine_name}'. Available: {list(self.ENGINE_REGISTRY.keys())}")

        if engine_name not in self._engine_instances:
            logger.info(f"Creating instance of engine: {engine_name}")
            engine_class = self.ENGINE_REGISTRY[engine_name]['class']
            engine_instance = engine_class()
            # Check availability before caching, so an unavailable engine is
            # re-checked on the next request.
            if not engine_instance.is_available():
                raise RuntimeError(f"Engine '{engine_name}' is not available. Please check dependencies.")
            self._engine_instances[engine_name] = engine_instance

        return self._engine_instances[engine_name]

    def apply_ocr(
        self,
        images: Union[Image.Image, List[Image.Image]],
        engine: Optional[str] = 'easyocr',
        options: Optional[OCROptions] = None,
        **kwargs
    ) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]:
        """
        Applies OCR to a single image or a batch of images using either simple
        keyword arguments or an options object.

        Args:
            images: A single PIL Image or a list of PIL Images to process.
            engine: Name of the engine to use (e.g., 'easyocr', 'paddle', 'surya').
                    Ignored if 'options' object is provided. Defaults to 'easyocr'.
            options: An instance of EasyOCROptions, PaddleOCROptions, or SuryaOCROptions
                     for detailed configuration. If provided, simple kwargs (languages, etc.)
                     and the 'engine' arg are ignored.
            **kwargs: For simple mode, accepts: 'languages', 'min_confidence', 'device'.
                      'languages' and 'min_confidence' fall back to their defaults
                      (['en'] and 0.5) when omitted or passed as None; 'device'
                      defaults to 'cpu' when omitted.
                      Other kwargs will raise a TypeError unless 'options' is provided.

        Returns:
            If input is a single image: List of result dictionaries.
            If input is a list of images: List of lists of result dictionaries,
                                          corresponding to each input image.

        Raises:
            ValueError: If the engine name is invalid.
            TypeError: If unexpected keyword arguments are provided in simple mode,
                       if 'options' does not match any registered options class,
                       or if input 'images' is not a PIL Image or list of PIL Images.
            RuntimeError: If the selected engine is not available.
        """
        final_options: BaseOCROptions
        selected_engine_name: str

        # --- Validate input type ---
        # Items inside a list are not checked here; engines handle them.
        is_batch = isinstance(images, list)
        if not is_batch and not isinstance(images, Image.Image):
            raise TypeError("Input 'images' must be a PIL Image or a list of PIL Images.")

        # --- Determine Options and Engine ---
        if options is not None:
            # Advanced mode: the engine is inferred from the options type.
            logger.debug(f"Using advanced mode with options object: {type(options).__name__}")
            final_options = copy.deepcopy(options)  # Prevent modification of original
            found_engine = False
            for name, registry_entry in self.ENGINE_REGISTRY.items():
                if isinstance(options, registry_entry['options_class']):
                    selected_engine_name = name
                    found_engine = True
                    break
            if not found_engine:
                raise TypeError(f"Provided options object type '{type(options).__name__}' does not match any registered engine options.")
            if kwargs:
                logger.warning(f"Keyword arguments {list(kwargs.keys())} were provided alongside 'options' and will be ignored.")
        else:
            # Simple mode: build an options object from the allowed kwargs.
            selected_engine_name = engine.lower() if engine else 'easyocr'
            logger.debug(f"Using simple mode with engine: '{selected_engine_name}' and kwargs: {kwargs}")

            if selected_engine_name not in self.ENGINE_REGISTRY:
                raise ValueError(f"Unknown OCR engine: '{selected_engine_name}'. Available: {list(self.ENGINE_REGISTRY.keys())}")

            unexpected_kwargs = set(kwargs.keys()) - self.SIMPLE_MODE_ALLOWED_KWARGS
            if unexpected_kwargs:
                raise TypeError(f"Got unexpected keyword arguments in simple mode: {list(unexpected_kwargs)}. Use the 'options' parameter for detailed configuration.")

            options_class = self.ENGINE_REGISTRY[selected_engine_name]['options_class']

            # Callers that forward their own optional parameters may pass an
            # explicit None. For these two non-Optional fields None means
            # "use the default"; a None threshold would otherwise make every
            # confidence comparison in the engines fail.
            languages = kwargs.get('languages')
            min_confidence = kwargs.get('min_confidence')
            simple_args = {
                'languages': ['en'] if languages is None else languages,
                'min_confidence': 0.5 if min_confidence is None else min_confidence,
                # 'device' is Optional on the options classes, so an explicit
                # None is passed through unchanged.
                'device': kwargs.get('device', 'cpu'),
                # Note: 'extra_args' isn't populated in simple mode
            }
            final_options = options_class(**simple_args)
            logger.debug(f"Constructed options for simple mode: {final_options}")

        # --- Get Engine Instance and Process ---
        try:
            engine_instance = self._get_engine_instance(selected_engine_name)
            processing_mode = "batch" if is_batch else "single image"
            logger.info(f"Processing {processing_mode} with engine '{selected_engine_name}'...")

            results = engine_instance.process_image(images, final_options)

            # Log a result summary; unexpected result types are reported but
            # still returned to the caller unchanged.
            if is_batch:
                if isinstance(results, list):
                    # -1 marks an entry that is not a list.
                    num_results_per_image = [len(res_list) if isinstance(res_list, list) else -1 for res_list in results]
                    logger.info(f"Processing complete. Found results per image: {num_results_per_image}")
                else:
                    logger.error(f"Processing complete but received unexpected result type for batch: {type(results)}")
            else:
                if isinstance(results, list):
                    logger.info(f"Processing complete. Found {len(results)} results.")
                else:
                    logger.error(f"Processing complete but received unexpected result type for single image: {type(results)}")
            return results

        except (ImportError, RuntimeError, ValueError, TypeError) as e:
            logger.error(f"OCR processing failed for engine '{selected_engine_name}': {e}", exc_info=True)
            raise  # Re-raise expected errors
        except Exception as e:
            logger.error(f"An unexpected error occurred during OCR processing: {e}", exc_info=True)
            raise  # Re-raise unexpected errors

    def get_available_engines(self) -> List[str]:
        """Returns a list of registered engine names that are currently available."""
        available = []
        for name, registry_entry in self.ENGINE_REGISTRY.items():
            try:
                # Instantiate a throwaway engine so the check does not touch
                # the instance cache.
                engine_class = registry_entry['class']
                if engine_class().is_available():
                    available.append(name)
            except Exception as e:
                # An engine that cannot be instantiated or checked counts as
                # unavailable.
                logger.debug(f"Engine '{name}' check failed: {e}")
        return available
|
191
|
+
|
@@ -0,0 +1,114 @@
|
|
1
|
+
# ocr_options.py
|
2
|
+
import logging
|
3
|
+
from dataclasses import dataclass, field
|
4
|
+
from typing import List, Optional, Dict, Any, Tuple, Union
|
5
|
+
|
6
|
+
# Configure logging
|
7
|
+
# logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
8
|
+
# logger = logging.getLogger(__name__)
|
9
|
+
# Assume logger is configured elsewhere or remove if not needed globally
|
10
|
+
|
11
|
+
# --- Base Options ---
|
12
|
+
@dataclass
class BaseOCROptions:
    """Base class for OCR engine options.

    ``languages`` is always stored as a list: a bare string such as ``'en'``
    is wrapped into ``['en']`` so code indexing ``languages[0]`` sees a whole
    language code rather than its first character.
    """
    languages: List[str] = field(default_factory=lambda: ['en'])
    min_confidence: float = 0.5
    device: Optional[str] = 'cpu'  # Suggestion, actual device usage depends on engine impl.
    extra_args: Dict[str, Any] = field(default_factory=dict)

    def __post_init__(self):
        # A str is itself a sequence of characters, so wrap it explicitly.
        if isinstance(self.languages, str):
            self.languages = [self.languages]


# --- EasyOCR Specific Options ---
@dataclass
class EasyOCROptions(BaseOCROptions):
    """Specific options for the EasyOCR engine.

    How each field is consumed is decided by the EasyOCR engine module, which
    is not part of this file.
    """
    model_storage_directory: Optional[str] = None
    user_network_directory: Optional[str] = None
    recog_network: str = 'english_g2'
    detect_network: str = 'craft'
    download_enabled: bool = True
    detector: bool = True
    recognizer: bool = True
    verbose: bool = True
    quantize: bool = True
    cudnn_benchmark: bool = False
    detail: int = 1
    decoder: str = 'greedy'
    beamWidth: int = 5
    batch_size: int = 1
    workers: int = 0
    allowlist: Optional[str] = None
    blocklist: Optional[str] = None
    paragraph: bool = False
    min_size: int = 10
    contrast_ths: float = 0.1
    adjust_contrast: float = 0.5
    filter_ths: float = 0.0
    text_threshold: float = 0.7
    low_text: float = 0.4
    link_threshold: float = 0.4
    canvas_size: int = 2560
    mag_ratio: float = 1.0
    slope_ths: float = 0.1
    ycenter_ths: float = 0.5
    height_ths: float = 0.5
    width_ths: float = 0.5
    y_ths: float = 0.5
    x_ths: float = 1.0
    add_margin: float = 0.1
    output_format: str = 'standard'


# --- PaddleOCR Specific Options ---
@dataclass
class PaddleOCROptions(BaseOCROptions):
    """Specific options for the PaddleOCR engine.

    ``use_gpu`` left as ``None`` is resolved from ``device`` on construction:
    it becomes True when the device string mentions 'cuda' or 'gpu'
    (case-insensitive) and False otherwise.
    """
    use_angle_cls: bool = True
    use_gpu: Optional[bool] = None  # None -> derived from 'device' in __post_init__
    gpu_mem: int = 500
    ir_optim: bool = True
    use_tensorrt: bool = False
    min_subgraph_size: int = 15
    precision: str = 'fp32'
    enable_mkldnn: bool = False
    cpu_threads: int = 10
    use_fp16: bool = False
    det_model_dir: Optional[str] = None
    rec_model_dir: Optional[str] = None
    cls_model_dir: Optional[str] = None
    det_limit_side_len: int = 960
    rec_batch_num: int = 6
    max_text_length: int = 25
    use_space_char: bool = True
    drop_score: float = 0.5
    show_log: bool = False
    use_onnx: bool = False
    det: bool = True
    rec: bool = True
    cls: Optional[bool] = None

    def __post_init__(self):
        super().__post_init__()
        if self.use_gpu is None:
            # Accept both the 'cuda...' spelling and 'gpu' / 'gpu:0'.
            device = (self.device or '').lower()
            self.use_gpu = 'cuda' in device or 'gpu' in device


# --- Surya Specific Options ---
@dataclass
class SuryaOCROptions(BaseOCROptions):
    """Specific options for the Surya OCR engine."""
    # No Surya-specific fields yet: only the base options are used.
    # Add fields here if Surya's RecognitionPredictor or DetectionPredictor
    # constructors accept relevant arguments (e.g., model paths, device settings).
    pass


# --- Union type for type hinting ---
OCROptions = Union[EasyOCROptions, PaddleOCROptions, SuryaOCROptions, BaseOCROptions]
|
114
|
+
|