natural-pdf 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. natural_pdf/__init__.py +55 -0
  2. natural_pdf/analyzers/__init__.py +6 -0
  3. natural_pdf/analyzers/layout/__init__.py +1 -0
  4. natural_pdf/analyzers/layout/base.py +151 -0
  5. natural_pdf/analyzers/layout/docling.py +247 -0
  6. natural_pdf/analyzers/layout/layout_analyzer.py +166 -0
  7. natural_pdf/analyzers/layout/layout_manager.py +200 -0
  8. natural_pdf/analyzers/layout/layout_options.py +78 -0
  9. natural_pdf/analyzers/layout/paddle.py +240 -0
  10. natural_pdf/analyzers/layout/surya.py +151 -0
  11. natural_pdf/analyzers/layout/tatr.py +251 -0
  12. natural_pdf/analyzers/layout/yolo.py +165 -0
  13. natural_pdf/analyzers/text_options.py +60 -0
  14. natural_pdf/analyzers/text_structure.py +270 -0
  15. natural_pdf/analyzers/utils.py +57 -0
  16. natural_pdf/core/__init__.py +3 -0
  17. natural_pdf/core/element_manager.py +457 -0
  18. natural_pdf/core/highlighting_service.py +698 -0
  19. natural_pdf/core/page.py +1444 -0
  20. natural_pdf/core/pdf.py +653 -0
  21. natural_pdf/elements/__init__.py +3 -0
  22. natural_pdf/elements/base.py +761 -0
  23. natural_pdf/elements/collections.py +1345 -0
  24. natural_pdf/elements/line.py +140 -0
  25. natural_pdf/elements/rect.py +122 -0
  26. natural_pdf/elements/region.py +1793 -0
  27. natural_pdf/elements/text.py +304 -0
  28. natural_pdf/ocr/__init__.py +56 -0
  29. natural_pdf/ocr/engine.py +104 -0
  30. natural_pdf/ocr/engine_easyocr.py +179 -0
  31. natural_pdf/ocr/engine_paddle.py +204 -0
  32. natural_pdf/ocr/engine_surya.py +171 -0
  33. natural_pdf/ocr/ocr_manager.py +191 -0
  34. natural_pdf/ocr/ocr_options.py +114 -0
  35. natural_pdf/qa/__init__.py +3 -0
  36. natural_pdf/qa/document_qa.py +396 -0
  37. natural_pdf/selectors/__init__.py +4 -0
  38. natural_pdf/selectors/parser.py +354 -0
  39. natural_pdf/templates/__init__.py +1 -0
  40. natural_pdf/templates/ocr_debug.html +517 -0
  41. natural_pdf/utils/__init__.py +3 -0
  42. natural_pdf/utils/highlighting.py +12 -0
  43. natural_pdf/utils/reading_order.py +227 -0
  44. natural_pdf/utils/visualization.py +223 -0
  45. natural_pdf/widgets/__init__.py +4 -0
  46. natural_pdf/widgets/frontend/viewer.js +88 -0
  47. natural_pdf/widgets/viewer.py +765 -0
  48. natural_pdf-0.1.0.dist-info/METADATA +295 -0
  49. natural_pdf-0.1.0.dist-info/RECORD +52 -0
  50. natural_pdf-0.1.0.dist-info/WHEEL +5 -0
  51. natural_pdf-0.1.0.dist-info/licenses/LICENSE +21 -0
  52. natural_pdf-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,204 @@
1
+ # ocr_engine_paddleocr.py
2
+ import logging
3
+ import importlib.util
4
+ from typing import Dict, List, Any, Optional, Tuple, Union
5
+ import numpy as np
6
+ from PIL import Image
7
+ import inspect # Used for dynamic parameter passing
8
+
9
+ from .engine import OCREngine
10
+ from .ocr_options import PaddleOCROptions, BaseOCROptions
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
class PaddleOCREngine(OCREngine):
    """OCR engine backed by PaddleOCR.

    The ``paddleocr`` package is imported lazily, and constructed readers are
    stored in ``self._reader_cache`` under a key derived from the options so
    that a given configuration is only built once.
    """

    # Caller-facing language code -> PaddleOCR ``lang`` value.
    # Persian ('fa') and Urdu ('ur') are written in Arabic script, so they map
    # to the 'arabic' recognition model rather than 'cyrillic'.
    LANGUAGE_MAP = {
        'en': 'en', 'zh': 'ch', 'zh-cn': 'ch', 'zh-tw': 'chinese_cht',
        'ja': 'japan', 'ko': 'korean', 'th': 'thai', 'fr': 'french',
        'de': 'german', 'ru': 'russian', 'ar': 'arabic', 'hi': 'hindi',
        'vi': 'vietnam', 'fa': 'arabic', 'ur': 'arabic', 'rs': 'serbian',
        'oc': 'latin', 'rsc': 'cyrillic', 'bg': 'bulgarian', 'uk': 'cyrillic',
        'be': 'cyrillic', 'te': 'telugu', 'kn': 'kannada', 'ta': 'tamil',
        'latin': 'latin', 'cyrillic': 'cyrillic', 'devanagari': 'devanagari',
    }

    # Option fields consumed by this wrapper or by ``reader.ocr()``; they are
    # never forwarded to the PaddleOCR constructor.
    _NON_CONSTRUCTOR_FIELDS = frozenset({
        'languages', 'min_confidence', 'device', 'extra_args', 'det', 'rec', 'cls',
    })

    def __init__(self):
        super().__init__()
        self._paddleocr = None  # the imported ``paddleocr`` module, once loaded

    def _lazy_import_paddleocr(self):
        """Import ``paddleocr`` on first use and return the module.

        Raises:
            ImportError: If ``is_available()`` is false or the import fails.
        """
        if self._paddleocr is None:
            if not self.is_available():
                raise ImportError("PaddleOCR or PaddlePaddle is not installed or available.")
            try:
                # Imported only to surface a broken PaddlePaddle install early.
                import paddle  # noqa: F401
                import paddleocr
                self._paddleocr = paddleocr
                logger.info("PaddleOCR module imported successfully.")
            except ImportError as e:
                logger.error(f"Failed to import PaddleOCR/PaddlePaddle: {e}")
                raise
        return self._paddleocr

    def is_available(self) -> bool:
        """Return True when both a Paddle runtime and ``paddleocr`` can be located."""
        paddle_installed = (
            importlib.util.find_spec("paddle") is not None
            or importlib.util.find_spec("paddlepaddle") is not None
        )
        paddleocr_installed = importlib.util.find_spec("paddleocr") is not None
        return paddle_installed and paddleocr_installed

    def _map_language(self, iso_lang: str) -> str:
        """Translate a language code via ``LANGUAGE_MAP``; unknown codes fall back to 'en'."""
        return self.LANGUAGE_MAP.get(iso_lang.lower(), 'en')

    def _get_cache_key(self, options: PaddleOCROptions) -> str:
        """Extend the base cache key with language, angle-classifier flag and precision."""
        base_key = super()._get_cache_key(options)
        primary_lang = self._map_language(options.languages[0]) if options.languages else 'en'
        angle_cls_key = str(options.use_angle_cls)
        precision_key = options.precision
        return f"{base_key}_{primary_lang}_{angle_cls_key}_{precision_key}"

    def _build_constructor_args(self, constructor, options: PaddleOCROptions) -> Dict[str, Any]:
        """Collect the keyword arguments to pass to the PaddleOCR constructor.

        Only the first entry of ``options.languages`` is used; it is mapped to
        a PaddleOCR ``lang`` value ('en' when no language is given).
        """
        constructor_args: Dict[str, Any] = {
            'lang': self._map_language(options.languages[0]) if options.languages else 'en'
        }

        parameters = inspect.signature(constructor).parameters
        variadic_kinds = (inspect.Parameter.VAR_POSITIONAL, inspect.Parameter.VAR_KEYWORD)
        named_params = [
            name for name, param in parameters.items()
            if name != 'self' and param.kind not in variadic_kinds
        ]
        takes_var_keyword = any(
            param.kind is inspect.Parameter.VAR_KEYWORD for param in parameters.values()
        )

        if takes_var_keyword and not named_params:
            # The constructor is declared as ``__init__(self, **kwargs)``, so
            # the signature names no parameters to filter against. Filtering
            # would drop every option (use_gpu, use_angle_cls, ...); forward
            # the engine-specific option fields instead.
            for field_name, value in vars(options).items():
                if field_name not in self._NON_CONSTRUCTOR_FIELDS:
                    constructor_args[field_name] = value
            # Explicit option fields (and the mapped 'lang') win over extras.
            for field_name, value in options.extra_args.items():
                constructor_args.setdefault(field_name, value)
        else:
            # Explicit signature: forward only what the constructor declares.
            for field_name in parameters:
                if field_name in ('self', 'lang'):
                    continue
                if field_name == 'use_gpu':
                    constructor_args['use_gpu'] = options.use_gpu
                    continue
                if hasattr(options, field_name):
                    constructor_args[field_name] = getattr(options, field_name)
                elif field_name in options.extra_args:
                    constructor_args[field_name] = options.extra_args[field_name]

        # 'device' is this package's own hint; GPU use is conveyed via use_gpu.
        constructor_args.pop('device', None)
        return constructor_args

    def _get_reader(self, options: PaddleOCROptions):
        """Return a cached PaddleOCR reader for *options*, creating it if needed.

        Raises:
            ImportError: If PaddleOCR cannot be imported.
            Exception: Whatever the PaddleOCR constructor raises (logged, then re-raised).
        """
        cache_key = self._get_cache_key(options)
        if cache_key in self._reader_cache:
            logger.debug(f"Using cached PaddleOCR reader for key: {cache_key}")
            return self._reader_cache[cache_key]

        logger.info(f"Creating new PaddleOCR reader for key: {cache_key}")
        paddleocr = self._lazy_import_paddleocr()

        constructor_args = self._build_constructor_args(paddleocr.PaddleOCR.__init__, options)
        logger.debug(f"PaddleOCR constructor args: {constructor_args}")

        show_log = constructor_args.get('show_log', False)
        ppocr_logger = logging.getLogger('ppocr')
        original_log_level = ppocr_logger.level
        if not show_log:
            ppocr_logger.setLevel(logging.ERROR)
        try:
            reader = paddleocr.PaddleOCR(**constructor_args)
        except Exception as e:
            logger.error(f"Failed to create PaddleOCR reader: {e}", exc_info=True)
            raise
        finally:
            # Restore the level even when construction fails, so a failed
            # attempt does not leave the 'ppocr' logger silenced.
            if not show_log:
                ppocr_logger.setLevel(original_log_level)

        self._reader_cache[cache_key] = reader
        logger.info("PaddleOCR reader created successfully.")
        return reader

    def _prepare_ocr_args(self, options: PaddleOCROptions) -> Dict[str, Any]:
        """Build the keyword arguments for ``reader.ocr()`` (the image is passed separately)."""
        ocr_args = {
            # An explicit ``cls`` wins; otherwise follow ``use_angle_cls``.
            'cls': options.cls if options.cls is not None else options.use_angle_cls,
            'det': options.det,
            'rec': options.rec,
        }
        logger.debug(f"PaddleOCR ocr args (excluding image): {ocr_args}")
        return ocr_args

    def _standardize_results(self, raw_page_results: Optional[List[Any]], options: PaddleOCROptions) -> List[Dict[str, Any]]:
        """Convert one page of raw PaddleOCR detections into standard result dicts.

        Each detection is expected to look like ``[bbox, (text, confidence)]``.
        Entries with a different shape, a confidence below
        ``options.min_confidence`` or a bbox that cannot be standardized are
        skipped.

        Returns:
            A list of dicts with keys 'bbox', 'text', 'confidence' and 'source'.
        """
        standardized_page: List[Dict[str, Any]] = []
        if not raw_page_results:  # None or empty list: nothing was detected
            return standardized_page

        min_confidence = options.min_confidence
        for detection in raw_page_results:
            try:
                if not isinstance(detection, (list, tuple)) or len(detection) < 2:
                    continue
                bbox_raw = detection[0]
                text_confidence = detection[1]
                # Only a (text, score) tuple is accepted here; anything else is skipped.
                if not isinstance(text_confidence, tuple) or len(text_confidence) < 2:
                    continue

                text = str(text_confidence[0])
                confidence = float(text_confidence[1])
                if confidence < min_confidence:
                    continue

                bbox = self._standardize_bbox(bbox_raw)
                if bbox:
                    standardized_page.append({
                        'bbox': bbox, 'text': text, 'confidence': confidence, 'source': 'ocr'
                    })
                else:
                    logger.warning(f"Skipping result due to invalid bbox: {bbox_raw}")
            except (IndexError, ValueError, TypeError) as e:
                logger.warning(f"Skipping invalid detection format: {detection}. Error: {e}")
                continue
        return standardized_page

    def _pil_to_bgr(self, image: Image.Image) -> np.ndarray:
        """Convert a PIL image to a contiguous BGR numpy array."""
        if image.mode == 'BGR':  # Already BGR
            return np.array(image)
        img_array_rgb = np.array(image.convert('RGB'))
        # Reversing the channel axis yields a negative-stride view; copy it
        # into a contiguous buffer so downstream native code gets a plain array.
        return np.ascontiguousarray(img_array_rgb[:, :, ::-1])

    def process_image(
        self,
        images: Union[Image.Image, List[Image.Image]],
        options: BaseOCROptions
    ) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]:
        """Run PaddleOCR on a single image or a list of images.

        Args:
            images: A PIL image, or a list of PIL images.
            options: Preferably a ``PaddleOCROptions``. Any other options
                object is converted using its base fields only.

        Returns:
            A list of result dicts for a single image, or a list of such lists
            (one per image) for a list input. An image whose processing fails
            contributes an empty list; the error is logged.

        Raises:
            TypeError: If *images* is neither a PIL image nor a list.
        """
        if not isinstance(options, PaddleOCROptions):
            logger.warning("Received BaseOCROptions, expected PaddleOCROptions. Using defaults.")
            options = PaddleOCROptions(
                languages=options.languages,
                min_confidence=options.min_confidence,
                device=options.device,
                extra_args=options.extra_args
            )

        reader = self._get_reader(options)
        ocr_args = self._prepare_ocr_args(options)

        def process_one(img):
            """OCR one image; best-effort, returning [] on failure."""
            try:
                img_array_bgr = self._pil_to_bgr(img)
                raw_results = reader.ocr(img_array_bgr, **ocr_args)

                page_results = []
                # Only the first entry of the returned list is used for this image.
                if raw_results and isinstance(raw_results, list) and len(raw_results) > 0:
                    page_results = raw_results[0]

                return self._standardize_results(page_results, options)
            except Exception as e:
                logger.error(f"Error processing image with PaddleOCR: {e}", exc_info=True)
                return []

        if isinstance(images, Image.Image):
            return process_one(images)
        elif isinstance(images, list):
            return [process_one(img) for img in images]
        else:
            raise TypeError("Input 'images' must be a PIL Image or a list of PIL Images.")
@@ -0,0 +1,171 @@
1
+ # ocr_engine_surya.py
2
+ import logging
3
+ import importlib.util
4
+ from typing import Dict, List, Any, Optional, Tuple, Union
5
+ import numpy as np
6
+ from PIL import Image
7
+
8
+ from .engine import OCREngine
9
+ from .ocr_options import SuryaOCROptions, BaseOCROptions
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
class SuryaOCREngine(OCREngine):
    """OCR engine backed by Surya.

    The detection and recognition predictors are created once, on first use,
    and kept on the instance. ``_reader_cache`` is not used by this class, so
    per-call options cannot reconfigure the predictors after initialization.
    """

    def __init__(self):
        super().__init__()
        self._recognition_predictor = None
        self._detection_predictor = None
        self._surya_recognition = None  # RecognitionPredictor class, once imported
        self._surya_detection = None    # DetectionPredictor class, once imported
        self._initialized = False

    def _lazy_load_predictors(self, options: SuryaOCROptions):
        """Import Surya and instantiate both predictors on first call.

        Subsequent calls return immediately. *options* is currently unused:
        the predictors are constructed without arguments.

        Raises:
            ImportError: If Surya is not installed or its modules fail to import.
            Exception: Whatever predictor construction raises (logged, then re-raised).
        """
        if self._initialized:
            return

        if not self.is_available():
            raise ImportError("Surya OCR library is not installed or available.")

        try:
            from surya.recognition import RecognitionPredictor
            from surya.detection import DetectionPredictor
            self._surya_recognition = RecognitionPredictor
            self._surya_detection = DetectionPredictor
            logger.info("Surya modules imported successfully.")

            # No constructor arguments are passed; add entries here if the
            # predictors should be configured from *options*.
            predictor_args = {}

            logger.info("Instantiating Surya DetectionPredictor...")
            self._detection_predictor = self._surya_detection(**predictor_args)
            logger.info("Instantiating Surya RecognitionPredictor...")
            self._recognition_predictor = self._surya_recognition(**predictor_args)

            self._initialized = True
            logger.info("Surya predictors initialized.")

        except ImportError as e:
            logger.error(f"Failed to import Surya modules: {e}")
            raise
        except Exception as e:
            logger.error(f"Failed to initialize Surya predictors: {e}", exc_info=True)
            raise

    def is_available(self) -> bool:
        """Return True when the ``surya`` package can be located."""
        return importlib.util.find_spec("surya") is not None

    def _standardize_results(self, raw_ocr_result: Any, options: SuryaOCROptions) -> List[Dict[str, Any]]:
        """Convert one Surya result object into standard result dicts.

        The result must expose a ``text_lines`` list; each line is read through
        its ``text``, ``confidence``, ``bbox`` and (as a fallback) ``polygon``
        attributes. Lines below ``options.min_confidence`` are dropped.

        Returns:
            A list of dicts with keys 'bbox', 'text', 'confidence' and 'source'.
        """
        standardized_page: List[Dict[str, Any]] = []
        min_confidence = options.min_confidence

        if not hasattr(raw_ocr_result, 'text_lines') or not isinstance(raw_ocr_result.text_lines, list):
            logger.warning(f"Unexpected Surya result format: {type(raw_ocr_result)}. Skipping.")
            return standardized_page

        for line in raw_ocr_result.text_lines:
            try:
                text = line.text
                confidence = line.confidence
                if confidence < min_confidence:
                    continue

                # Prefer the bbox; fall back to the polygon only when the
                # bbox cannot be standardized.
                bbox = self._standardize_bbox(line.bbox) or self._standardize_bbox(line.polygon)
                if bbox:
                    standardized_page.append({
                        'bbox': bbox,
                        'text': text,
                        'confidence': confidence,
                        'source': 'ocr'
                    })
                else:
                    logger.warning(f"Skipping Surya line due to invalid bbox/polygon: {line}")

            except (AttributeError, ValueError, TypeError) as e:
                logger.warning(f"Skipping invalid Surya TextLine format: {line}. Error: {e}")
                continue
        return standardized_page

    def process_image(
        self,
        images: Union[Image.Image, List[Image.Image]],
        options: BaseOCROptions
    ) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]:
        """Run Surya OCR on a single image or a list of images.

        Args:
            images: A PIL image, or a list of PIL images.
            options: Preferably a ``SuryaOCROptions``. Any other options object
                is converted using its base fields only.

        Returns:
            A list of result dicts for a single image, or a list of such lists
            (one per image) for a list input. An empty input list yields an
            empty list. If prediction fails or returns the wrong number of
            results, empty results of the matching shape are returned and the
            error is logged.

        Raises:
            ImportError: If Surya is not installed.
            RuntimeError: If the predictors are missing after initialization.
        """
        if not isinstance(options, SuryaOCROptions):
            logger.warning("Received BaseOCROptions, expected SuryaOCROptions. Using defaults.")
            options = SuryaOCROptions(
                languages=options.languages,
                min_confidence=options.min_confidence,
                device=options.device,
                extra_args=options.extra_args
            )

        self._lazy_load_predictors(options)
        # Compare against None explicitly: the predictor objects' truthiness
        # is not something this code controls.
        if self._recognition_predictor is None or self._detection_predictor is None:
            raise RuntimeError("Surya predictors could not be initialized.")

        is_batch = isinstance(images, list)
        input_images: List[Image.Image] = images if is_batch else [images]
        # One language list per image.
        input_langs: List[List[str]] = [options.languages for _ in input_images]

        if not input_images:
            # Only reachable for an empty batch: zero images -> zero results.
            logger.warning("No images provided for Surya processing.")
            return []

        try:
            processing_mode = "batch" if is_batch else "single image"
            logger.info(f"Processing {processing_mode} ({len(input_images)} images) with Surya...")
            predictions = self._recognition_predictor(
                images=input_images,
                langs=input_langs,
                det_predictor=self._detection_predictor
            )
            logger.info(f"Surya prediction complete. Received {len(predictions)} results.")

            if len(predictions) != len(input_images):
                logger.error(f"Surya result count ({len(predictions)}) does not match input count ({len(input_images)}). Returning empty results.")
                return [[] for _ in input_images] if is_batch else []

            all_standardized_results = [self._standardize_results(res, options) for res in predictions]
            return all_standardized_results if is_batch else all_standardized_results[0]

        except Exception as e:
            # Best effort: report the failure and hand back empty results
            # shaped like the input.
            logger.error(f"Error during Surya OCR processing: {e}", exc_info=True)
            return [[] for _ in input_images] if is_batch else []
171
+
@@ -0,0 +1,191 @@
1
+ # ocr_manager.py
2
+ import logging
3
+ from typing import Dict, List, Any, Optional, Union, Type
4
+ from PIL import Image
5
+ import copy # For deep copying options
6
+
7
+ # Import engine classes and options
8
+ from .engine import OCREngine
9
+ from .engine_easyocr import EasyOCREngine
10
+ from .engine_paddle import PaddleOCREngine
11
+ from .engine_surya import SuryaOCREngine # <-- Import Surya Engine
12
+ from .ocr_options import (
13
+ BaseOCROptions, EasyOCROptions, PaddleOCROptions, SuryaOCROptions, OCROptions # <-- Import Surya Options
14
+ )
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
class OCRManager:
    """Selects, configures and runs the registered OCR engines."""

    # Engine name -> engine class plus the options dataclass that configures it.
    ENGINE_REGISTRY: Dict[str, Dict[str, Any]] = {
        'easyocr': {'class': EasyOCREngine, 'options_class': EasyOCROptions},
        'paddle': {'class': PaddleOCREngine, 'options_class': PaddleOCROptions},
        'surya': {'class': SuryaOCREngine, 'options_class': SuryaOCROptions},
    }

    # Keyword arguments accepted by apply_ocr() when no options object is given.
    SIMPLE_MODE_ALLOWED_KWARGS = {
        'engine', 'languages', 'min_confidence', 'device'
    }

    def __init__(self):
        """Create a manager with an empty engine-instance cache."""
        self._engine_instances: Dict[str, OCREngine] = {}
        logger.info("OCRManager initialized.")

    def _get_engine_instance(self, engine_name: str) -> OCREngine:
        """Return the cached engine for *engine_name*, creating it on first request.

        Raises:
            ValueError: If the (lower-cased) name is not registered.
            RuntimeError: If a newly created engine reports it is unavailable;
                such an engine is not cached.
        """
        key = engine_name.lower()
        if key not in self.ENGINE_REGISTRY:
            raise ValueError(f"Unknown OCR engine: '{key}'. Available: {list(self.ENGINE_REGISTRY.keys())}")

        if key in self._engine_instances:
            return self._engine_instances[key]

        logger.info(f"Creating instance of engine: {key}")
        candidate = self.ENGINE_REGISTRY[key]['class']()
        if not candidate.is_available():
            raise RuntimeError(f"Engine '{key}' is not available. Please check dependencies.")
        self._engine_instances[key] = candidate
        return candidate

    def _resolve_from_options(self, options, kwargs):
        """Advanced mode: copy *options* and find the engine registered for its type."""
        logger.debug(f"Using advanced mode with options object: {type(options).__name__}")
        options_copy = copy.deepcopy(options)  # the caller's object is never mutated
        matched_name = next(
            (
                name
                for name, entry in self.ENGINE_REGISTRY.items()
                if isinstance(options, entry['options_class'])
            ),
            None,
        )
        if matched_name is None:
            raise TypeError(f"Provided options object type '{type(options).__name__}' does not match any registered engine options.")
        if kwargs:
            logger.warning(f"Keyword arguments {list(kwargs.keys())} were provided alongside 'options' and will be ignored.")
        return matched_name, options_copy

    def _resolve_from_kwargs(self, engine, kwargs):
        """Simple mode: validate the engine name and kwargs, then build its options."""
        engine_key = engine.lower() if engine else 'easyocr'
        logger.debug(f"Using simple mode with engine: '{engine_key}' and kwargs: {kwargs}")

        if engine_key not in self.ENGINE_REGISTRY:
            raise ValueError(f"Unknown OCR engine: '{engine_key}'. Available: {list(self.ENGINE_REGISTRY.keys())}")

        rejected = set(kwargs.keys()) - self.SIMPLE_MODE_ALLOWED_KWARGS
        if rejected:
            raise TypeError(f"Got unexpected keyword arguments in simple mode: {list(rejected)}. Use the 'options' parameter for detailed configuration.")

        build_options = self.ENGINE_REGISTRY[engine_key]['options_class']
        built = build_options(
            languages=kwargs.get('languages', ['en']),
            min_confidence=kwargs.get('min_confidence', 0.5),
            device=kwargs.get('device', 'cpu'),
        )
        logger.debug(f"Constructed options for simple mode: {built}")
        return engine_key, built

    @staticmethod
    def _log_result_summary(results, is_batch):
        """Log how many results came back, or an error if the shape is unexpected."""
        if is_batch:
            if not isinstance(results, list):
                logger.error(f"Processing complete but received unexpected result type for batch: {type(results)}")
                return
            # -1 marks an entry that is not itself a list.
            num_results_per_image = [
                len(res_list) if isinstance(res_list, list) else -1 for res_list in results
            ]
            logger.info(f"Processing complete. Found results per image: {num_results_per_image}")
            return

        if isinstance(results, list):
            logger.info(f"Processing complete. Found {len(results)} results.")
        else:
            logger.error(f"Processing complete but received unexpected result type for single image: {type(results)}")

    def apply_ocr(
        self,
        images: Union[Image.Image, List[Image.Image]],
        engine: Optional[str] = 'easyocr',
        options: Optional[OCROptions] = None,
        **kwargs
    ) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]:
        """
        Run OCR on one image or a batch, configured either by simple keyword
        arguments or by an options object.

        Args:
            images: A single PIL Image or a list of PIL Images.
            engine: Engine name ('easyocr', 'paddle', 'surya'). Used only when
                'options' is not given; a falsy value means 'easyocr'.
            options: An EasyOCROptions, PaddleOCROptions or SuryaOCROptions
                instance. When given, its type selects the engine and both
                'engine' and the simple kwargs are ignored (a warning is
                logged if kwargs were passed).
            **kwargs: Simple mode only: 'languages', 'min_confidence', 'device'.

        Returns:
            Whatever the engine's process_image returns: a list of result
            dicts for a single image, or a list of such lists for a batch.

        Raises:
            ValueError: If the engine name is not registered.
            TypeError: If 'images' is neither a PIL Image nor a list, if
                unexpected kwargs are passed in simple mode, or if 'options'
                matches no registered options class.
            RuntimeError: If the selected engine is not available.
        """
        final_options: BaseOCROptions
        selected_engine_name: str

        is_batch = isinstance(images, list)
        if not (is_batch or isinstance(images, Image.Image)):
            raise TypeError("Input 'images' must be a PIL Image or a list of PIL Images.")
        # List contents are not validated here; that is left to the engine.

        if options is None:
            selected_engine_name, final_options = self._resolve_from_kwargs(engine, kwargs)
        else:
            selected_engine_name, final_options = self._resolve_from_options(options, kwargs)

        try:
            engine_instance = self._get_engine_instance(selected_engine_name)
            processing_mode = "batch" if is_batch else "single image"
            logger.info(f"Processing {processing_mode} with engine '{selected_engine_name}'...")

            results = engine_instance.process_image(images, final_options)
            self._log_result_summary(results, is_batch)
            return results

        except Exception as e:
            # Every failure is logged and re-raised; only the message differs
            # between anticipated and unanticipated error types.
            if isinstance(e, (ImportError, RuntimeError, ValueError, TypeError)):
                logger.error(f"OCR processing failed for engine '{selected_engine_name}': {e}", exc_info=True)
            else:
                logger.error(f"An unexpected error occurred during OCR processing: {e}", exc_info=True)
            raise

    def get_available_engines(self) -> List[str]:
        """Return the registered engine names whose engines report they are available.

        A throwaway instance is created per engine; nothing is cached. Engines
        that fail to instantiate or to answer the check are left out.
        """
        usable: List[str] = []
        for name, registry_entry in self.ENGINE_REGISTRY.items():
            try:
                probe = registry_entry['class']()
                if probe.is_available():
                    usable.append(name)
            except Exception as e:
                logger.debug(f"Engine '{name}' check failed: {e}")
        return usable
191
+
@@ -0,0 +1,114 @@
1
+ # ocr_options.py
2
+ import logging
3
+ from dataclasses import dataclass, field
4
+ from typing import List, Optional, Dict, Any, Tuple, Union
5
+
6
+ # Configure logging
7
+ # logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
8
+ # logger = logging.getLogger(__name__)
9
+ # Assume logger is configured elsewhere or remove if not needed globally
10
+
11
+ # --- Base Options ---
12
+ @dataclass
13
+ class BaseOCROptions:
14
+ """Base class for OCR engine options."""
15
+ languages: List[str] = field(default_factory=lambda: ['en'])
16
+ min_confidence: float = 0.5
17
+ device: Optional[str] = 'cpu' # Suggestion, actual device usage depends on engine impl.
18
+ extra_args: Dict[str, Any] = field(default_factory=dict)
19
+
20
+ # --- EasyOCR Specific Options ---
21
+ @dataclass
22
+ class EasyOCROptions(BaseOCROptions):
23
+ """Specific options for the EasyOCR engine."""
24
+ model_storage_directory: Optional[str] = None
25
+ user_network_directory: Optional[str] = None
26
+ recog_network: str = 'english_g2'
27
+ detect_network: str = 'craft'
28
+ download_enabled: bool = True
29
+ detector: bool = True
30
+ recognizer: bool = True
31
+ verbose: bool = True
32
+ quantize: bool = True
33
+ cudnn_benchmark: bool = False
34
+ detail: int = 1
35
+ decoder: str = 'greedy'
36
+ beamWidth: int = 5
37
+ batch_size: int = 1
38
+ workers: int = 0
39
+ allowlist: Optional[str] = None
40
+ blocklist: Optional[str] = None
41
+ paragraph: bool = False
42
+ min_size: int = 10
43
+ contrast_ths: float = 0.1
44
+ adjust_contrast: float = 0.5
45
+ filter_ths: float = 0.0
46
+ text_threshold: float = 0.7
47
+ low_text: float = 0.4
48
+ link_threshold: float = 0.4
49
+ canvas_size: int = 2560
50
+ mag_ratio: float = 1.0
51
+ slope_ths: float = 0.1
52
+ ycenter_ths: float = 0.5
53
+ height_ths: float = 0.5
54
+ width_ths: float = 0.5
55
+ y_ths: float = 0.5
56
+ x_ths: float = 1.0
57
+ add_margin: float = 0.1
58
+ output_format: str = 'standard'
59
+
60
+ # def __post_init__(self):
61
+ # logger.debug(f"Initialized EasyOCROptions: {self}")
62
+
63
+
64
+ # --- PaddleOCR Specific Options ---
65
+ @dataclass
66
+ class PaddleOCROptions(BaseOCROptions):
67
+ """Specific options for the PaddleOCR engine."""
68
+ use_angle_cls: bool = True
69
+ use_gpu: Optional[bool] = None
70
+ gpu_mem: int = 500
71
+ ir_optim: bool = True
72
+ use_tensorrt: bool = False
73
+ min_subgraph_size: int = 15
74
+ precision: str = 'fp32'
75
+ enable_mkldnn: bool = False
76
+ cpu_threads: int = 10
77
+ use_fp16: bool = False
78
+ det_model_dir: Optional[str] = None
79
+ rec_model_dir: Optional[str] = None
80
+ cls_model_dir: Optional[str] = None
81
+ det_limit_side_len: int = 960
82
+ rec_batch_num: int = 6
83
+ max_text_length: int = 25
84
+ use_space_char: bool = True
85
+ drop_score: float = 0.5
86
+ show_log: bool = False
87
+ use_onnx: bool = False
88
+ det: bool = True
89
+ rec: bool = True
90
+ cls: Optional[bool] = None
91
+
92
+ def __post_init__(self):
93
+ if self.use_gpu is None:
94
+ if self.device and 'cuda' in self.device.lower():
95
+ self.use_gpu = True
96
+ else:
97
+ self.use_gpu = False
98
+ # logger.debug(f"Initialized PaddleOCROptions: {self}")
99
+
100
+ # --- Surya Specific Options ---
101
+ @dataclass
102
+ class SuryaOCROptions(BaseOCROptions):
103
+ """Specific options for the Surya OCR engine."""
104
+ # Currently, Surya example shows languages passed at prediction time.
105
+ # Add fields here if Surya's RecognitionPredictor or DetectionPredictor
106
+ # constructors accept relevant arguments (e.g., model paths, device settings).
107
+ # For now, it primarily uses the base options like 'languages' and 'min_confidence'.
108
+ # Configuration like batch sizes are often set via environment variables for Surya.
109
+ pass
110
+
111
+
112
+ # --- Union type for type hinting ---
113
+ OCROptions = Union[EasyOCROptions, PaddleOCROptions, SuryaOCROptions, BaseOCROptions]
114
+