natural-pdf 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. docs/finetuning/index.md +176 -0
  2. docs/ocr/index.md +34 -47
  3. docs/tutorials/01-loading-and-extraction.ipynb +34 -1536
  4. docs/tutorials/02-finding-elements.ipynb +42 -42
  5. docs/tutorials/03-extracting-blocks.ipynb +17 -17
  6. docs/tutorials/04-table-extraction.ipynb +12 -12
  7. docs/tutorials/05-excluding-content.ipynb +30 -30
  8. docs/tutorials/06-document-qa.ipynb +28 -28
  9. docs/tutorials/07-layout-analysis.ipynb +63 -35
  10. docs/tutorials/07-working-with-regions.ipynb +55 -51
  11. docs/tutorials/07-working-with-regions.md +2 -2
  12. docs/tutorials/08-spatial-navigation.ipynb +60 -60
  13. docs/tutorials/09-section-extraction.ipynb +113 -113
  14. docs/tutorials/10-form-field-extraction.ipynb +78 -50
  15. docs/tutorials/11-enhanced-table-processing.ipynb +6 -6
  16. docs/tutorials/12-ocr-integration.ipynb +149 -131
  17. docs/tutorials/12-ocr-integration.md +0 -13
  18. docs/tutorials/13-semantic-search.ipynb +313 -873
  19. natural_pdf/__init__.py +21 -22
  20. natural_pdf/analyzers/layout/gemini.py +280 -0
  21. natural_pdf/analyzers/layout/layout_manager.py +28 -1
  22. natural_pdf/analyzers/layout/layout_options.py +11 -0
  23. natural_pdf/analyzers/layout/yolo.py +6 -2
  24. natural_pdf/collections/pdf_collection.py +24 -0
  25. natural_pdf/core/element_manager.py +18 -13
  26. natural_pdf/core/page.py +174 -36
  27. natural_pdf/core/pdf.py +156 -42
  28. natural_pdf/elements/base.py +9 -17
  29. natural_pdf/elements/collections.py +99 -38
  30. natural_pdf/elements/region.py +77 -37
  31. natural_pdf/elements/text.py +5 -0
  32. natural_pdf/exporters/__init__.py +4 -0
  33. natural_pdf/exporters/base.py +61 -0
  34. natural_pdf/exporters/paddleocr.py +345 -0
  35. natural_pdf/ocr/__init__.py +57 -36
  36. natural_pdf/ocr/engine.py +160 -49
  37. natural_pdf/ocr/engine_easyocr.py +178 -157
  38. natural_pdf/ocr/engine_paddle.py +114 -189
  39. natural_pdf/ocr/engine_surya.py +87 -144
  40. natural_pdf/ocr/ocr_factory.py +125 -0
  41. natural_pdf/ocr/ocr_manager.py +65 -89
  42. natural_pdf/ocr/ocr_options.py +8 -13
  43. natural_pdf/ocr/utils.py +113 -0
  44. natural_pdf/templates/finetune/fine_tune_paddleocr.md +415 -0
  45. natural_pdf/templates/spa/css/style.css +334 -0
  46. natural_pdf/templates/spa/index.html +31 -0
  47. natural_pdf/templates/spa/js/app.js +472 -0
  48. natural_pdf/templates/spa/words.txt +235976 -0
  49. natural_pdf/utils/debug.py +34 -0
  50. natural_pdf/utils/identifiers.py +33 -0
  51. natural_pdf/utils/packaging.py +485 -0
  52. natural_pdf/utils/text_extraction.py +44 -64
  53. natural_pdf/utils/visualization.py +1 -1
  54. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/METADATA +44 -20
  55. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/RECORD +58 -47
  56. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/WHEEL +1 -1
  57. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/top_level.txt +0 -1
  58. natural_pdf/templates/ocr_debug.html +0 -517
  59. tests/test_loading.py +0 -50
  60. tests/test_optional_deps.py +0 -298
  61. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/licenses/LICENSE +0 -0
natural_pdf/ocr/engine.py CHANGED
@@ -11,35 +11,153 @@ from .ocr_options import BaseOCROptions
11
11
  logger = logging.getLogger(__name__)
12
12
 
13
13
 
14
+ class TextRegion:
15
+ """Standard representation of an OCR text region."""
16
+
17
+ def __init__(
18
+ self,
19
+ bbox: Tuple[float, float, float, float],
20
+ text: str,
21
+ confidence: float,
22
+ source: str = "ocr",
23
+ ):
24
+ """
25
+ Initialize a text region.
26
+
27
+ Args:
28
+ bbox: Tuple of (x0, y0, x1, y1) coordinates
29
+ text: The recognized text
30
+ confidence: Confidence score (0.0-1.0)
31
+ source: Source of the text region (default: "ocr")
32
+ """
33
+ self.bbox = bbox
34
+ self.text = text
35
+ self.confidence = confidence
36
+ self.source = source
37
+
38
+ @classmethod
39
+ def from_polygon(cls, polygon: List[List[float]], text: str, confidence: float):
40
+ """Create from polygon coordinates [[x1,y1], [x2,y2], ...]"""
41
+ x_coords = [float(point[0]) for point in polygon]
42
+ y_coords = [float(point[1]) for point in polygon]
43
+ bbox = (min(x_coords), min(y_coords), max(x_coords), max(y_coords))
44
+ return cls(bbox, text, confidence)
45
+
46
+ def to_dict(self) -> Dict[str, Any]:
47
+ """Convert to dictionary representation for compatibility."""
48
+ return {
49
+ "bbox": self.bbox,
50
+ "text": self.text,
51
+ "confidence": self.confidence,
52
+ "source": self.source,
53
+ }
54
+
55
+
14
56
  class OCREngine(ABC):
15
57
  """Abstract Base Class for OCR engines."""
16
58
 
59
+ # Default values as class constants
60
+ DEFAULT_MIN_CONFIDENCE = 0.2
61
+ DEFAULT_LANGUAGES = ["en"]
62
+ DEFAULT_DEVICE = "cpu"
63
+
17
64
  def __init__(self):
18
65
  """Initializes the base OCR engine."""
19
66
  self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")
20
67
  self.logger.info(f"Initializing {self.__class__.__name__}")
68
+ self._model = None
69
+ self._initialized = False
21
70
  self._reader_cache = {} # Cache for initialized models/readers
22
71
 
23
- @abstractmethod
24
72
  def process_image(
25
73
  self,
26
- images: Union[Image.Image, List[Image.Image]], # Accept single or list
27
- options: BaseOCROptions,
28
- ) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]: # Return single or list of lists
74
+ images: Union[Image.Image, List[Image.Image]],
75
+ languages: Optional[List[str]] = None,
76
+ min_confidence: Optional[float] = None,
77
+ device: Optional[str] = None,
78
+ detect_only: bool = False,
79
+ options: Optional[BaseOCROptions] = None,
80
+ ) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]:
29
81
  """
30
- Processes a single image or a batch of images using the specific engine and options.
82
+ Process a single image or batch of images with OCR.
31
83
 
32
84
  Args:
33
- images: A single PIL Image or a list of PIL Images.
34
- options: An instance of a dataclass inheriting from BaseOCROptions
35
- containing configuration for this run.
85
+ images: A single PIL Image or a list of PIL Images
86
+ languages: List of languages to use (default: ['en'])
87
+ min_confidence: Minimum confidence threshold (default: 0.2)
88
+ device: Device to use for processing (default: 'cpu')
89
+ detect_only: Whether to only detect text regions without recognition
90
+ options: Engine-specific options
36
91
 
37
92
  Returns:
38
- If input is a single image: List of result dictionaries.
39
- If input is a list of images: List of lists of result dictionaries,
40
- corresponding to each input image.
41
- An empty list indicates failure for that image.
93
+ For a single image: List of text region dictionaries
94
+ For a batch: List of lists of text region dictionaries
42
95
  """
96
+ # Convert single image to batch format
97
+ single_image = not isinstance(images, list)
98
+ image_batch = [images] if single_image else images
99
+
100
+ # Use default values where parameters are not provided
101
+ effective_languages = languages or self.DEFAULT_LANGUAGES
102
+ effective_confidence = (
103
+ min_confidence if min_confidence is not None else self.DEFAULT_MIN_CONFIDENCE
104
+ )
105
+ effective_device = device or self.DEFAULT_DEVICE
106
+
107
+ # Ensure the model is initialized
108
+ self._ensure_initialized(effective_languages, effective_device, options)
109
+
110
+ # Process each image in the batch
111
+ results = []
112
+ for img in image_batch:
113
+ # Preprocess the image for the specific engine
114
+ processed_img = self._preprocess_image(img)
115
+
116
+ # Process the image with the engine-specific implementation
117
+ raw_results = self._process_single_image(processed_img, detect_only, options)
118
+
119
+ # Convert results to standardized format
120
+ text_regions = self._standardize_results(raw_results, effective_confidence, detect_only)
121
+
122
+ # Convert TextRegion objects to dictionaries for backward compatibility
123
+ region_dicts = [region.to_dict() for region in text_regions]
124
+ results.append(region_dicts)
125
+
126
+ # Return results in the appropriate format
127
+ return results[0] if single_image else results
128
+
129
+ def _ensure_initialized(
130
+ self, languages: List[str], device: str, options: Optional[BaseOCROptions]
131
+ ):
132
+ """Ensure the model is initialized with the correct parameters."""
133
+ if not self._initialized:
134
+ self._initialize_model(languages, device, options)
135
+ self._initialized = True
136
+
137
+ @abstractmethod
138
+ def _initialize_model(
139
+ self, languages: List[str], device: str, options: Optional[BaseOCROptions]
140
+ ):
141
+ """Initialize the OCR model with the given parameters."""
142
+ raise NotImplementedError("Subclasses must implement this method")
143
+
144
+ @abstractmethod
145
+ def _preprocess_image(self, image: Image.Image) -> Any:
146
+ """Convert PIL Image to engine-specific format."""
147
+ raise NotImplementedError("Subclasses must implement this method")
148
+
149
+ @abstractmethod
150
+ def _process_single_image(
151
+ self, image: Any, detect_only: bool, options: Optional[BaseOCROptions]
152
+ ) -> Any:
153
+ """Process a single image with the initialized model."""
154
+ raise NotImplementedError("Subclasses must implement this method")
155
+
156
+ @abstractmethod
157
+ def _standardize_results(
158
+ self, raw_results: Any, min_confidence: float, detect_only: bool
159
+ ) -> List[TextRegion]:
160
+ """Convert engine-specific results to standardized TextRegion objects."""
43
161
  raise NotImplementedError("Subclasses must implement this method")
44
162
 
45
163
  @abstractmethod
@@ -63,48 +181,41 @@ class OCREngine(ABC):
63
181
  Returns:
64
182
  A string cache key.
65
183
  """
66
- # Basic key includes languages and device
67
- lang_key = "-".join(sorted(options.languages))
68
- device_key = str(options.device).lower()
184
+ lang_key = "-".join(sorted(getattr(options, "languages", self.DEFAULT_LANGUAGES)))
185
+ device_key = str(getattr(options, "device", self.DEFAULT_DEVICE)).lower()
69
186
  return f"{self.__class__.__name__}_{lang_key}_{device_key}"
70
187
 
71
- def _standardize_bbox(self, bbox: Any) -> Optional[Tuple[float, float, float, float]]:
72
- """
73
- Helper to standardize bounding boxes to (x0, y0, x1, y1) format.
74
-
75
- Args:
76
- bbox: The bounding box in the engine's native format.
77
- Expected formats:
78
- - List/Tuple of 4 numbers: (x0, y0, x1, y1)
79
- - List of points: [[x1,y1],[x2,y2],[x3,y3],[x4,y4]] (polygon)
80
-
81
- Returns:
82
- Tuple[float, float, float, float] or None if conversion fails.
83
- """
84
- try:
85
- if (
86
- isinstance(bbox, (list, tuple))
87
- and len(bbox) == 4
88
- and all(isinstance(n, (int, float)) for n in bbox)
89
- ):
90
- # Already in (x0, y0, x1, y1) format (or similar)
188
+ def _standardize_bbox(self, bbox: Any) -> Tuple[float, float, float, float]:
189
+ """Standardizes bounding boxes to (x0, y0, x1, y1) format. Raises ValueError if standardization fails."""
190
+ # Check if it's already in the correct tuple/list format
191
+ if (
192
+ isinstance(bbox, (list, tuple))
193
+ and len(bbox) == 4
194
+ and all(isinstance(n, (int, float)) for n in bbox)
195
+ ):
196
+ try:
91
197
  return tuple(float(c) for c in bbox[:4])
92
- elif (
93
- isinstance(bbox, (list, tuple))
94
- and len(bbox) > 0
95
- and isinstance(bbox[0], (list, tuple))
96
- ):
97
- # Polygon format [[x1,y1],[x2,y2],...]
198
+ except (ValueError, TypeError) as e:
199
+ raise ValueError(f"Invalid number format in bbox: {bbox}") from e
200
+
201
+ # Check if it's in polygon format [[x1,y1],[x2,y2],...]
202
+ elif (
203
+ isinstance(bbox, (list, tuple))
204
+ and len(bbox) > 0
205
+ and isinstance(bbox[0], (list, tuple))
206
+ and len(bbox[0]) == 2 # Ensure points are pairs
207
+ ):
208
+ try:
98
209
  x_coords = [float(point[0]) for point in bbox]
99
210
  y_coords = [float(point[1]) for point in bbox]
100
- x0 = min(x_coords)
101
- y0 = min(y_coords)
102
- x1 = max(x_coords)
103
- y1 = max(y_coords)
104
- return (x0, y0, x1, y1)
105
- except Exception as e:
106
- self.logger.warning(f"Could not standardize bounding box: {bbox}. Error: {e}")
107
- return None
211
+ if not x_coords or not y_coords: # Handle empty polygon case
212
+ raise ValueError("Empty polygon provided")
213
+ return (min(x_coords), min(y_coords), max(x_coords), max(y_coords))
214
+ except (ValueError, TypeError, IndexError) as e:
215
+ raise ValueError(f"Invalid polygon format or values: {bbox}") from e
216
+
217
+ # If it's neither format, raise an error
218
+ raise ValueError(f"Could not standardize bounding box from unexpected format: {bbox}")
108
219
 
109
220
  def __del__(self):
110
221
  """Cleanup resources when the engine is deleted."""
@@ -1,13 +1,12 @@
1
1
  # ocr_engine_easyocr.py
2
2
  import importlib.util
3
- import inspect # Used for dynamic parameter passing
4
3
  import logging
5
4
  from typing import Any, Dict, List, Optional, Tuple, Union
6
5
 
7
6
  import numpy as np
8
7
  from PIL import Image
9
8
 
10
- from .engine import OCREngine
9
+ from .engine import OCREngine, TextRegion
11
10
  from .ocr_options import BaseOCROptions, EasyOCROptions
12
11
 
13
12
  logger = logging.getLogger(__name__)
@@ -18,178 +17,200 @@ class EasyOCREngine(OCREngine):
18
17
 
19
18
  def __init__(self):
20
19
  super().__init__()
21
- self._easyocr = None # Lazy load easyocr module
22
-
23
- def _lazy_import_easyocr(self):
24
- """Imports easyocr only when needed."""
25
- if self._easyocr is None:
26
- if not self.is_available():
27
- raise ImportError("EasyOCR is not installed or available.")
28
- try:
29
- import easyocr
30
-
31
- self._easyocr = easyocr
32
- logger.info("EasyOCR module imported successfully.")
33
- except ImportError as e:
34
- logger.error(f"Failed to import EasyOCR: {e}")
35
- raise
36
- return self._easyocr
20
+ # No longer need _easyocr attribute
21
+ # self._easyocr = None
37
22
 
38
23
  def is_available(self) -> bool:
39
24
  """Check if EasyOCR is installed."""
40
25
  return importlib.util.find_spec("easyocr") is not None
41
26
 
42
- def _get_cache_key(self, options: EasyOCROptions) -> str:
43
- """Generate a more specific cache key for EasyOCR."""
44
- base_key = super()._get_cache_key(options)
45
- recog_key = options.recog_network
46
- detect_key = options.detect_network
47
- quantize_key = str(options.quantize)
48
- return f"{base_key}_{recog_key}_{detect_key}_{quantize_key}"
49
-
50
- def _get_reader(self, options: EasyOCROptions):
51
- """Get or initialize an EasyOCR reader based on options."""
52
- cache_key = self._get_cache_key(options)
53
- if cache_key in self._reader_cache:
54
- logger.debug(f"Using cached EasyOCR reader for key: {cache_key}")
55
- return self._reader_cache[cache_key]
56
-
57
- logger.info(f"Creating new EasyOCR reader for key: {cache_key}")
58
- easyocr = self._lazy_import_easyocr()
59
-
60
- constructor_sig = inspect.signature(easyocr.Reader.__init__)
61
- constructor_args = {}
62
- constructor_args["lang_list"] = options.languages
63
- constructor_args["gpu"] = (
64
- "cuda" in str(options.device).lower() or "mps" in str(options.device).lower()
65
- )
66
-
67
- for field_name, param in constructor_sig.parameters.items():
68
- if field_name in ["self", "lang_list", "gpu"]:
69
- continue
70
- if hasattr(options, field_name):
71
- constructor_args[field_name] = getattr(options, field_name)
72
- elif field_name in options.extra_args:
73
- constructor_args[field_name] = options.extra_args[field_name]
74
-
75
- logger.debug(f"EasyOCR Reader constructor args: {constructor_args}")
27
+ def _initialize_model(
28
+ self, languages: List[str], device: str, options: Optional[BaseOCROptions]
29
+ ):
30
+ """Initialize the EasyOCR model."""
31
+ # Import directly here
32
+ try:
33
+ import easyocr
34
+
35
+ self.logger.info("EasyOCR module imported successfully.")
36
+ except ImportError as e:
37
+ self.logger.error(f"Failed to import EasyOCR: {e}")
38
+ raise
39
+
40
+ # Cast to EasyOCROptions if possible, otherwise use default
41
+ easy_options = options if isinstance(options, EasyOCROptions) else EasyOCROptions()
42
+
43
+ # Prepare constructor arguments
44
+ use_gpu = "cuda" in device.lower() or "mps" in device.lower()
45
+
46
+ constructor_args = {
47
+ "lang_list": languages,
48
+ "gpu": use_gpu,
49
+ # Explicitly map relevant options
50
+ "model_storage_directory": easy_options.model_storage_directory,
51
+ "user_network_directory": easy_options.user_network_directory,
52
+ "recog_network": easy_options.recog_network,
53
+ "detect_network": easy_options.detect_network,
54
+ "download_enabled": easy_options.download_enabled,
55
+ "detector": easy_options.detector,
56
+ "recognizer": easy_options.recognizer,
57
+ "verbose": easy_options.verbose,
58
+ "quantize": easy_options.quantize,
59
+ "cudnn_benchmark": easy_options.cudnn_benchmark,
60
+ }
61
+
62
+ # Filter out None values, as EasyOCR expects non-None or default behaviour
63
+ constructor_args = {k: v for k, v in constructor_args.items() if v is not None}
64
+
65
+ self.logger.debug(f"EasyOCR Reader constructor args: {constructor_args}")
66
+
67
+ # Create the reader
76
68
  try:
77
- reader = easyocr.Reader(**constructor_args)
78
- self._reader_cache[cache_key] = reader
79
- logger.info("EasyOCR reader created successfully.")
80
- return reader
69
+ self._model = easyocr.Reader(**constructor_args)
70
+ self.logger.info("EasyOCR reader created successfully")
81
71
  except Exception as e:
82
- logger.error(f"Failed to create EasyOCR reader: {e}", exc_info=True)
72
+ self.logger.error(f"Failed to create EasyOCR reader: {e}")
83
73
  raise
84
74
 
85
- def _prepare_readtext_args(self, options: EasyOCROptions, reader) -> Dict[str, Any]:
86
- """Helper to prepare arguments for the readtext method."""
87
- readtext_sig = inspect.signature(reader.readtext)
75
+ def _preprocess_image(self, image: Image.Image) -> np.ndarray:
76
+ """Convert PIL Image to numpy array for EasyOCR."""
77
+ return np.array(image)
78
+
79
+ def _process_single_image(
80
+ self, image: np.ndarray, detect_only: bool, options: Optional[EasyOCROptions]
81
+ ) -> Any:
82
+ """Process a single image with EasyOCR."""
83
+ if self._model is None:
84
+ raise RuntimeError("EasyOCR model not initialized")
85
+
86
+ # Cast options to proper type if provided
87
+ easy_options = options if isinstance(options, EasyOCROptions) else EasyOCROptions()
88
+
89
+ # Prepare readtext arguments (only needed if not detect_only)
88
90
  readtext_args = {}
89
- for field_name, param in readtext_sig.parameters.items():
90
- if field_name == "image":
91
- continue
92
- if hasattr(options, field_name):
93
- readtext_args[field_name] = getattr(options, field_name)
94
- elif field_name in options.extra_args:
95
- readtext_args[field_name] = options.extra_args[field_name]
96
- logger.debug(f"EasyOCR readtext args: {readtext_args}")
97
- return readtext_args
91
+ if not detect_only:
92
+ for param in [
93
+ "detail",
94
+ "paragraph",
95
+ "min_size",
96
+ "contrast_ths",
97
+ "adjust_contrast",
98
+ "filter_ths",
99
+ "text_threshold",
100
+ "low_text",
101
+ "link_threshold",
102
+ "canvas_size",
103
+ "mag_ratio",
104
+ "slope_ths",
105
+ "ycenter_ths",
106
+ "height_ths",
107
+ "width_ths",
108
+ "y_ths",
109
+ "x_ths",
110
+ "add_margin",
111
+ "output_format",
112
+ ]:
113
+ if hasattr(easy_options, param):
114
+ val = getattr(easy_options, param)
115
+ if val is not None:
116
+ readtext_args[param] = val
117
+
118
+ # Process differently based on detect_only flag
119
+ if detect_only:
120
+ # Returns tuple (horizontal_list, free_list)
121
+ # horizontal_list is a list containing one item: the list of boxes
122
+ # Each box is [[x1, y1], [x2, y1], [x2, y2], [x1, y2]]
123
+ bboxes_tuple = self._model.detect(
124
+ image, **readtext_args
125
+ ) # Pass args here too? Check EasyOCR docs if needed.
126
+ if (
127
+ bboxes_tuple
128
+ and isinstance(bboxes_tuple, tuple)
129
+ and len(bboxes_tuple) > 0
130
+ and isinstance(bboxes_tuple[0], list)
131
+ ):
132
+ return bboxes_tuple[0] # Return the list of polygons directly
133
+ else:
134
+ self.logger.warning(f"EasyOCR detect returned unexpected format: {bboxes_tuple}")
135
+ return [] # Return empty list on unexpected format
136
+ else:
137
+ return self._model.readtext(image, **readtext_args)
98
138
 
99
139
  def _standardize_results(
100
- self, raw_results: List[Any], options: EasyOCROptions
101
- ) -> List[Dict[str, Any]]:
102
- """Standardizes raw results from EasyOCR's readtext."""
103
- standardized_results = []
104
- min_confidence = options.min_confidence
105
-
140
+ self, raw_results: Any, min_confidence: float, detect_only: bool
141
+ ) -> List[TextRegion]:
142
+ """Convert EasyOCR results to standardized TextRegion objects."""
143
+ standardized_regions = []
144
+
145
+ if detect_only:
146
+ # In detect_only mode, raw_results is already a list of bounding boxes
147
+ # Each bbox is in [x_min, x_max, y_min, y_max] format
148
+ if isinstance(raw_results, list):
149
+ for detection in raw_results:
150
+ try:
151
+ if isinstance(detection, (list, tuple)) and len(detection) == 4:
152
+ x_min, x_max, y_min, y_max = detection
153
+ # Convert to standardized (x0, y0, x1, y1) format
154
+ try:
155
+ bbox = (float(x_min), float(y_min), float(x_max), float(y_max))
156
+ standardized_regions.append(
157
+ TextRegion(bbox, text=None, confidence=None)
158
+ )
159
+ except (ValueError, TypeError) as e:
160
+ raise ValueError(
161
+ f"Invalid number format in EasyOCR detect bbox: {detection}"
162
+ ) from e
163
+ else:
164
+ raise ValueError(f"Invalid detection format from EasyOCR: {detection}")
165
+ except ValueError as e:
166
+ # Re-raise any value errors from standardization or format checks
167
+ raise e
168
+ except Exception as e:
169
+ # Catch other potential processing errors
170
+ raise ValueError(
171
+ f"Error processing EasyOCR detection item: {detection}"
172
+ ) from e
173
+ else:
174
+ raise ValueError(
175
+ f"Expected list of bounding boxes in detect_only mode, got: {raw_results}"
176
+ )
177
+
178
+ return standardized_regions
179
+
180
+ # Full OCR mode (readtext results)
106
181
  for detection in raw_results:
107
182
  try:
108
- if (
109
- options.detail == 1
110
- and isinstance(detection, (list, tuple))
111
- and len(detection) >= 3
112
- ):
113
- bbox_raw = detection[0]
183
+ # Detail mode (list/tuple result)
184
+ if isinstance(detection, (list, tuple)) and len(detection) >= 3:
185
+ bbox_raw = detection[0] # This is usually a polygon [[x1,y1],...]
114
186
  text = str(detection[1])
115
187
  confidence = float(detection[2])
116
188
 
117
189
  if confidence >= min_confidence:
118
- bbox = self._standardize_bbox(bbox_raw)
119
- if bbox:
120
- standardized_results.append(
121
- {
122
- "bbox": bbox,
123
- "text": text,
124
- "confidence": confidence,
125
- "source": "ocr",
126
- }
127
- )
128
- else:
129
- logger.warning(f"Skipping result due to invalid bbox: {bbox_raw}")
130
-
131
- elif options.detail == 0 and isinstance(detection, str):
132
- standardized_results.append(
133
- {"bbox": None, "text": detection, "confidence": 1.0, "source": "ocr"}
190
+ try:
191
+ # Use the standard helper for polygons
192
+ bbox = self._standardize_bbox(bbox_raw)
193
+ standardized_regions.append(TextRegion(bbox, text, confidence))
194
+ except ValueError as e:
195
+ raise ValueError(
196
+ f"Could not standardize bounding box from EasyOCR readtext: {bbox_raw}"
197
+ ) from e
198
+
199
+ # Simple mode (string result)
200
+ elif isinstance(detection, str):
201
+ if 0.0 >= min_confidence: # Always include if min_confidence is 0
202
+ standardized_regions.append(TextRegion((0, 0, 0, 0), detection, 1.0))
203
+ else:
204
+ # Handle unexpected format in OCR mode
205
+ raise ValueError(
206
+ f"Invalid OCR detection format from EasyOCR readtext: {detection}"
134
207
  )
135
- except (IndexError, ValueError, TypeError) as e:
136
- logger.warning(f"Skipping invalid detection format: {detection}. Error: {e}")
137
- continue
138
- return standardized_results
139
-
140
- def process_image(
141
- self, images: Union[Image.Image, List[Image.Image]], options: BaseOCROptions
142
- ) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]:
143
- """Processes a single image or a batch of images with EasyOCR."""
144
-
145
- if not isinstance(options, EasyOCROptions):
146
- logger.warning("Received BaseOCROptions, expected EasyOCROptions. Using defaults.")
147
- # Create default EasyOCR options if base was passed, preserving base settings
148
- options = EasyOCROptions(
149
- languages=options.languages,
150
- min_confidence=options.min_confidence,
151
- device=options.device,
152
- extra_args=options.extra_args, # Pass along any extra args
153
- )
154
-
155
- reader = self._get_reader(options)
156
- readtext_args = self._prepare_readtext_args(options, reader)
157
-
158
- # --- Handle single image or batch ---
159
- if isinstance(images, list):
160
- # --- Batch Processing (Iterative for EasyOCR) ---
161
- all_results = []
162
- logger.info(f"Processing batch of {len(images)} images with EasyOCR (iteratively)...")
163
- for i, img in enumerate(images):
164
- if not isinstance(img, Image.Image):
165
- logger.warning(f"Item at index {i} in batch is not a PIL Image. Skipping.")
166
- all_results.append([])
167
- continue
168
- img_array = np.array(img)
169
- try:
170
- logger.debug(f"Processing image {i+1}/{len(images)} in batch.")
171
- raw_results = reader.readtext(img_array, **readtext_args)
172
- standardized = self._standardize_results(raw_results, options)
173
- all_results.append(standardized)
174
- except Exception as e:
175
- logger.error(
176
- f"Error processing image {i+1} in EasyOCR batch: {e}", exc_info=True
177
- )
178
- all_results.append([]) # Append empty list for failed image
179
- logger.info(f"Finished processing batch with EasyOCR.")
180
- return all_results # Return List[List[Dict]]
181
-
182
- elif isinstance(images, Image.Image):
183
- # --- Single Image Processing ---
184
- logger.info("Processing single image with EasyOCR...")
185
- img_array = np.array(images)
186
- try:
187
- raw_results = reader.readtext(img_array, **readtext_args)
188
- standardized = self._standardize_results(raw_results, options)
189
- logger.info(f"Finished processing single image. Found {len(standardized)} results.")
190
- return standardized # Return List[Dict]
208
+
209
+ except ValueError as e:
210
+ # Re-raise any value errors from standardization or format checks
211
+ raise e
191
212
  except Exception as e:
192
- logger.error(f"Error processing single image with EasyOCR: {e}", exc_info=True)
193
- return [] # Return empty list on failure
194
- else:
195
- raise TypeError("Input 'images' must be a PIL Image or a list of PIL Images.")
213
+ # Catch other potential processing errors
214
+ raise ValueError(f"Error processing EasyOCR detection item: {detection}") from e
215
+
216
+ return standardized_regions