natural-pdf 0.1.5__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. docs/ocr/index.md +34 -47
  2. docs/tutorials/01-loading-and-extraction.ipynb +60 -46
  3. docs/tutorials/02-finding-elements.ipynb +42 -42
  4. docs/tutorials/03-extracting-blocks.ipynb +17 -17
  5. docs/tutorials/04-table-extraction.ipynb +12 -12
  6. docs/tutorials/05-excluding-content.ipynb +30 -30
  7. docs/tutorials/06-document-qa.ipynb +28 -28
  8. docs/tutorials/07-layout-analysis.ipynb +63 -35
  9. docs/tutorials/07-working-with-regions.ipynb +55 -51
  10. docs/tutorials/07-working-with-regions.md +2 -2
  11. docs/tutorials/08-spatial-navigation.ipynb +60 -60
  12. docs/tutorials/09-section-extraction.ipynb +113 -113
  13. docs/tutorials/10-form-field-extraction.ipynb +78 -50
  14. docs/tutorials/11-enhanced-table-processing.ipynb +6 -6
  15. docs/tutorials/12-ocr-integration.ipynb +149 -131
  16. docs/tutorials/12-ocr-integration.md +0 -13
  17. docs/tutorials/13-semantic-search.ipynb +313 -873
  18. natural_pdf/__init__.py +21 -23
  19. natural_pdf/analyzers/layout/gemini.py +264 -0
  20. natural_pdf/analyzers/layout/layout_manager.py +28 -1
  21. natural_pdf/analyzers/layout/layout_options.py +11 -0
  22. natural_pdf/analyzers/layout/yolo.py +6 -2
  23. natural_pdf/collections/pdf_collection.py +21 -0
  24. natural_pdf/core/element_manager.py +16 -13
  25. natural_pdf/core/page.py +165 -36
  26. natural_pdf/core/pdf.py +146 -41
  27. natural_pdf/elements/base.py +11 -17
  28. natural_pdf/elements/collections.py +100 -38
  29. natural_pdf/elements/region.py +77 -38
  30. natural_pdf/elements/text.py +5 -0
  31. natural_pdf/ocr/__init__.py +49 -36
  32. natural_pdf/ocr/engine.py +146 -51
  33. natural_pdf/ocr/engine_easyocr.py +141 -161
  34. natural_pdf/ocr/engine_paddle.py +107 -193
  35. natural_pdf/ocr/engine_surya.py +75 -148
  36. natural_pdf/ocr/ocr_factory.py +114 -0
  37. natural_pdf/ocr/ocr_manager.py +65 -93
  38. natural_pdf/ocr/ocr_options.py +7 -17
  39. natural_pdf/ocr/utils.py +98 -0
  40. natural_pdf/templates/spa/css/style.css +334 -0
  41. natural_pdf/templates/spa/index.html +31 -0
  42. natural_pdf/templates/spa/js/app.js +472 -0
  43. natural_pdf/templates/spa/words.txt +235976 -0
  44. natural_pdf/utils/debug.py +32 -0
  45. natural_pdf/utils/identifiers.py +29 -0
  46. natural_pdf/utils/packaging.py +418 -0
  47. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/METADATA +41 -19
  48. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/RECORD +51 -44
  49. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/WHEEL +1 -1
  50. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/top_level.txt +0 -1
  51. natural_pdf/templates/ocr_debug.html +0 -517
  52. tests/test_loading.py +0 -50
  53. tests/test_optional_deps.py +0 -298
  54. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/licenses/LICENSE +0 -0
natural_pdf/ocr/engine.py CHANGED
@@ -11,35 +11,137 @@ from .ocr_options import BaseOCROptions
11
11
  logger = logging.getLogger(__name__)
12
12
 
13
13
 
14
+ class TextRegion:
15
+ """Standard representation of an OCR text region."""
16
+
17
+ def __init__(self, bbox: Tuple[float, float, float, float], text: str, confidence: float, source: str = "ocr"):
18
+ """
19
+ Initialize a text region.
20
+
21
+ Args:
22
+ bbox: Tuple of (x0, y0, x1, y1) coordinates
23
+ text: The recognized text
24
+ confidence: Confidence score (0.0-1.0)
25
+ source: Source of the text region (default: "ocr")
26
+ """
27
+ self.bbox = bbox
28
+ self.text = text
29
+ self.confidence = confidence
30
+ self.source = source
31
+
32
+ @classmethod
33
+ def from_polygon(cls, polygon: List[List[float]], text: str, confidence: float):
34
+ """Create from polygon coordinates [[x1,y1], [x2,y2], ...]"""
35
+ x_coords = [float(point[0]) for point in polygon]
36
+ y_coords = [float(point[1]) for point in polygon]
37
+ bbox = (min(x_coords), min(y_coords), max(x_coords), max(y_coords))
38
+ return cls(bbox, text, confidence)
39
+
40
+ def to_dict(self) -> Dict[str, Any]:
41
+ """Convert to dictionary representation for compatibility."""
42
+ return {
43
+ "bbox": self.bbox,
44
+ "text": self.text,
45
+ "confidence": self.confidence,
46
+ "source": self.source
47
+ }
48
+
49
+
14
50
  class OCREngine(ABC):
15
51
  """Abstract Base Class for OCR engines."""
52
+
53
+ # Default values as class constants
54
+ DEFAULT_MIN_CONFIDENCE = 0.2
55
+ DEFAULT_LANGUAGES = ['en']
56
+ DEFAULT_DEVICE = 'cpu'
16
57
 
17
58
  def __init__(self):
18
59
  """Initializes the base OCR engine."""
19
60
  self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")
20
61
  self.logger.info(f"Initializing {self.__class__.__name__}")
62
+ self._model = None
63
+ self._initialized = False
21
64
  self._reader_cache = {} # Cache for initialized models/readers
22
65
 
23
- @abstractmethod
24
66
  def process_image(
25
67
  self,
26
- images: Union[Image.Image, List[Image.Image]], # Accept single or list
27
- options: BaseOCROptions,
28
- ) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]: # Return single or list of lists
68
+ images: Union[Image.Image, List[Image.Image]],
69
+ languages: Optional[List[str]] = None,
70
+ min_confidence: Optional[float] = None,
71
+ device: Optional[str] = None,
72
+ detect_only: bool = False,
73
+ options: Optional[BaseOCROptions] = None,
74
+ ) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]:
29
75
  """
30
- Processes a single image or a batch of images using the specific engine and options.
31
-
76
+ Process a single image or batch of images with OCR.
77
+
32
78
  Args:
33
- images: A single PIL Image or a list of PIL Images.
34
- options: An instance of a dataclass inheriting from BaseOCROptions
35
- containing configuration for this run.
36
-
79
+ images: A single PIL Image or a list of PIL Images
80
+ languages: List of languages to use (default: ['en'])
81
+ min_confidence: Minimum confidence threshold (default: 0.2)
82
+ device: Device to use for processing (default: 'cpu')
83
+ detect_only: Whether to only detect text regions without recognition
84
+ options: Engine-specific options
85
+
37
86
  Returns:
38
- If input is a single image: List of result dictionaries.
39
- If input is a list of images: List of lists of result dictionaries,
40
- corresponding to each input image.
41
- An empty list indicates failure for that image.
87
+ For a single image: List of text region dictionaries
88
+ For a batch: List of lists of text region dictionaries
42
89
  """
90
+ # Convert single image to batch format
91
+ single_image = not isinstance(images, list)
92
+ image_batch = [images] if single_image else images
93
+
94
+ # Use default values where parameters are not provided
95
+ effective_languages = languages or self.DEFAULT_LANGUAGES
96
+ effective_confidence = min_confidence if min_confidence is not None else self.DEFAULT_MIN_CONFIDENCE
97
+ effective_device = device or self.DEFAULT_DEVICE
98
+
99
+ # Ensure the model is initialized
100
+ self._ensure_initialized(effective_languages, effective_device, options)
101
+
102
+ # Process each image in the batch
103
+ results = []
104
+ for img in image_batch:
105
+ # Preprocess the image for the specific engine
106
+ processed_img = self._preprocess_image(img)
107
+
108
+ # Process the image with the engine-specific implementation
109
+ raw_results = self._process_single_image(processed_img, detect_only, options)
110
+
111
+ # Convert results to standardized format
112
+ text_regions = self._standardize_results(raw_results, effective_confidence, detect_only)
113
+
114
+ # Convert TextRegion objects to dictionaries for backward compatibility
115
+ region_dicts = [region.to_dict() for region in text_regions]
116
+ results.append(region_dicts)
117
+
118
+ # Return results in the appropriate format
119
+ return results[0] if single_image else results
120
+
121
+ def _ensure_initialized(self, languages: List[str], device: str, options: Optional[BaseOCROptions]):
122
+ """Ensure the model is initialized with the correct parameters."""
123
+ if not self._initialized:
124
+ self._initialize_model(languages, device, options)
125
+ self._initialized = True
126
+
127
+ @abstractmethod
128
+ def _initialize_model(self, languages: List[str], device: str, options: Optional[BaseOCROptions]):
129
+ """Initialize the OCR model with the given parameters."""
130
+ raise NotImplementedError("Subclasses must implement this method")
131
+
132
+ @abstractmethod
133
+ def _preprocess_image(self, image: Image.Image) -> Any:
134
+ """Convert PIL Image to engine-specific format."""
135
+ raise NotImplementedError("Subclasses must implement this method")
136
+
137
+ @abstractmethod
138
+ def _process_single_image(self, image: Any, detect_only: bool, options: Optional[BaseOCROptions]) -> Any:
139
+ """Process a single image with the initialized model."""
140
+ raise NotImplementedError("Subclasses must implement this method")
141
+
142
+ @abstractmethod
143
+ def _standardize_results(self, raw_results: Any, min_confidence: float, detect_only: bool) -> List[TextRegion]:
144
+ """Convert engine-specific results to standardized TextRegion objects."""
43
145
  raise NotImplementedError("Subclasses must implement this method")
44
146
 
45
147
  @abstractmethod
@@ -63,48 +165,41 @@ class OCREngine(ABC):
63
165
  Returns:
64
166
  A string cache key.
65
167
  """
66
- # Basic key includes languages and device
67
- lang_key = "-".join(sorted(options.languages))
68
- device_key = str(options.device).lower()
168
+ lang_key = "-".join(sorted(getattr(options, "languages", self.DEFAULT_LANGUAGES)))
169
+ device_key = str(getattr(options, "device", self.DEFAULT_DEVICE)).lower()
69
170
  return f"{self.__class__.__name__}_{lang_key}_{device_key}"
70
171
 
71
- def _standardize_bbox(self, bbox: Any) -> Optional[Tuple[float, float, float, float]]:
72
- """
73
- Helper to standardize bounding boxes to (x0, y0, x1, y1) format.
74
-
75
- Args:
76
- bbox: The bounding box in the engine's native format.
77
- Expected formats:
78
- - List/Tuple of 4 numbers: (x0, y0, x1, y1)
79
- - List of points: [[x1,y1],[x2,y2],[x3,y3],[x4,y4]] (polygon)
80
-
81
- Returns:
82
- Tuple[float, float, float, float] or None if conversion fails.
83
- """
84
- try:
85
- if (
86
- isinstance(bbox, (list, tuple))
87
- and len(bbox) == 4
88
- and all(isinstance(n, (int, float)) for n in bbox)
89
- ):
90
- # Already in (x0, y0, x1, y1) format (or similar)
172
+ def _standardize_bbox(self, bbox: Any) -> Tuple[float, float, float, float]:
173
+ """Standardizes bounding boxes to (x0, y0, x1, y1) format. Raises ValueError if standardization fails."""
174
+ # Check if it's already in the correct tuple/list format
175
+ if (
176
+ isinstance(bbox, (list, tuple))
177
+ and len(bbox) == 4
178
+ and all(isinstance(n, (int, float)) for n in bbox)
179
+ ):
180
+ try:
91
181
  return tuple(float(c) for c in bbox[:4])
92
- elif (
93
- isinstance(bbox, (list, tuple))
94
- and len(bbox) > 0
95
- and isinstance(bbox[0], (list, tuple))
96
- ):
97
- # Polygon format [[x1,y1],[x2,y2],...]
182
+ except (ValueError, TypeError) as e:
183
+ raise ValueError(f"Invalid number format in bbox: {bbox}") from e
184
+
185
+ # Check if it's in polygon format [[x1,y1],[x2,y2],...]
186
+ elif (
187
+ isinstance(bbox, (list, tuple))
188
+ and len(bbox) > 0
189
+ and isinstance(bbox[0], (list, tuple))
190
+ and len(bbox[0]) == 2 # Ensure points are pairs
191
+ ):
192
+ try:
98
193
  x_coords = [float(point[0]) for point in bbox]
99
194
  y_coords = [float(point[1]) for point in bbox]
100
- x0 = min(x_coords)
101
- y0 = min(y_coords)
102
- x1 = max(x_coords)
103
- y1 = max(y_coords)
104
- return (x0, y0, x1, y1)
105
- except Exception as e:
106
- self.logger.warning(f"Could not standardize bounding box: {bbox}. Error: {e}")
107
- return None
195
+ if not x_coords or not y_coords: # Handle empty polygon case
196
+ raise ValueError("Empty polygon provided")
197
+ return (min(x_coords), min(y_coords), max(x_coords), max(y_coords))
198
+ except (ValueError, TypeError, IndexError) as e:
199
+ raise ValueError(f"Invalid polygon format or values: {bbox}") from e
200
+
201
+ # If it's neither format, raise an error
202
+ raise ValueError(f"Could not standardize bounding box from unexpected format: {bbox}")
108
203
 
109
204
  def __del__(self):
110
205
  """Cleanup resources when the engine is deleted."""
natural_pdf/ocr/engine_easyocr.py CHANGED
@@ -1,13 +1,12 @@
1
1
  # ocr_engine_easyocr.py
2
2
  import importlib.util
3
- import inspect # Used for dynamic parameter passing
4
3
  import logging
5
4
  from typing import Any, Dict, List, Optional, Tuple, Union
6
5
 
7
6
  import numpy as np
8
7
  from PIL import Image
9
8
 
10
- from .engine import OCREngine
9
+ from .engine import OCREngine, TextRegion
11
10
  from .ocr_options import BaseOCROptions, EasyOCROptions
12
11
 
13
12
  logger = logging.getLogger(__name__)
@@ -18,178 +17,159 @@ class EasyOCREngine(OCREngine):
18
17
 
19
18
  def __init__(self):
20
19
  super().__init__()
21
- self._easyocr = None # Lazy load easyocr module
22
-
23
- def _lazy_import_easyocr(self):
24
- """Imports easyocr only when needed."""
25
- if self._easyocr is None:
26
- if not self.is_available():
27
- raise ImportError("EasyOCR is not installed or available.")
28
- try:
29
- import easyocr
30
-
31
- self._easyocr = easyocr
32
- logger.info("EasyOCR module imported successfully.")
33
- except ImportError as e:
34
- logger.error(f"Failed to import EasyOCR: {e}")
35
- raise
36
- return self._easyocr
20
+ # No longer need _easyocr attribute
21
+ # self._easyocr = None
37
22
 
38
23
  def is_available(self) -> bool:
39
24
  """Check if EasyOCR is installed."""
40
25
  return importlib.util.find_spec("easyocr") is not None
41
26
 
42
- def _get_cache_key(self, options: EasyOCROptions) -> str:
43
- """Generate a more specific cache key for EasyOCR."""
44
- base_key = super()._get_cache_key(options)
45
- recog_key = options.recog_network
46
- detect_key = options.detect_network
47
- quantize_key = str(options.quantize)
48
- return f"{base_key}_{recog_key}_{detect_key}_{quantize_key}"
49
-
50
- def _get_reader(self, options: EasyOCROptions):
51
- """Get or initialize an EasyOCR reader based on options."""
52
- cache_key = self._get_cache_key(options)
53
- if cache_key in self._reader_cache:
54
- logger.debug(f"Using cached EasyOCR reader for key: {cache_key}")
55
- return self._reader_cache[cache_key]
56
-
57
- logger.info(f"Creating new EasyOCR reader for key: {cache_key}")
58
- easyocr = self._lazy_import_easyocr()
59
-
60
- constructor_sig = inspect.signature(easyocr.Reader.__init__)
61
- constructor_args = {}
62
- constructor_args["lang_list"] = options.languages
63
- constructor_args["gpu"] = (
64
- "cuda" in str(options.device).lower() or "mps" in str(options.device).lower()
65
- )
66
-
67
- for field_name, param in constructor_sig.parameters.items():
68
- if field_name in ["self", "lang_list", "gpu"]:
69
- continue
70
- if hasattr(options, field_name):
71
- constructor_args[field_name] = getattr(options, field_name)
72
- elif field_name in options.extra_args:
73
- constructor_args[field_name] = options.extra_args[field_name]
74
-
75
- logger.debug(f"EasyOCR Reader constructor args: {constructor_args}")
27
+ def _initialize_model(self, languages: List[str], device: str, options: Optional[BaseOCROptions]):
28
+ """Initialize the EasyOCR model."""
29
+ # Import directly here
30
+ try:
31
+ import easyocr
32
+ self.logger.info("EasyOCR module imported successfully.")
33
+ except ImportError as e:
34
+ self.logger.error(f"Failed to import EasyOCR: {e}")
35
+ raise
36
+
37
+ # Cast to EasyOCROptions if possible, otherwise use default
38
+ easy_options = options if isinstance(options, EasyOCROptions) else EasyOCROptions()
39
+
40
+ # Prepare constructor arguments
41
+ use_gpu = "cuda" in device.lower() or "mps" in device.lower()
42
+
43
+ constructor_args = {
44
+ "lang_list": languages,
45
+ "gpu": use_gpu,
46
+ # Explicitly map relevant options
47
+ "model_storage_directory": easy_options.model_storage_directory,
48
+ "user_network_directory": easy_options.user_network_directory,
49
+ "recog_network": easy_options.recog_network,
50
+ "detect_network": easy_options.detect_network,
51
+ "download_enabled": easy_options.download_enabled,
52
+ "detector": easy_options.detector,
53
+ "recognizer": easy_options.recognizer,
54
+ "verbose": easy_options.verbose,
55
+ "quantize": easy_options.quantize,
56
+ "cudnn_benchmark": easy_options.cudnn_benchmark,
57
+ }
58
+
59
+ # Filter out None values, as EasyOCR expects non-None or default behaviour
60
+ constructor_args = {k: v for k, v in constructor_args.items() if v is not None}
61
+
62
+ self.logger.debug(f"EasyOCR Reader constructor args: {constructor_args}")
63
+
64
+ # Create the reader
76
65
  try:
77
- reader = easyocr.Reader(**constructor_args)
78
- self._reader_cache[cache_key] = reader
79
- logger.info("EasyOCR reader created successfully.")
80
- return reader
66
+ self._model = easyocr.Reader(**constructor_args)
67
+ self.logger.info("EasyOCR reader created successfully")
81
68
  except Exception as e:
82
- logger.error(f"Failed to create EasyOCR reader: {e}", exc_info=True)
69
+ self.logger.error(f"Failed to create EasyOCR reader: {e}")
83
70
  raise
84
71
 
85
- def _prepare_readtext_args(self, options: EasyOCROptions, reader) -> Dict[str, Any]:
86
- """Helper to prepare arguments for the readtext method."""
87
- readtext_sig = inspect.signature(reader.readtext)
72
+ def _preprocess_image(self, image: Image.Image) -> np.ndarray:
73
+ """Convert PIL Image to numpy array for EasyOCR."""
74
+ return np.array(image)
75
+
76
+ def _process_single_image(self, image: np.ndarray, detect_only: bool, options: Optional[EasyOCROptions]) -> Any:
77
+ """Process a single image with EasyOCR."""
78
+ if self._model is None:
79
+ raise RuntimeError("EasyOCR model not initialized")
80
+
81
+ # Cast options to proper type if provided
82
+ easy_options = options if isinstance(options, EasyOCROptions) else EasyOCROptions()
83
+
84
+ # Prepare readtext arguments (only needed if not detect_only)
88
85
  readtext_args = {}
89
- for field_name, param in readtext_sig.parameters.items():
90
- if field_name == "image":
91
- continue
92
- if hasattr(options, field_name):
93
- readtext_args[field_name] = getattr(options, field_name)
94
- elif field_name in options.extra_args:
95
- readtext_args[field_name] = options.extra_args[field_name]
96
- logger.debug(f"EasyOCR readtext args: {readtext_args}")
97
- return readtext_args
98
-
99
- def _standardize_results(
100
- self, raw_results: List[Any], options: EasyOCROptions
101
- ) -> List[Dict[str, Any]]:
102
- """Standardizes raw results from EasyOCR's readtext."""
103
- standardized_results = []
104
- min_confidence = options.min_confidence
105
-
86
+ if not detect_only:
87
+ for param in [
88
+ "detail", "paragraph", "min_size", "contrast_ths", "adjust_contrast",
89
+ "filter_ths", "text_threshold", "low_text", "link_threshold",
90
+ "canvas_size", "mag_ratio", "slope_ths", "ycenter_ths", "height_ths",
91
+ "width_ths", "y_ths", "x_ths", "add_margin", "output_format"
92
+ ]:
93
+ if hasattr(easy_options, param):
94
+ val = getattr(easy_options, param)
95
+ if val is not None:
96
+ readtext_args[param] = val
97
+
98
+ # Process differently based on detect_only flag
99
+ if detect_only:
100
+ # Returns tuple (horizontal_list, free_list)
101
+ # horizontal_list is a list containing one item: the list of boxes
102
+ # Each box is [[x1, y1], [x2, y1], [x2, y2], [x1, y2]]
103
+ bboxes_tuple = self._model.detect(image, **readtext_args) # Pass args here too? Check EasyOCR docs if needed.
104
+ if bboxes_tuple and isinstance(bboxes_tuple, tuple) and len(bboxes_tuple) > 0 and isinstance(bboxes_tuple[0], list):
105
+ return bboxes_tuple[0] # Return the list of polygons directly
106
+ else:
107
+ self.logger.warning(f"EasyOCR detect returned unexpected format: {bboxes_tuple}")
108
+ return [] # Return empty list on unexpected format
109
+ else:
110
+ return self._model.readtext(image, **readtext_args)
111
+
112
+ def _standardize_results(self, raw_results: Any, min_confidence: float, detect_only: bool) -> List[TextRegion]:
113
+ """Convert EasyOCR results to standardized TextRegion objects."""
114
+ standardized_regions = []
115
+
116
+ if detect_only:
117
+ # In detect_only mode, raw_results is already a list of bounding boxes
118
+ # Each bbox is in [x_min, x_max, y_min, y_max] format
119
+ if isinstance(raw_results, list):
120
+ for detection in raw_results:
121
+ try:
122
+ if isinstance(detection, (list, tuple)) and len(detection) == 4:
123
+ x_min, x_max, y_min, y_max = detection
124
+ # Convert to standardized (x0, y0, x1, y1) format
125
+ try:
126
+ bbox = (float(x_min), float(y_min), float(x_max), float(y_max))
127
+ standardized_regions.append(TextRegion(bbox, text=None, confidence=None))
128
+ except (ValueError, TypeError) as e:
129
+ raise ValueError(f"Invalid number format in EasyOCR detect bbox: {detection}") from e
130
+ else:
131
+ raise ValueError(f"Invalid detection format from EasyOCR: {detection}")
132
+ except ValueError as e:
133
+ # Re-raise any value errors from standardization or format checks
134
+ raise e
135
+ except Exception as e:
136
+ # Catch other potential processing errors
137
+ raise ValueError(f"Error processing EasyOCR detection item: {detection}") from e
138
+ else:
139
+ raise ValueError(f"Expected list of bounding boxes in detect_only mode, got: {raw_results}")
140
+
141
+ return standardized_regions
142
+
143
+ # Full OCR mode (readtext results)
106
144
  for detection in raw_results:
107
145
  try:
108
- if (
109
- options.detail == 1
110
- and isinstance(detection, (list, tuple))
111
- and len(detection) >= 3
112
- ):
113
- bbox_raw = detection[0]
146
+ # Detail mode (list/tuple result)
147
+ if isinstance(detection, (list, tuple)) and len(detection) >= 3:
148
+ bbox_raw = detection[0] # This is usually a polygon [[x1,y1],...]
114
149
  text = str(detection[1])
115
150
  confidence = float(detection[2])
116
-
151
+
117
152
  if confidence >= min_confidence:
118
- bbox = self._standardize_bbox(bbox_raw)
119
- if bbox:
120
- standardized_results.append(
121
- {
122
- "bbox": bbox,
123
- "text": text,
124
- "confidence": confidence,
125
- "source": "ocr",
126
- }
127
- )
128
- else:
129
- logger.warning(f"Skipping result due to invalid bbox: {bbox_raw}")
130
-
131
- elif options.detail == 0 and isinstance(detection, str):
132
- standardized_results.append(
133
- {"bbox": None, "text": detection, "confidence": 1.0, "source": "ocr"}
134
- )
135
- except (IndexError, ValueError, TypeError) as e:
136
- logger.warning(f"Skipping invalid detection format: {detection}. Error: {e}")
137
- continue
138
- return standardized_results
139
-
140
- def process_image(
141
- self, images: Union[Image.Image, List[Image.Image]], options: BaseOCROptions
142
- ) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]:
143
- """Processes a single image or a batch of images with EasyOCR."""
144
-
145
- if not isinstance(options, EasyOCROptions):
146
- logger.warning("Received BaseOCROptions, expected EasyOCROptions. Using defaults.")
147
- # Create default EasyOCR options if base was passed, preserving base settings
148
- options = EasyOCROptions(
149
- languages=options.languages,
150
- min_confidence=options.min_confidence,
151
- device=options.device,
152
- extra_args=options.extra_args, # Pass along any extra args
153
- )
154
-
155
- reader = self._get_reader(options)
156
- readtext_args = self._prepare_readtext_args(options, reader)
157
-
158
- # --- Handle single image or batch ---
159
- if isinstance(images, list):
160
- # --- Batch Processing (Iterative for EasyOCR) ---
161
- all_results = []
162
- logger.info(f"Processing batch of {len(images)} images with EasyOCR (iteratively)...")
163
- for i, img in enumerate(images):
164
- if not isinstance(img, Image.Image):
165
- logger.warning(f"Item at index {i} in batch is not a PIL Image. Skipping.")
166
- all_results.append([])
167
- continue
168
- img_array = np.array(img)
169
- try:
170
- logger.debug(f"Processing image {i+1}/{len(images)} in batch.")
171
- raw_results = reader.readtext(img_array, **readtext_args)
172
- standardized = self._standardize_results(raw_results, options)
173
- all_results.append(standardized)
174
- except Exception as e:
175
- logger.error(
176
- f"Error processing image {i+1} in EasyOCR batch: {e}", exc_info=True
177
- )
178
- all_results.append([]) # Append empty list for failed image
179
- logger.info(f"Finished processing batch with EasyOCR.")
180
- return all_results # Return List[List[Dict]]
181
-
182
- elif isinstance(images, Image.Image):
183
- # --- Single Image Processing ---
184
- logger.info("Processing single image with EasyOCR...")
185
- img_array = np.array(images)
186
- try:
187
- raw_results = reader.readtext(img_array, **readtext_args)
188
- standardized = self._standardize_results(raw_results, options)
189
- logger.info(f"Finished processing single image. Found {len(standardized)} results.")
190
- return standardized # Return List[Dict]
153
+ try:
154
+ # Use the standard helper for polygons
155
+ bbox = self._standardize_bbox(bbox_raw)
156
+ standardized_regions.append(TextRegion(bbox, text, confidence))
157
+ except ValueError as e:
158
+ raise ValueError(f"Could not standardize bounding box from EasyOCR readtext: {bbox_raw}") from e
159
+
160
+ # Simple mode (string result)
161
+ elif isinstance(detection, str):
162
+ if 0.0 >= min_confidence: # Always include if min_confidence is 0
163
+ standardized_regions.append(TextRegion((0, 0, 0, 0), detection, 1.0))
164
+ else:
165
+ # Handle unexpected format in OCR mode
166
+ raise ValueError(f"Invalid OCR detection format from EasyOCR readtext: {detection}")
167
+
168
+ except ValueError as e:
169
+ # Re-raise any value errors from standardization or format checks
170
+ raise e
191
171
  except Exception as e:
192
- logger.error(f"Error processing single image with EasyOCR: {e}", exc_info=True)
193
- return [] # Return empty list on failure
194
- else:
195
- raise TypeError("Input 'images' must be a PIL Image or a list of PIL Images.")
172
+ # Catch other potential processing errors
173
+ raise ValueError(f"Error processing EasyOCR detection item: {detection}") from e
174
+
175
+ return standardized_regions