natural-pdf 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. docs/finetuning/index.md +176 -0
  2. docs/tutorials/01-loading-and-extraction.ipynb +34 -1550
  3. natural_pdf/__init__.py +1 -0
  4. natural_pdf/analyzers/layout/gemini.py +63 -47
  5. natural_pdf/collections/pdf_collection.py +5 -2
  6. natural_pdf/core/element_manager.py +6 -4
  7. natural_pdf/core/page.py +36 -27
  8. natural_pdf/core/pdf.py +25 -16
  9. natural_pdf/elements/base.py +1 -3
  10. natural_pdf/elements/collections.py +13 -14
  11. natural_pdf/elements/region.py +7 -6
  12. natural_pdf/exporters/__init__.py +4 -0
  13. natural_pdf/exporters/base.py +61 -0
  14. natural_pdf/exporters/paddleocr.py +345 -0
  15. natural_pdf/ocr/__init__.py +16 -8
  16. natural_pdf/ocr/engine.py +46 -30
  17. natural_pdf/ocr/engine_easyocr.py +81 -40
  18. natural_pdf/ocr/engine_paddle.py +39 -28
  19. natural_pdf/ocr/engine_surya.py +32 -16
  20. natural_pdf/ocr/ocr_factory.py +34 -23
  21. natural_pdf/ocr/ocr_manager.py +15 -11
  22. natural_pdf/ocr/ocr_options.py +5 -0
  23. natural_pdf/ocr/utils.py +46 -31
  24. natural_pdf/templates/finetune/fine_tune_paddleocr.md +415 -0
  25. natural_pdf/utils/debug.py +4 -2
  26. natural_pdf/utils/identifiers.py +9 -5
  27. natural_pdf/utils/packaging.py +172 -105
  28. natural_pdf/utils/text_extraction.py +44 -64
  29. natural_pdf/utils/visualization.py +1 -1
  30. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/METADATA +5 -3
  31. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/RECORD +34 -30
  32. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/WHEEL +0 -0
  33. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/licenses/LICENSE +0 -0
  34. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/top_level.txt +0 -0
natural_pdf/ocr/engine.py CHANGED
@@ -13,11 +13,17 @@ logger = logging.getLogger(__name__)
13
13
 
14
14
  class TextRegion:
15
15
  """Standard representation of an OCR text region."""
16
-
17
- def __init__(self, bbox: Tuple[float, float, float, float], text: str, confidence: float, source: str = "ocr"):
16
+
17
+ def __init__(
18
+ self,
19
+ bbox: Tuple[float, float, float, float],
20
+ text: str,
21
+ confidence: float,
22
+ source: str = "ocr",
23
+ ):
18
24
  """
19
25
  Initialize a text region.
20
-
26
+
21
27
  Args:
22
28
  bbox: Tuple of (x0, y0, x1, y1) coordinates
23
29
  text: The recognized text
@@ -28,7 +34,7 @@ class TextRegion:
28
34
  self.text = text
29
35
  self.confidence = confidence
30
36
  self.source = source
31
-
37
+
32
38
  @classmethod
33
39
  def from_polygon(cls, polygon: List[List[float]], text: str, confidence: float):
34
40
  """Create from polygon coordinates [[x1,y1], [x2,y2], ...]"""
@@ -36,24 +42,24 @@ class TextRegion:
36
42
  y_coords = [float(point[1]) for point in polygon]
37
43
  bbox = (min(x_coords), min(y_coords), max(x_coords), max(y_coords))
38
44
  return cls(bbox, text, confidence)
39
-
45
+
40
46
  def to_dict(self) -> Dict[str, Any]:
41
47
  """Convert to dictionary representation for compatibility."""
42
48
  return {
43
49
  "bbox": self.bbox,
44
50
  "text": self.text,
45
51
  "confidence": self.confidence,
46
- "source": self.source
52
+ "source": self.source,
47
53
  }
48
54
 
49
55
 
50
56
  class OCREngine(ABC):
51
57
  """Abstract Base Class for OCR engines."""
52
-
58
+
53
59
  # Default values as class constants
54
60
  DEFAULT_MIN_CONFIDENCE = 0.2
55
- DEFAULT_LANGUAGES = ['en']
56
- DEFAULT_DEVICE = 'cpu'
61
+ DEFAULT_LANGUAGES = ["en"]
62
+ DEFAULT_DEVICE = "cpu"
57
63
 
58
64
  def __init__(self):
59
65
  """Initializes the base OCR engine."""
@@ -74,7 +80,7 @@ class OCREngine(ABC):
74
80
  ) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]:
75
81
  """
76
82
  Process a single image or batch of images with OCR.
77
-
83
+
78
84
  Args:
79
85
  images: A single PIL Image or a list of PIL Images
80
86
  languages: List of languages to use (default: ['en'])
@@ -82,7 +88,7 @@ class OCREngine(ABC):
82
88
  device: Device to use for processing (default: 'cpu')
83
89
  detect_only: Whether to only detect text regions without recognition
84
90
  options: Engine-specific options
85
-
91
+
86
92
  Returns:
87
93
  For a single image: List of text region dictionaries
88
94
  For a batch: List of lists of text region dictionaries
@@ -90,42 +96,48 @@ class OCREngine(ABC):
90
96
  # Convert single image to batch format
91
97
  single_image = not isinstance(images, list)
92
98
  image_batch = [images] if single_image else images
93
-
99
+
94
100
  # Use default values where parameters are not provided
95
101
  effective_languages = languages or self.DEFAULT_LANGUAGES
96
- effective_confidence = min_confidence if min_confidence is not None else self.DEFAULT_MIN_CONFIDENCE
102
+ effective_confidence = (
103
+ min_confidence if min_confidence is not None else self.DEFAULT_MIN_CONFIDENCE
104
+ )
97
105
  effective_device = device or self.DEFAULT_DEVICE
98
-
106
+
99
107
  # Ensure the model is initialized
100
108
  self._ensure_initialized(effective_languages, effective_device, options)
101
-
109
+
102
110
  # Process each image in the batch
103
111
  results = []
104
112
  for img in image_batch:
105
113
  # Preprocess the image for the specific engine
106
114
  processed_img = self._preprocess_image(img)
107
-
115
+
108
116
  # Process the image with the engine-specific implementation
109
117
  raw_results = self._process_single_image(processed_img, detect_only, options)
110
-
118
+
111
119
  # Convert results to standardized format
112
120
  text_regions = self._standardize_results(raw_results, effective_confidence, detect_only)
113
-
121
+
114
122
  # Convert TextRegion objects to dictionaries for backward compatibility
115
123
  region_dicts = [region.to_dict() for region in text_regions]
116
124
  results.append(region_dicts)
117
-
125
+
118
126
  # Return results in the appropriate format
119
127
  return results[0] if single_image else results
120
128
 
121
- def _ensure_initialized(self, languages: List[str], device: str, options: Optional[BaseOCROptions]):
129
+ def _ensure_initialized(
130
+ self, languages: List[str], device: str, options: Optional[BaseOCROptions]
131
+ ):
122
132
  """Ensure the model is initialized with the correct parameters."""
123
133
  if not self._initialized:
124
134
  self._initialize_model(languages, device, options)
125
135
  self._initialized = True
126
-
136
+
127
137
  @abstractmethod
128
- def _initialize_model(self, languages: List[str], device: str, options: Optional[BaseOCROptions]):
138
+ def _initialize_model(
139
+ self, languages: List[str], device: str, options: Optional[BaseOCROptions]
140
+ ):
129
141
  """Initialize the OCR model with the given parameters."""
130
142
  raise NotImplementedError("Subclasses must implement this method")
131
143
 
@@ -133,14 +145,18 @@ class OCREngine(ABC):
133
145
  def _preprocess_image(self, image: Image.Image) -> Any:
134
146
  """Convert PIL Image to engine-specific format."""
135
147
  raise NotImplementedError("Subclasses must implement this method")
136
-
148
+
137
149
  @abstractmethod
138
- def _process_single_image(self, image: Any, detect_only: bool, options: Optional[BaseOCROptions]) -> Any:
150
+ def _process_single_image(
151
+ self, image: Any, detect_only: bool, options: Optional[BaseOCROptions]
152
+ ) -> Any:
139
153
  """Process a single image with the initialized model."""
140
154
  raise NotImplementedError("Subclasses must implement this method")
141
-
155
+
142
156
  @abstractmethod
143
- def _standardize_results(self, raw_results: Any, min_confidence: float, detect_only: bool) -> List[TextRegion]:
157
+ def _standardize_results(
158
+ self, raw_results: Any, min_confidence: float, detect_only: bool
159
+ ) -> List[TextRegion]:
144
160
  """Convert engine-specific results to standardized TextRegion objects."""
145
161
  raise NotImplementedError("Subclasses must implement this method")
146
162
 
@@ -181,23 +197,23 @@ class OCREngine(ABC):
181
197
  return tuple(float(c) for c in bbox[:4])
182
198
  except (ValueError, TypeError) as e:
183
199
  raise ValueError(f"Invalid number format in bbox: {bbox}") from e
184
-
200
+
185
201
  # Check if it's in polygon format [[x1,y1],[x2,y2],...]
186
202
  elif (
187
203
  isinstance(bbox, (list, tuple))
188
204
  and len(bbox) > 0
189
205
  and isinstance(bbox[0], (list, tuple))
190
- and len(bbox[0]) == 2 # Ensure points are pairs
206
+ and len(bbox[0]) == 2 # Ensure points are pairs
191
207
  ):
192
208
  try:
193
209
  x_coords = [float(point[0]) for point in bbox]
194
210
  y_coords = [float(point[1]) for point in bbox]
195
- if not x_coords or not y_coords: # Handle empty polygon case
211
+ if not x_coords or not y_coords: # Handle empty polygon case
196
212
  raise ValueError("Empty polygon provided")
197
213
  return (min(x_coords), min(y_coords), max(x_coords), max(y_coords))
198
214
  except (ValueError, TypeError, IndexError) as e:
199
215
  raise ValueError(f"Invalid polygon format or values: {bbox}") from e
200
-
216
+
201
217
  # If it's neither format, raise an error
202
218
  raise ValueError(f"Could not standardize bounding box from unexpected format: {bbox}")
203
219
 
natural_pdf/ocr/engine_easyocr.py CHANGED
@@ -18,28 +18,31 @@ class EasyOCREngine(OCREngine):
18
18
  def __init__(self):
19
19
  super().__init__()
20
20
  # No longer need _easyocr attribute
21
- # self._easyocr = None
21
+ # self._easyocr = None
22
22
 
23
23
  def is_available(self) -> bool:
24
24
  """Check if EasyOCR is installed."""
25
25
  return importlib.util.find_spec("easyocr") is not None
26
26
 
27
- def _initialize_model(self, languages: List[str], device: str, options: Optional[BaseOCROptions]):
27
+ def _initialize_model(
28
+ self, languages: List[str], device: str, options: Optional[BaseOCROptions]
29
+ ):
28
30
  """Initialize the EasyOCR model."""
29
31
  # Import directly here
30
32
  try:
31
33
  import easyocr
34
+
32
35
  self.logger.info("EasyOCR module imported successfully.")
33
36
  except ImportError as e:
34
37
  self.logger.error(f"Failed to import EasyOCR: {e}")
35
38
  raise
36
-
39
+
37
40
  # Cast to EasyOCROptions if possible, otherwise use default
38
41
  easy_options = options if isinstance(options, EasyOCROptions) else EasyOCROptions()
39
-
42
+
40
43
  # Prepare constructor arguments
41
44
  use_gpu = "cuda" in device.lower() or "mps" in device.lower()
42
-
45
+
43
46
  constructor_args = {
44
47
  "lang_list": languages,
45
48
  "gpu": use_gpu,
@@ -55,12 +58,12 @@ class EasyOCREngine(OCREngine):
55
58
  "quantize": easy_options.quantize,
56
59
  "cudnn_benchmark": easy_options.cudnn_benchmark,
57
60
  }
58
-
61
+
59
62
  # Filter out None values, as EasyOCR expects non-None or default behaviour
60
63
  constructor_args = {k: v for k, v in constructor_args.items() if v is not None}
61
-
64
+
62
65
  self.logger.debug(f"EasyOCR Reader constructor args: {constructor_args}")
63
-
66
+
64
67
  # Create the reader
65
68
  try:
66
69
  self._model = easyocr.Reader(**constructor_args)
@@ -73,46 +76,72 @@ class EasyOCREngine(OCREngine):
73
76
  """Convert PIL Image to numpy array for EasyOCR."""
74
77
  return np.array(image)
75
78
 
76
- def _process_single_image(self, image: np.ndarray, detect_only: bool, options: Optional[EasyOCROptions]) -> Any:
79
+ def _process_single_image(
80
+ self, image: np.ndarray, detect_only: bool, options: Optional[EasyOCROptions]
81
+ ) -> Any:
77
82
  """Process a single image with EasyOCR."""
78
83
  if self._model is None:
79
84
  raise RuntimeError("EasyOCR model not initialized")
80
-
85
+
81
86
  # Cast options to proper type if provided
82
87
  easy_options = options if isinstance(options, EasyOCROptions) else EasyOCROptions()
83
-
88
+
84
89
  # Prepare readtext arguments (only needed if not detect_only)
85
90
  readtext_args = {}
86
91
  if not detect_only:
87
92
  for param in [
88
- "detail", "paragraph", "min_size", "contrast_ths", "adjust_contrast",
89
- "filter_ths", "text_threshold", "low_text", "link_threshold",
90
- "canvas_size", "mag_ratio", "slope_ths", "ycenter_ths", "height_ths",
91
- "width_ths", "y_ths", "x_ths", "add_margin", "output_format"
93
+ "detail",
94
+ "paragraph",
95
+ "min_size",
96
+ "contrast_ths",
97
+ "adjust_contrast",
98
+ "filter_ths",
99
+ "text_threshold",
100
+ "low_text",
101
+ "link_threshold",
102
+ "canvas_size",
103
+ "mag_ratio",
104
+ "slope_ths",
105
+ "ycenter_ths",
106
+ "height_ths",
107
+ "width_ths",
108
+ "y_ths",
109
+ "x_ths",
110
+ "add_margin",
111
+ "output_format",
92
112
  ]:
93
113
  if hasattr(easy_options, param):
94
114
  val = getattr(easy_options, param)
95
115
  if val is not None:
96
116
  readtext_args[param] = val
97
-
117
+
98
118
  # Process differently based on detect_only flag
99
119
  if detect_only:
100
120
  # Returns tuple (horizontal_list, free_list)
101
121
  # horizontal_list is a list containing one item: the list of boxes
102
122
  # Each box is [[x1, y1], [x2, y1], [x2, y2], [x1, y2]]
103
- bboxes_tuple = self._model.detect(image, **readtext_args) # Pass args here too? Check EasyOCR docs if needed.
104
- if bboxes_tuple and isinstance(bboxes_tuple, tuple) and len(bboxes_tuple) > 0 and isinstance(bboxes_tuple[0], list):
105
- return bboxes_tuple[0] # Return the list of polygons directly
123
+ bboxes_tuple = self._model.detect(
124
+ image, **readtext_args
125
+ ) # Pass args here too? Check EasyOCR docs if needed.
126
+ if (
127
+ bboxes_tuple
128
+ and isinstance(bboxes_tuple, tuple)
129
+ and len(bboxes_tuple) > 0
130
+ and isinstance(bboxes_tuple[0], list)
131
+ ):
132
+ return bboxes_tuple[0] # Return the list of polygons directly
106
133
  else:
107
134
  self.logger.warning(f"EasyOCR detect returned unexpected format: {bboxes_tuple}")
108
- return [] # Return empty list on unexpected format
135
+ return [] # Return empty list on unexpected format
109
136
  else:
110
137
  return self._model.readtext(image, **readtext_args)
111
138
 
112
- def _standardize_results(self, raw_results: Any, min_confidence: float, detect_only: bool) -> List[TextRegion]:
139
+ def _standardize_results(
140
+ self, raw_results: Any, min_confidence: float, detect_only: bool
141
+ ) -> List[TextRegion]:
113
142
  """Convert EasyOCR results to standardized TextRegion objects."""
114
143
  standardized_regions = []
115
-
144
+
116
145
  if detect_only:
117
146
  # In detect_only mode, raw_results is already a list of bounding boxes
118
147
  # Each bbox is in [x_min, x_max, y_min, y_max] format
@@ -120,13 +149,17 @@ class EasyOCREngine(OCREngine):
120
149
  for detection in raw_results:
121
150
  try:
122
151
  if isinstance(detection, (list, tuple)) and len(detection) == 4:
123
- x_min, x_max, y_min, y_max = detection
124
- # Convert to standardized (x0, y0, x1, y1) format
125
- try:
126
- bbox = (float(x_min), float(y_min), float(x_max), float(y_max))
127
- standardized_regions.append(TextRegion(bbox, text=None, confidence=None))
128
- except (ValueError, TypeError) as e:
129
- raise ValueError(f"Invalid number format in EasyOCR detect bbox: {detection}") from e
152
+ x_min, x_max, y_min, y_max = detection
153
+ # Convert to standardized (x0, y0, x1, y1) format
154
+ try:
155
+ bbox = (float(x_min), float(y_min), float(x_max), float(y_max))
156
+ standardized_regions.append(
157
+ TextRegion(bbox, text=None, confidence=None)
158
+ )
159
+ except (ValueError, TypeError) as e:
160
+ raise ValueError(
161
+ f"Invalid number format in EasyOCR detect bbox: {detection}"
162
+ ) from e
130
163
  else:
131
164
  raise ValueError(f"Invalid detection format from EasyOCR: {detection}")
132
165
  except ValueError as e:
@@ -134,42 +167,50 @@ class EasyOCREngine(OCREngine):
134
167
  raise e
135
168
  except Exception as e:
136
169
  # Catch other potential processing errors
137
- raise ValueError(f"Error processing EasyOCR detection item: {detection}") from e
170
+ raise ValueError(
171
+ f"Error processing EasyOCR detection item: {detection}"
172
+ ) from e
138
173
  else:
139
- raise ValueError(f"Expected list of bounding boxes in detect_only mode, got: {raw_results}")
140
-
174
+ raise ValueError(
175
+ f"Expected list of bounding boxes in detect_only mode, got: {raw_results}"
176
+ )
177
+
141
178
  return standardized_regions
142
-
179
+
143
180
  # Full OCR mode (readtext results)
144
181
  for detection in raw_results:
145
182
  try:
146
183
  # Detail mode (list/tuple result)
147
184
  if isinstance(detection, (list, tuple)) and len(detection) >= 3:
148
- bbox_raw = detection[0] # This is usually a polygon [[x1,y1],...]
185
+ bbox_raw = detection[0] # This is usually a polygon [[x1,y1],...]
149
186
  text = str(detection[1])
150
187
  confidence = float(detection[2])
151
-
188
+
152
189
  if confidence >= min_confidence:
153
190
  try:
154
191
  # Use the standard helper for polygons
155
192
  bbox = self._standardize_bbox(bbox_raw)
156
193
  standardized_regions.append(TextRegion(bbox, text, confidence))
157
194
  except ValueError as e:
158
- raise ValueError(f"Could not standardize bounding box from EasyOCR readtext: {bbox_raw}") from e
159
-
195
+ raise ValueError(
196
+ f"Could not standardize bounding box from EasyOCR readtext: {bbox_raw}"
197
+ ) from e
198
+
160
199
  # Simple mode (string result)
161
200
  elif isinstance(detection, str):
162
201
  if 0.0 >= min_confidence: # Always include if min_confidence is 0
163
202
  standardized_regions.append(TextRegion((0, 0, 0, 0), detection, 1.0))
164
203
  else:
165
204
  # Handle unexpected format in OCR mode
166
- raise ValueError(f"Invalid OCR detection format from EasyOCR readtext: {detection}")
167
-
205
+ raise ValueError(
206
+ f"Invalid OCR detection format from EasyOCR readtext: {detection}"
207
+ )
208
+
168
209
  except ValueError as e:
169
210
  # Re-raise any value errors from standardization or format checks
170
211
  raise e
171
212
  except Exception as e:
172
213
  # Catch other potential processing errors
173
214
  raise ValueError(f"Error processing EasyOCR detection item: {detection}") from e
174
-
215
+
175
216
  return standardized_regions
natural_pdf/ocr/engine_paddle.py CHANGED
@@ -27,40 +27,43 @@ class PaddleOCREngine(OCREngine):
27
27
  paddleocr_installed = importlib.util.find_spec("paddleocr") is not None
28
28
  return paddle_installed and paddleocr_installed
29
29
 
30
- def _initialize_model(self, languages: List[str], device: str, options: Optional[BaseOCROptions]):
30
+ def _initialize_model(
31
+ self, languages: List[str], device: str, options: Optional[BaseOCROptions]
32
+ ):
31
33
  """Initialize the PaddleOCR model."""
32
34
  try:
33
- import paddleocr
35
+ import paddleocr
36
+
34
37
  self.logger.info("PaddleOCR module imported successfully.")
35
38
  except ImportError as e:
36
- self.logger.error(f"Failed to import PaddleOCR/PaddlePaddle: {e}")
37
- raise
38
-
39
+ self.logger.error(f"Failed to import PaddleOCR/PaddlePaddle: {e}")
40
+ raise
41
+
39
42
  # Cast to PaddleOCROptions if possible
40
43
  paddle_options = options if isinstance(options, PaddleOCROptions) else PaddleOCROptions()
41
-
44
+
42
45
  # Determine parameters
43
46
  primary_lang = languages[0] if languages else "en"
44
47
  use_gpu = "cuda" in str(device).lower()
45
-
48
+
46
49
  # Create constructor arguments
47
50
  constructor_args = {
48
51
  "lang": primary_lang,
49
52
  "use_gpu": use_gpu,
50
53
  "use_angle_cls": paddle_options.use_angle_cls,
51
- "det": True,
52
- "rec": True # We'll control recognition at process time
54
+ "det": True,
55
+ "rec": True, # We'll control recognition at process time
53
56
  }
54
-
57
+
55
58
  # Add optional parameters if available
56
59
  for param in ["det_model_dir", "rec_model_dir", "cls_model_dir", "show_log", "use_onnx"]:
57
60
  if hasattr(paddle_options, param):
58
61
  val = getattr(paddle_options, param)
59
62
  if val is not None:
60
63
  constructor_args[param] = val
61
-
64
+
62
65
  self.logger.debug(f"PaddleOCR constructor args: {constructor_args}")
63
-
66
+
64
67
  # Create the model
65
68
  try:
66
69
  self._model = paddleocr.PaddleOCR(**constructor_args)
@@ -78,31 +81,35 @@ class PaddleOCREngine(OCREngine):
78
81
  img_array_bgr = img_array_rgb[:, :, ::-1] # Convert RGB to BGR
79
82
  return img_array_bgr
80
83
 
81
- def _process_single_image(self, image: np.ndarray, detect_only: bool, options: Optional[PaddleOCROptions]) -> Any:
84
+ def _process_single_image(
85
+ self, image: np.ndarray, detect_only: bool, options: Optional[PaddleOCROptions]
86
+ ) -> Any:
82
87
  """Process a single image with PaddleOCR."""
83
88
  if self._model is None:
84
89
  raise RuntimeError("PaddleOCR model not initialized")
85
-
90
+
86
91
  # Prepare OCR arguments
87
92
  ocr_args = {}
88
93
  if options and isinstance(options, PaddleOCROptions):
89
94
  ocr_args["cls"] = options.cls if options.cls is not None else options.use_angle_cls
90
95
  ocr_args["det"] = options.det
91
96
  ocr_args["rec"] = not detect_only # Control recognition based on detect_only flag
92
-
97
+
93
98
  # Run OCR
94
99
  raw_results = self._model.ocr(image, **ocr_args)
95
100
  return raw_results
96
101
 
97
- def _standardize_results(self, raw_results: Any, min_confidence: float, detect_only: bool) -> List[TextRegion]:
102
+ def _standardize_results(
103
+ self, raw_results: Any, min_confidence: float, detect_only: bool
104
+ ) -> List[TextRegion]:
98
105
  """Convert PaddleOCR results to standardized TextRegion objects."""
99
106
  standardized_regions = []
100
-
107
+
101
108
  if not raw_results or not isinstance(raw_results, list) or len(raw_results) == 0:
102
109
  return standardized_regions
103
-
110
+
104
111
  page_results = raw_results[0] if raw_results[0] is not None else []
105
-
112
+
106
113
  for detection in page_results:
107
114
  # Initialize text and confidence
108
115
  text = None
@@ -111,20 +118,22 @@ class PaddleOCREngine(OCREngine):
111
118
 
112
119
  # Paddle always seems to return the tuple structure [bbox, (text, conf)]
113
120
  # even if rec=False. We need to parse this structure regardless.
114
- if len(detection) == 4: # Handle potential alternative format?
115
- detection = [detection, ('', 1.0)] # Treat as bbox + dummy text/conf
121
+ if len(detection) == 4: # Handle potential alternative format?
122
+ detection = [detection, ("", 1.0)] # Treat as bbox + dummy text/conf
116
123
 
117
124
  if not isinstance(detection, (list, tuple)) or len(detection) < 2:
118
125
  raise ValueError(f"Invalid detection format from PaddleOCR: {detection}")
119
-
126
+
120
127
  bbox_raw = detection[0]
121
128
  text_confidence = detection[1]
122
-
129
+
123
130
  if not isinstance(text_confidence, tuple) or len(text_confidence) < 2:
124
- # Even if detect_only, we expect the (text, conf) structure,
131
+ # Even if detect_only, we expect the (text, conf) structure,
125
132
  # it might just contain dummy values.
126
- raise ValueError(f"Invalid text/confidence structure from PaddleOCR: {text_confidence}")
127
-
133
+ raise ValueError(
134
+ f"Invalid text/confidence structure from PaddleOCR: {text_confidence}"
135
+ )
136
+
128
137
  # Extract text/conf only if not detect_only
129
138
  if not detect_only:
130
139
  text = str(text_confidence[0])
@@ -134,7 +143,9 @@ class PaddleOCREngine(OCREngine):
134
143
  try:
135
144
  bbox = self._standardize_bbox(bbox_raw)
136
145
  except ValueError as e:
137
- raise ValueError(f"Could not standardize bounding box from PaddleOCR: {bbox_raw}") from e
146
+ raise ValueError(
147
+ f"Could not standardize bounding box from PaddleOCR: {bbox_raw}"
148
+ ) from e
138
149
 
139
150
  # Append based on mode
140
151
  if detect_only:
@@ -143,5 +154,5 @@ class PaddleOCREngine(OCREngine):
143
154
  elif confidence >= min_confidence:
144
155
  # Only append if confidence meets threshold in full OCR mode
145
156
  standardized_regions.append(TextRegion(bbox, text, confidence))
146
-
157
+
147
158
  return standardized_regions
natural_pdf/ocr/engine_surya.py CHANGED
@@ -20,14 +20,16 @@ class SuryaOCREngine(OCREngine):
20
20
  self._surya_recognition = None
21
21
  self._surya_detection = None
22
22
 
23
- def _initialize_model(self, languages: List[str], device: str, options: Optional[BaseOCROptions]):
23
+ def _initialize_model(
24
+ self, languages: List[str], device: str, options: Optional[BaseOCROptions]
25
+ ):
24
26
  """Initialize Surya predictors."""
25
27
  if not self.is_available():
26
28
  raise ImportError("Surya OCR library is not installed or available.")
27
29
 
28
30
  # Store languages for use in _process_single_image
29
31
  self._langs = languages
30
-
32
+
31
33
  from surya.detection import DetectionPredictor
32
34
  from surya.recognition import RecognitionPredictor
33
35
 
@@ -41,21 +43,27 @@ class SuryaOCREngine(OCREngine):
41
43
  self._detection_predictor = self._surya_detection(**predictor_args)
42
44
  self.logger.info("Instantiating Surya RecognitionPredictor...")
43
45
  self._recognition_predictor = self._surya_recognition(**predictor_args)
44
-
46
+
45
47
  self.logger.info("Surya predictors initialized.")
46
48
 
47
49
  def _preprocess_image(self, image: Image.Image) -> Image.Image:
48
50
  """Surya uses PIL images directly, so just return the image."""
49
51
  return image
50
52
 
51
- def _process_single_image(self, image: Image.Image, detect_only: bool, options: Optional[SuryaOCROptions]) -> Any:
53
+ def _process_single_image(
54
+ self, image: Image.Image, detect_only: bool, options: Optional[SuryaOCROptions]
55
+ ) -> Any:
52
56
  """Process a single image with Surya OCR."""
53
57
  if not self._recognition_predictor or not self._detection_predictor:
54
58
  raise RuntimeError("Surya predictors are not initialized.")
55
59
 
56
60
  # Store languages instance variable during initialization to use here
57
- langs = [[lang] for lang in self._langs] if hasattr(self, '_langs') else [[self.DEFAULT_LANGUAGES[0]]]
58
-
61
+ langs = (
62
+ [[lang] for lang in self._langs]
63
+ if hasattr(self, "_langs")
64
+ else [[self.DEFAULT_LANGUAGES[0]]]
65
+ )
66
+
59
67
  # Surya expects lists of images, so we need to wrap our single image
60
68
  if detect_only:
61
69
  results = self._detection_predictor(images=[image])
@@ -63,33 +71,41 @@ class SuryaOCREngine(OCREngine):
63
71
  results = self._recognition_predictor(
64
72
  images=[image],
65
73
  langs=langs, # Use the languages set during initialization
66
- det_predictor=self._detection_predictor
74
+ det_predictor=self._detection_predictor,
67
75
  )
68
-
76
+
69
77
  # Surya may return a list with one result per image or a single result object
70
78
  # Return the result as-is and handle the extraction in _standardize_results
71
79
  return results
72
80
 
73
- def _standardize_results(self, raw_results: Any, min_confidence: float, detect_only: bool) -> List[TextRegion]:
81
+ def _standardize_results(
82
+ self, raw_results: Any, min_confidence: float, detect_only: bool
83
+ ) -> List[TextRegion]:
74
84
  """Convert Surya results to standardized TextRegion objects."""
75
85
  standardized_regions = []
76
-
86
+
77
87
  raw_result = raw_results
78
88
  if isinstance(raw_results, list) and len(raw_results) > 0:
79
89
  raw_result = raw_results[0]
80
-
81
- results = raw_result.text_lines if hasattr(raw_result, "text_lines") and not detect_only else raw_result.bboxes
90
+
91
+ results = (
92
+ raw_result.text_lines
93
+ if hasattr(raw_result, "text_lines") and not detect_only
94
+ else raw_result.bboxes
95
+ )
82
96
 
83
97
  for line in results:
84
98
  # Always extract bbox first
85
99
  try:
86
100
  # Prioritize line.bbox, fallback to line.polygon
87
- bbox_raw = line.bbox if hasattr(line, 'bbox') else getattr(line, 'polygon', None)
101
+ bbox_raw = line.bbox if hasattr(line, "bbox") else getattr(line, "polygon", None)
88
102
  if bbox_raw is None:
89
- raise ValueError("Missing bbox/polygon data")
103
+ raise ValueError("Missing bbox/polygon data")
90
104
  bbox = self._standardize_bbox(bbox_raw)
91
105
  except ValueError as e:
92
- raise ValueError(f"Could not standardize bounding box from Surya result: {bbox_raw}") from e
106
+ raise ValueError(
107
+ f"Could not standardize bounding box from Surya result: {bbox_raw}"
108
+ ) from e
93
109
 
94
110
  if detect_only:
95
111
  # For detect_only, text and confidence are None
@@ -100,7 +116,7 @@ class SuryaOCREngine(OCREngine):
100
116
  confidence = line.confidence
101
117
  if confidence >= min_confidence:
102
118
  standardized_regions.append(TextRegion(bbox, text, confidence))
103
-
119
+
104
120
  return standardized_regions
105
121
 
106
122
  def is_available(self) -> bool: