natural-pdf 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. docs/finetuning/index.md +176 -0
  2. docs/ocr/index.md +34 -47
  3. docs/tutorials/01-loading-and-extraction.ipynb +34 -1536
  4. docs/tutorials/02-finding-elements.ipynb +42 -42
  5. docs/tutorials/03-extracting-blocks.ipynb +17 -17
  6. docs/tutorials/04-table-extraction.ipynb +12 -12
  7. docs/tutorials/05-excluding-content.ipynb +30 -30
  8. docs/tutorials/06-document-qa.ipynb +28 -28
  9. docs/tutorials/07-layout-analysis.ipynb +63 -35
  10. docs/tutorials/07-working-with-regions.ipynb +55 -51
  11. docs/tutorials/07-working-with-regions.md +2 -2
  12. docs/tutorials/08-spatial-navigation.ipynb +60 -60
  13. docs/tutorials/09-section-extraction.ipynb +113 -113
  14. docs/tutorials/10-form-field-extraction.ipynb +78 -50
  15. docs/tutorials/11-enhanced-table-processing.ipynb +6 -6
  16. docs/tutorials/12-ocr-integration.ipynb +149 -131
  17. docs/tutorials/12-ocr-integration.md +0 -13
  18. docs/tutorials/13-semantic-search.ipynb +313 -873
  19. natural_pdf/__init__.py +21 -22
  20. natural_pdf/analyzers/layout/gemini.py +280 -0
  21. natural_pdf/analyzers/layout/layout_manager.py +28 -1
  22. natural_pdf/analyzers/layout/layout_options.py +11 -0
  23. natural_pdf/analyzers/layout/yolo.py +6 -2
  24. natural_pdf/collections/pdf_collection.py +24 -0
  25. natural_pdf/core/element_manager.py +18 -13
  26. natural_pdf/core/page.py +174 -36
  27. natural_pdf/core/pdf.py +156 -42
  28. natural_pdf/elements/base.py +9 -17
  29. natural_pdf/elements/collections.py +99 -38
  30. natural_pdf/elements/region.py +77 -37
  31. natural_pdf/elements/text.py +5 -0
  32. natural_pdf/exporters/__init__.py +4 -0
  33. natural_pdf/exporters/base.py +61 -0
  34. natural_pdf/exporters/paddleocr.py +345 -0
  35. natural_pdf/ocr/__init__.py +57 -36
  36. natural_pdf/ocr/engine.py +160 -49
  37. natural_pdf/ocr/engine_easyocr.py +178 -157
  38. natural_pdf/ocr/engine_paddle.py +114 -189
  39. natural_pdf/ocr/engine_surya.py +87 -144
  40. natural_pdf/ocr/ocr_factory.py +125 -0
  41. natural_pdf/ocr/ocr_manager.py +65 -89
  42. natural_pdf/ocr/ocr_options.py +8 -13
  43. natural_pdf/ocr/utils.py +113 -0
  44. natural_pdf/templates/finetune/fine_tune_paddleocr.md +415 -0
  45. natural_pdf/templates/spa/css/style.css +334 -0
  46. natural_pdf/templates/spa/index.html +31 -0
  47. natural_pdf/templates/spa/js/app.js +472 -0
  48. natural_pdf/templates/spa/words.txt +235976 -0
  49. natural_pdf/utils/debug.py +34 -0
  50. natural_pdf/utils/identifiers.py +33 -0
  51. natural_pdf/utils/packaging.py +485 -0
  52. natural_pdf/utils/text_extraction.py +44 -64
  53. natural_pdf/utils/visualization.py +1 -1
  54. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/METADATA +44 -20
  55. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/RECORD +58 -47
  56. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/WHEEL +1 -1
  57. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/top_level.txt +0 -1
  58. natural_pdf/templates/ocr_debug.html +0 -517
  59. tests/test_loading.py +0 -50
  60. tests/test_optional_deps.py +0 -298
  61. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/licenses/LICENSE +0 -0
@@ -1,13 +1,12 @@
1
1
  # ocr_engine_paddleocr.py
2
2
  import importlib.util
3
- import inspect # Used for dynamic parameter passing
4
3
  import logging
5
4
  from typing import Any, Dict, List, Optional, Tuple, Union
6
5
 
7
6
  import numpy as np
8
7
  from PIL import Image
9
8
 
10
- from .engine import OCREngine
9
+ from .engine import OCREngine, TextRegion
11
10
  from .ocr_options import BaseOCROptions, PaddleOCROptions
12
11
 
13
12
  logger = logging.getLogger(__name__)
@@ -16,55 +15,8 @@ logger = logging.getLogger(__name__)
16
15
  class PaddleOCREngine(OCREngine):
17
16
  """PaddleOCR engine implementation."""
18
17
 
19
- LANGUAGE_MAP = {
20
- "en": "en",
21
- "zh": "ch",
22
- "zh-cn": "ch",
23
- "zh-tw": "chinese_cht",
24
- "ja": "japan",
25
- "ko": "korean",
26
- "th": "thai",
27
- "fr": "french",
28
- "de": "german",
29
- "ru": "russian",
30
- "ar": "arabic",
31
- "hi": "hindi",
32
- "vi": "vietnam",
33
- "fa": "cyrillic",
34
- "ur": "cyrillic",
35
- "rs": "serbian",
36
- "oc": "latin",
37
- "rsc": "cyrillic",
38
- "bg": "bulgarian",
39
- "uk": "cyrillic",
40
- "be": "cyrillic",
41
- "te": "telugu",
42
- "kn": "kannada",
43
- "ta": "tamil",
44
- "latin": "latin",
45
- "cyrillic": "cyrillic",
46
- "devanagari": "devanagari",
47
- }
48
-
49
18
  def __init__(self):
50
19
  super().__init__()
51
- self._paddleocr = None
52
-
53
- def _lazy_import_paddleocr(self):
54
- """Imports paddleocr only when needed."""
55
- if self._paddleocr is None:
56
- if not self.is_available():
57
- raise ImportError("PaddleOCR or PaddlePaddle is not installed or available.")
58
- try:
59
- import paddle
60
- import paddleocr
61
-
62
- self._paddleocr = paddleocr
63
- logger.info("PaddleOCR module imported successfully.")
64
- except ImportError as e:
65
- logger.error(f"Failed to import PaddleOCR/PaddlePaddle: {e}")
66
- raise
67
- return self._paddleocr
68
20
 
69
21
  def is_available(self) -> bool:
70
22
  """Check if PaddleOCR and paddlepaddle are installed."""
@@ -75,159 +27,132 @@ class PaddleOCREngine(OCREngine):
75
27
  paddleocr_installed = importlib.util.find_spec("paddleocr") is not None
76
28
  return paddle_installed and paddleocr_installed
77
29
 
78
- def _map_language(self, iso_lang: str) -> str:
79
- """Map ISO language code to PaddleOCR language code."""
80
- return self.LANGUAGE_MAP.get(iso_lang.lower(), "en")
81
-
82
- def _get_cache_key(self, options: PaddleOCROptions) -> str:
83
- """Generate a more specific cache key for PaddleOCR."""
84
- base_key = super()._get_cache_key(options)
85
- primary_lang = self._map_language(options.languages[0]) if options.languages else "en"
86
- angle_cls_key = str(options.use_angle_cls)
87
- precision_key = options.precision
88
- return f"{base_key}_{primary_lang}_{angle_cls_key}_{precision_key}"
89
-
90
- def _get_reader(self, options: PaddleOCROptions):
91
- """Get or initialize a PaddleOCR reader based on options."""
92
- cache_key = self._get_cache_key(options)
93
- if cache_key in self._reader_cache:
94
- logger.debug(f"Using cached PaddleOCR reader for key: {cache_key}")
95
- return self._reader_cache[cache_key]
96
-
97
- logger.info(f"Creating new PaddleOCR reader for key: {cache_key}")
98
- paddleocr = self._lazy_import_paddleocr()
99
-
100
- constructor_sig = inspect.signature(paddleocr.PaddleOCR.__init__)
101
- constructor_args = {}
102
- constructor_args["lang"] = (
103
- self._map_language(options.languages[0]) if options.languages else "en"
104
- )
30
+ def _initialize_model(
31
+ self, languages: List[str], device: str, options: Optional[BaseOCROptions]
32
+ ):
33
+ """Initialize the PaddleOCR model."""
34
+ try:
35
+ import paddleocr
105
36
 
106
- for field_name, param in constructor_sig.parameters.items():
107
- if field_name in ["self", "lang"]:
108
- continue
109
- if field_name == "use_gpu":
110
- constructor_args["use_gpu"] = options.use_gpu
111
- continue
112
- if hasattr(options, field_name):
113
- constructor_args[field_name] = getattr(options, field_name)
114
- elif field_name in options.extra_args:
115
- constructor_args[field_name] = options.extra_args[field_name]
37
+ self.logger.info("PaddleOCR module imported successfully.")
38
+ except ImportError as e:
39
+ self.logger.error(f"Failed to import PaddleOCR/PaddlePaddle: {e}")
40
+ raise
116
41
 
117
- constructor_args.pop("device", None)
118
- logger.debug(f"PaddleOCR constructor args: {constructor_args}")
42
+ # Cast to PaddleOCROptions if possible
43
+ paddle_options = options if isinstance(options, PaddleOCROptions) else PaddleOCROptions()
119
44
 
120
- try:
121
- show_log = constructor_args.get("show_log", False)
122
- original_log_level = logging.getLogger("ppocr").level
123
- if not show_log:
124
- logging.getLogger("ppocr").setLevel(logging.ERROR)
45
+ # Determine parameters
46
+ primary_lang = languages[0] if languages else "en"
47
+ use_gpu = "cuda" in str(device).lower()
48
+
49
+ # Create constructor arguments
50
+ constructor_args = {
51
+ "lang": primary_lang,
52
+ "use_gpu": use_gpu,
53
+ "use_angle_cls": paddle_options.use_angle_cls,
54
+ "det": True,
55
+ "rec": True, # We'll control recognition at process time
56
+ }
125
57
 
126
- reader = paddleocr.PaddleOCR(**constructor_args)
58
+ # Add optional parameters if available
59
+ for param in ["det_model_dir", "rec_model_dir", "cls_model_dir", "show_log", "use_onnx"]:
60
+ if hasattr(paddle_options, param):
61
+ val = getattr(paddle_options, param)
62
+ if val is not None:
63
+ constructor_args[param] = val
127
64
 
128
- if not show_log:
129
- logging.getLogger("ppocr").setLevel(original_log_level)
65
+ self.logger.debug(f"PaddleOCR constructor args: {constructor_args}")
130
66
 
131
- self._reader_cache[cache_key] = reader
132
- logger.info("PaddleOCR reader created successfully.")
133
- return reader
67
+ # Create the model
68
+ try:
69
+ self._model = paddleocr.PaddleOCR(**constructor_args)
70
+ self.logger.info("PaddleOCR model created successfully")
134
71
  except Exception as e:
135
- logger.error(f"Failed to create PaddleOCR reader: {e}", exc_info=True)
72
+ self.logger.error(f"Failed to create PaddleOCR model: {e}")
136
73
  raise
137
74
 
138
- def _prepare_ocr_args(self, options: PaddleOCROptions) -> Dict[str, Any]:
139
- """Helper to prepare arguments for the ocr method (excluding image)."""
140
- ocr_args = {}
141
- # Determine 'cls' value based on options precedence
142
- ocr_args["cls"] = options.cls if options.cls is not None else options.use_angle_cls
143
- ocr_args["det"] = options.det
144
- ocr_args["rec"] = options.rec
145
- # Add extra args if needed (less common for ocr method itself)
146
- # for field_name in options.extra_args:
147
- # if field_name in ['cls', 'det', 'rec']: # Check against known ocr args
148
- # ocr_args[field_name] = options.extra_args[field_name]
149
- logger.debug(f"PaddleOCR ocr args (excluding image): {ocr_args}")
150
- return ocr_args
151
-
152
- def _standardize_results(
153
- self, raw_page_results: Optional[List[Any]], options: PaddleOCROptions
154
- ) -> List[Dict[str, Any]]:
155
- """Standardizes raw results from a single page/image from PaddleOCR."""
156
- standardized_page = []
157
- if not raw_page_results: # Handle None or empty list
158
- return standardized_page
159
-
160
- min_confidence = options.min_confidence
161
- for detection in raw_page_results:
162
- try:
163
- if not isinstance(detection, (list, tuple)) or len(detection) < 2:
164
- continue
165
- bbox_raw = detection[0]
166
- text_confidence = detection[1]
167
- if not isinstance(text_confidence, tuple) or len(text_confidence) < 2:
168
- continue
169
-
170
- text = str(text_confidence[0])
171
- confidence = float(text_confidence[1])
172
-
173
- if confidence >= min_confidence:
174
- bbox = self._standardize_bbox(bbox_raw)
175
- if bbox:
176
- standardized_page.append(
177
- {"bbox": bbox, "text": text, "confidence": confidence, "source": "ocr"}
178
- )
179
- else:
180
- logger.warning(f"Skipping result due to invalid bbox: {bbox_raw}")
181
- except (IndexError, ValueError, TypeError) as e:
182
- logger.warning(f"Skipping invalid detection format: {detection}. Error: {e}")
183
- continue
184
- return standardized_page
185
-
186
- def _pil_to_bgr(self, image: Image.Image) -> np.ndarray:
187
- """Converts PIL Image to BGR numpy array."""
188
- if image.mode == "BGR": # Already BGR
75
+ def _preprocess_image(self, image: Image.Image) -> np.ndarray:
76
+ """Convert PIL Image to BGR numpy array for PaddleOCR."""
77
+ if image.mode == "BGR":
189
78
  return np.array(image)
190
79
  img_rgb = image.convert("RGB")
191
80
  img_array_rgb = np.array(img_rgb)
192
81
  img_array_bgr = img_array_rgb[:, :, ::-1] # Convert RGB to BGR
193
82
  return img_array_bgr
194
83
 
195
- def process_image(
196
- self, images: Union[Image.Image, List[Image.Image]], options: BaseOCROptions
197
- ) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]:
198
- """Processes a single image or a batch of images with PaddleOCR."""
199
-
200
- if not isinstance(options, PaddleOCROptions):
201
- logger.warning("Received BaseOCROptions, expected PaddleOCROptions. Using defaults.")
202
- options = PaddleOCROptions(
203
- languages=options.languages,
204
- min_confidence=options.min_confidence,
205
- device=options.device,
206
- extra_args=options.extra_args,
207
- )
208
-
209
- reader = self._get_reader(options)
210
- ocr_args = self._prepare_ocr_args(options)
211
-
212
- # Helper function to process one image
213
- def process_one(img):
84
+ def _process_single_image(
85
+ self, image: np.ndarray, detect_only: bool, options: Optional[PaddleOCROptions]
86
+ ) -> Any:
87
+ """Process a single image with PaddleOCR."""
88
+ if self._model is None:
89
+ raise RuntimeError("PaddleOCR model not initialized")
90
+
91
+ # Prepare OCR arguments
92
+ ocr_args = {}
93
+ if options and isinstance(options, PaddleOCROptions):
94
+ ocr_args["cls"] = options.cls if options.cls is not None else options.use_angle_cls
95
+ ocr_args["det"] = options.det
96
+ ocr_args["rec"] = not detect_only # Control recognition based on detect_only flag
97
+
98
+ # Run OCR
99
+ raw_results = self._model.ocr(image, **ocr_args)
100
+ return raw_results
101
+
102
+ def _standardize_results(
103
+ self, raw_results: Any, min_confidence: float, detect_only: bool
104
+ ) -> List[TextRegion]:
105
+ """Convert PaddleOCR results to standardized TextRegion objects."""
106
+ standardized_regions = []
107
+
108
+ if not raw_results or not isinstance(raw_results, list) or len(raw_results) == 0:
109
+ return standardized_regions
110
+
111
+ page_results = raw_results[0] if raw_results[0] is not None else []
112
+
113
+ for detection in page_results:
114
+ # Initialize text and confidence
115
+ text = None
116
+ confidence = None
117
+ bbox_raw = None
118
+
119
+ # Paddle always seems to return the tuple structure [bbox, (text, conf)]
120
+ # even if rec=False. We need to parse this structure regardless.
121
+ if len(detection) == 4: # Handle potential alternative format?
122
+ detection = [detection, ("", 1.0)] # Treat as bbox + dummy text/conf
123
+
124
+ if not isinstance(detection, (list, tuple)) or len(detection) < 2:
125
+ raise ValueError(f"Invalid detection format from PaddleOCR: {detection}")
126
+
127
+ bbox_raw = detection[0]
128
+ text_confidence = detection[1]
129
+
130
+ if not isinstance(text_confidence, tuple) or len(text_confidence) < 2:
131
+ # Even if detect_only, we expect the (text, conf) structure,
132
+ # it might just contain dummy values.
133
+ raise ValueError(
134
+ f"Invalid text/confidence structure from PaddleOCR: {text_confidence}"
135
+ )
136
+
137
+ # Extract text/conf only if not detect_only
138
+ if not detect_only:
139
+ text = str(text_confidence[0])
140
+ confidence = float(text_confidence[1])
141
+
142
+ # Standardize the bbox (always needed)
214
143
  try:
215
- img_array_bgr = self._pil_to_bgr(img)
216
- raw_results = reader.ocr(img_array_bgr, **ocr_args)
217
-
218
- page_results = []
219
- if raw_results and isinstance(raw_results, list) and len(raw_results) > 0:
220
- page_results = raw_results[0]
221
-
222
- return self._standardize_results(page_results, options)
223
- except Exception as e:
224
- logger.error(f"Error processing image with PaddleOCR: {e}")
225
- return []
226
-
227
- # Handle single image or list of images
228
- if isinstance(images, Image.Image):
229
- return process_one(images)
230
- elif isinstance(images, list):
231
- return [process_one(img) for img in images]
232
- else:
233
- raise TypeError("Input 'images' must be a PIL Image or a list of PIL Images.")
144
+ bbox = self._standardize_bbox(bbox_raw)
145
+ except ValueError as e:
146
+ raise ValueError(
147
+ f"Could not standardize bounding box from PaddleOCR: {bbox_raw}"
148
+ ) from e
149
+
150
+ # Append based on mode
151
+ if detect_only:
152
+ # Append regardless of dummy confidence value, set text/conf to None
153
+ standardized_regions.append(TextRegion(bbox, text=None, confidence=None))
154
+ elif confidence >= min_confidence:
155
+ # Only append if confidence meets threshold in full OCR mode
156
+ standardized_regions.append(TextRegion(bbox, text, confidence))
157
+
158
+ return standardized_regions
@@ -6,11 +6,9 @@ from typing import Any, Dict, List, Optional, Tuple, Union
6
6
  import numpy as np
7
7
  from PIL import Image
8
8
 
9
- from .engine import OCREngine
9
+ from .engine import OCREngine, TextRegion
10
10
  from .ocr_options import BaseOCROptions, SuryaOCROptions
11
11
 
12
- logger = logging.getLogger(__name__)
13
-
14
12
 
15
13
  class SuryaOCREngine(OCREngine):
16
14
  """Surya OCR engine implementation."""
@@ -21,161 +19,106 @@ class SuryaOCREngine(OCREngine):
21
19
  self._detection_predictor = None
22
20
  self._surya_recognition = None
23
21
  self._surya_detection = None
24
- self._initialized = False
25
-
26
- def _lazy_load_predictors(self, options: SuryaOCROptions):
27
- """Initializes Surya predictors when first needed."""
28
- if self._initialized:
29
- return
30
22
 
23
+ def _initialize_model(
24
+ self, languages: List[str], device: str, options: Optional[BaseOCROptions]
25
+ ):
26
+ """Initialize Surya predictors."""
31
27
  if not self.is_available():
32
28
  raise ImportError("Surya OCR library is not installed or available.")
33
29
 
34
- try:
35
- from surya.detection import DetectionPredictor
36
- from surya.recognition import RecognitionPredictor
30
+ # Store languages for use in _process_single_image
31
+ self._langs = languages
37
32
 
38
- self._surya_recognition = RecognitionPredictor
39
- self._surya_detection = DetectionPredictor
40
- logger.info("Surya modules imported successfully.")
33
+ from surya.detection import DetectionPredictor
34
+ from surya.recognition import RecognitionPredictor
41
35
 
42
- # --- Instantiate Predictors ---
43
- # Add arguments from options if Surya supports them
44
- # Example: device = options.device or 'cuda' if torch.cuda.is_available() else 'cpu'
45
- # predictor_args = {'device': options.device} # If applicable
46
- predictor_args = {} # Assuming parameterless init based on example
36
+ self._surya_recognition = RecognitionPredictor
37
+ self._surya_detection = DetectionPredictor
38
+ self.logger.info("Surya modules imported successfully.")
47
39
 
48
- logger.info("Instantiating Surya DetectionPredictor...")
49
- self._detection_predictor = self._surya_detection(**predictor_args)
50
- logger.info("Instantiating Surya RecognitionPredictor...")
51
- self._recognition_predictor = self._surya_recognition(**predictor_args)
40
+ predictor_args = {} # Configure if needed
52
41
 
53
- self._initialized = True
54
- logger.info("Surya predictors initialized.")
42
+ self.logger.info("Instantiating Surya DetectionPredictor...")
43
+ self._detection_predictor = self._surya_detection(**predictor_args)
44
+ self.logger.info("Instantiating Surya RecognitionPredictor...")
45
+ self._recognition_predictor = self._surya_recognition(**predictor_args)
55
46
 
56
- except ImportError as e:
57
- logger.error(f"Failed to import Surya modules: {e}")
58
- raise
59
- except Exception as e:
60
- logger.error(f"Failed to initialize Surya predictors: {e}", exc_info=True)
61
- raise
47
+ self.logger.info("Surya predictors initialized.")
62
48
 
63
- def is_available(self) -> bool:
64
- """Check if the surya library is installed."""
65
- return importlib.util.find_spec("surya") is not None
49
+ def _preprocess_image(self, image: Image.Image) -> Image.Image:
50
+ """Surya uses PIL images directly, so just return the image."""
51
+ return image
52
+
53
+ def _process_single_image(
54
+ self, image: Image.Image, detect_only: bool, options: Optional[SuryaOCROptions]
55
+ ) -> Any:
56
+ """Process a single image with Surya OCR."""
57
+ if not self._recognition_predictor or not self._detection_predictor:
58
+ raise RuntimeError("Surya predictors are not initialized.")
59
+
60
+ # Store languages instance variable during initialization to use here
61
+ langs = (
62
+ [[lang] for lang in self._langs]
63
+ if hasattr(self, "_langs")
64
+ else [[self.DEFAULT_LANGUAGES[0]]]
65
+ )
66
+
67
+ # Surya expects lists of images, so we need to wrap our single image
68
+ if detect_only:
69
+ results = self._detection_predictor(images=[image])
70
+ else:
71
+ results = self._recognition_predictor(
72
+ images=[image],
73
+ langs=langs, # Use the languages set during initialization
74
+ det_predictor=self._detection_predictor,
75
+ )
76
+
77
+ # Surya may return a list with one result per image or a single result object
78
+ # Return the result as-is and handle the extraction in _standardize_results
79
+ return results
66
80
 
67
81
  def _standardize_results(
68
- self, raw_ocr_result: Any, options: SuryaOCROptions
69
- ) -> List[Dict[str, Any]]:
70
- """Standardizes raw results from a single image from Surya."""
71
- standardized_page = []
72
- min_confidence = options.min_confidence
73
-
74
- # Check if the result has the expected structure (OCRResult with text_lines)
75
- if not hasattr(raw_ocr_result, "text_lines") or not isinstance(
76
- raw_ocr_result.text_lines, list
77
- ):
78
- logger.warning(f"Unexpected Surya result format: {type(raw_ocr_result)}. Skipping.")
79
- return standardized_page
80
-
81
- for line in raw_ocr_result.text_lines:
82
+ self, raw_results: Any, min_confidence: float, detect_only: bool
83
+ ) -> List[TextRegion]:
84
+ """Convert Surya results to standardized TextRegion objects."""
85
+ standardized_regions = []
86
+
87
+ raw_result = raw_results
88
+ if isinstance(raw_results, list) and len(raw_results) > 0:
89
+ raw_result = raw_results[0]
90
+
91
+ results = (
92
+ raw_result.text_lines
93
+ if hasattr(raw_result, "text_lines") and not detect_only
94
+ else raw_result.bboxes
95
+ )
96
+
97
+ for line in results:
98
+ # Always extract bbox first
82
99
  try:
83
- # Extract data from Surya's TextLine object
84
- text = line.text
100
+ # Prioritize line.bbox, fallback to line.polygon
101
+ bbox_raw = line.bbox if hasattr(line, "bbox") else getattr(line, "polygon", None)
102
+ if bbox_raw is None:
103
+ raise ValueError("Missing bbox/polygon data")
104
+ bbox = self._standardize_bbox(bbox_raw)
105
+ except ValueError as e:
106
+ raise ValueError(
107
+ f"Could not standardize bounding box from Surya result: {bbox_raw}"
108
+ ) from e
109
+
110
+ if detect_only:
111
+ # For detect_only, text and confidence are None
112
+ standardized_regions.append(TextRegion(bbox, text=None, confidence=None))
113
+ else:
114
+ # For full OCR, extract text and confidence, then filter
115
+ text = line.text if hasattr(line, "text") else ""
85
116
  confidence = line.confidence
86
- # Surya provides both polygon and bbox, bbox is already (x0, y0, x1, y1)
87
- bbox_raw = line.bbox # Use bbox directly if available and correct format
88
-
89
117
  if confidence >= min_confidence:
90
- bbox = self._standardize_bbox(bbox_raw) # Validate/convert format
91
- if bbox:
92
- standardized_page.append(
93
- {"bbox": bbox, "text": text, "confidence": confidence, "source": "ocr"}
94
- )
95
- else:
96
- # Try polygon if bbox failed standardization
97
- bbox_poly = self._standardize_bbox(line.polygon)
98
- if bbox_poly:
99
- standardized_page.append(
100
- {
101
- "bbox": bbox_poly,
102
- "text": text,
103
- "confidence": confidence,
104
- "source": "ocr",
105
- }
106
- )
107
- else:
108
- logger.warning(
109
- f"Skipping Surya line due to invalid bbox/polygon: {line}"
110
- )
111
-
112
- except (AttributeError, ValueError, TypeError) as e:
113
- logger.warning(f"Skipping invalid Surya TextLine format: {line}. Error: {e}")
114
- continue
115
- return standardized_page
116
-
117
- def process_image(
118
- self, images: Union[Image.Image, List[Image.Image]], options: BaseOCROptions
119
- ) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]:
120
- """Processes a single image or a batch of images with Surya OCR."""
121
-
122
- if not isinstance(options, SuryaOCROptions):
123
- logger.warning("Received BaseOCROptions, expected SuryaOCROptions. Using defaults.")
124
- options = SuryaOCROptions(
125
- languages=options.languages,
126
- min_confidence=options.min_confidence,
127
- device=options.device,
128
- extra_args=options.extra_args,
129
- )
118
+ standardized_regions.append(TextRegion(bbox, text, confidence))
130
119
 
131
- # Ensure predictors are loaded/initialized
132
- self._lazy_load_predictors(options)
133
- if not self._recognition_predictor or not self._detection_predictor:
134
- raise RuntimeError("Surya predictors could not be initialized.")
135
-
136
- # --- Prepare inputs for Surya ---
137
- is_batch = isinstance(images, list)
138
- input_images: List[Image.Image] = images if is_batch else [images]
139
- # Surya expects a list of language lists, one per image
140
- input_langs: List[List[str]] = [options.languages for _ in input_images]
141
-
142
- if not input_images:
143
- logger.warning("No images provided for Surya processing.")
144
- return [] if not is_batch else [[]]
145
-
146
- # --- Run Surya Prediction ---
147
- try:
148
- processing_mode = "batch" if is_batch else "single image"
149
- logger.info(f"Processing {processing_mode} ({len(input_images)} images) with Surya...")
150
- # Call Surya's predictor
151
- # It returns a list of OCRResult objects, one per input image
152
- predictions = self._recognition_predictor(
153
- images=input_images, langs=input_langs, det_predictor=self._detection_predictor
154
- )
155
- logger.info(f"Surya prediction complete. Received {len(predictions)} results.")
156
-
157
- # --- Standardize Results ---
158
- if len(predictions) != len(input_images):
159
- logger.error(
160
- f"Surya result count ({len(predictions)}) does not match input count ({len(input_images)}). Returning empty results."
161
- )
162
- # Decide on error handling: raise error or return empty structure
163
- return [[] for _ in input_images] if is_batch else []
164
-
165
- all_standardized_results = [
166
- self._standardize_results(res, options) for res in predictions
167
- ]
168
-
169
- if is_batch:
170
- return all_standardized_results # Return List[List[Dict]]
171
- else:
172
- return all_standardized_results[0] # Return List[Dict] for single image
120
+ return standardized_regions
173
121
 
174
- except Exception as e:
175
- logger.error(f"Error during Surya OCR processing: {e}", exc_info=True)
176
- # Return empty structure matching input type on failure
177
- return [[] for _ in input_images] if is_batch else []
178
-
179
- # Note: Caching is handled differently for Surya as predictors are stateful
180
- # and initialized once. The base class _reader_cache is not used here.
181
- # If predictors could be configured per-run, caching would need rethinking.
122
+ def is_available(self) -> bool:
123
+ """Check if the surya library is installed."""
124
+ return importlib.util.find_spec("surya") is not None