natural-pdf 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. docs/categorizing-documents/index.md +168 -0
  2. docs/data-extraction/index.md +87 -0
  3. docs/element-selection/index.ipynb +218 -164
  4. docs/element-selection/index.md +20 -0
  5. docs/finetuning/index.md +176 -0
  6. docs/index.md +19 -0
  7. docs/ocr/index.md +63 -16
  8. docs/tutorials/01-loading-and-extraction.ipynb +411 -248
  9. docs/tutorials/02-finding-elements.ipynb +123 -46
  10. docs/tutorials/03-extracting-blocks.ipynb +24 -19
  11. docs/tutorials/04-table-extraction.ipynb +17 -12
  12. docs/tutorials/05-excluding-content.ipynb +37 -32
  13. docs/tutorials/06-document-qa.ipynb +36 -31
  14. docs/tutorials/07-layout-analysis.ipynb +45 -40
  15. docs/tutorials/07-working-with-regions.ipynb +61 -60
  16. docs/tutorials/08-spatial-navigation.ipynb +76 -71
  17. docs/tutorials/09-section-extraction.ipynb +160 -155
  18. docs/tutorials/10-form-field-extraction.ipynb +71 -66
  19. docs/tutorials/11-enhanced-table-processing.ipynb +11 -6
  20. docs/tutorials/12-ocr-integration.ipynb +3420 -312
  21. docs/tutorials/12-ocr-integration.md +68 -106
  22. docs/tutorials/13-semantic-search.ipynb +641 -251
  23. natural_pdf/__init__.py +3 -0
  24. natural_pdf/analyzers/layout/gemini.py +63 -47
  25. natural_pdf/classification/manager.py +343 -0
  26. natural_pdf/classification/mixin.py +149 -0
  27. natural_pdf/classification/results.py +62 -0
  28. natural_pdf/collections/mixins.py +63 -0
  29. natural_pdf/collections/pdf_collection.py +326 -17
  30. natural_pdf/core/element_manager.py +73 -4
  31. natural_pdf/core/page.py +255 -83
  32. natural_pdf/core/pdf.py +385 -367
  33. natural_pdf/elements/base.py +1 -3
  34. natural_pdf/elements/collections.py +279 -49
  35. natural_pdf/elements/region.py +106 -21
  36. natural_pdf/elements/text.py +5 -2
  37. natural_pdf/exporters/__init__.py +4 -0
  38. natural_pdf/exporters/base.py +61 -0
  39. natural_pdf/exporters/paddleocr.py +345 -0
  40. natural_pdf/extraction/manager.py +134 -0
  41. natural_pdf/extraction/mixin.py +246 -0
  42. natural_pdf/extraction/result.py +37 -0
  43. natural_pdf/ocr/__init__.py +16 -8
  44. natural_pdf/ocr/engine.py +46 -30
  45. natural_pdf/ocr/engine_easyocr.py +86 -42
  46. natural_pdf/ocr/engine_paddle.py +39 -28
  47. natural_pdf/ocr/engine_surya.py +32 -16
  48. natural_pdf/ocr/ocr_factory.py +34 -23
  49. natural_pdf/ocr/ocr_manager.py +98 -34
  50. natural_pdf/ocr/ocr_options.py +38 -10
  51. natural_pdf/ocr/utils.py +59 -33
  52. natural_pdf/qa/document_qa.py +0 -4
  53. natural_pdf/selectors/parser.py +363 -238
  54. natural_pdf/templates/finetune/fine_tune_paddleocr.md +420 -0
  55. natural_pdf/utils/debug.py +4 -2
  56. natural_pdf/utils/identifiers.py +9 -5
  57. natural_pdf/utils/locks.py +8 -0
  58. natural_pdf/utils/packaging.py +172 -105
  59. natural_pdf/utils/text_extraction.py +96 -65
  60. natural_pdf/utils/tqdm_utils.py +43 -0
  61. natural_pdf/utils/visualization.py +1 -1
  62. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/METADATA +10 -3
  63. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/RECORD +66 -51
  64. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/WHEEL +1 -1
  65. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/licenses/LICENSE +0 -0
  66. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/top_level.txt +0 -0
@@ -18,28 +18,31 @@ class EasyOCREngine(OCREngine):
18
18
  def __init__(self):
19
19
  super().__init__()
20
20
  # No longer need _easyocr attribute
21
- # self._easyocr = None
21
+ # self._easyocr = None
22
22
 
23
23
  def is_available(self) -> bool:
24
24
  """Check if EasyOCR is installed."""
25
25
  return importlib.util.find_spec("easyocr") is not None
26
26
 
27
- def _initialize_model(self, languages: List[str], device: str, options: Optional[BaseOCROptions]):
27
+ def _initialize_model(
28
+ self, languages: List[str], device: str, options: Optional[BaseOCROptions]
29
+ ):
28
30
  """Initialize the EasyOCR model."""
29
31
  # Import directly here
30
32
  try:
31
33
  import easyocr
34
+
32
35
  self.logger.info("EasyOCR module imported successfully.")
33
36
  except ImportError as e:
34
37
  self.logger.error(f"Failed to import EasyOCR: {e}")
35
38
  raise
36
-
39
+
37
40
  # Cast to EasyOCROptions if possible, otherwise use default
38
41
  easy_options = options if isinstance(options, EasyOCROptions) else EasyOCROptions()
39
-
42
+
40
43
  # Prepare constructor arguments
41
44
  use_gpu = "cuda" in device.lower() or "mps" in device.lower()
42
-
45
+
43
46
  constructor_args = {
44
47
  "lang_list": languages,
45
48
  "gpu": use_gpu,
@@ -55,12 +58,12 @@ class EasyOCREngine(OCREngine):
55
58
  "quantize": easy_options.quantize,
56
59
  "cudnn_benchmark": easy_options.cudnn_benchmark,
57
60
  }
58
-
61
+
59
62
  # Filter out None values, as EasyOCR expects non-None or default behaviour
60
63
  constructor_args = {k: v for k, v in constructor_args.items() if v is not None}
61
-
64
+
62
65
  self.logger.debug(f"EasyOCR Reader constructor args: {constructor_args}")
63
-
66
+
64
67
  # Create the reader
65
68
  try:
66
69
  self._model = easyocr.Reader(**constructor_args)
@@ -73,103 +76,144 @@ class EasyOCREngine(OCREngine):
73
76
  """Convert PIL Image to numpy array for EasyOCR."""
74
77
  return np.array(image)
75
78
 
76
- def _process_single_image(self, image: np.ndarray, detect_only: bool, options: Optional[EasyOCROptions]) -> Any:
79
+ def _process_single_image(
80
+ self, image: np.ndarray, detect_only: bool, options: Optional[EasyOCROptions]
81
+ ) -> Any:
77
82
  """Process a single image with EasyOCR."""
78
83
  if self._model is None:
79
84
  raise RuntimeError("EasyOCR model not initialized")
80
-
85
+
81
86
  # Cast options to proper type if provided
82
87
  easy_options = options if isinstance(options, EasyOCROptions) else EasyOCROptions()
83
-
88
+
84
89
  # Prepare readtext arguments (only needed if not detect_only)
85
90
  readtext_args = {}
86
91
  if not detect_only:
87
92
  for param in [
88
- "detail", "paragraph", "min_size", "contrast_ths", "adjust_contrast",
89
- "filter_ths", "text_threshold", "low_text", "link_threshold",
90
- "canvas_size", "mag_ratio", "slope_ths", "ycenter_ths", "height_ths",
91
- "width_ths", "y_ths", "x_ths", "add_margin", "output_format"
93
+ "detail",
94
+ "paragraph",
95
+ "min_size",
96
+ "contrast_ths",
97
+ "adjust_contrast",
98
+ "filter_ths",
99
+ "text_threshold",
100
+ "low_text",
101
+ "link_threshold",
102
+ "canvas_size",
103
+ "mag_ratio",
104
+ "slope_ths",
105
+ "ycenter_ths",
106
+ "height_ths",
107
+ "width_ths",
108
+ "y_ths",
109
+ "x_ths",
110
+ "add_margin",
111
+ "output_format",
92
112
  ]:
93
113
  if hasattr(easy_options, param):
94
114
  val = getattr(easy_options, param)
95
115
  if val is not None:
96
116
  readtext_args[param] = val
97
-
117
+
98
118
  # Process differently based on detect_only flag
99
119
  if detect_only:
100
120
  # Returns tuple (horizontal_list, free_list)
101
121
  # horizontal_list is a list containing one item: the list of boxes
102
122
  # Each box is [[x1, y1], [x2, y1], [x2, y2], [x1, y2]]
103
- bboxes_tuple = self._model.detect(image, **readtext_args) # Pass args here too? Check EasyOCR docs if needed.
104
- if bboxes_tuple and isinstance(bboxes_tuple, tuple) and len(bboxes_tuple) > 0 and isinstance(bboxes_tuple[0], list):
105
- return bboxes_tuple[0] # Return the list of polygons directly
123
+ bboxes_tuple = self._model.detect(
124
+ image, **readtext_args
125
+ ) # Pass args here too? Check EasyOCR docs if needed.
126
+ if (
127
+ bboxes_tuple
128
+ and isinstance(bboxes_tuple, tuple)
129
+ and len(bboxes_tuple) > 0
130
+ and isinstance(bboxes_tuple[0], list)
131
+ ):
132
+ return bboxes_tuple[0] # Return the list of polygons directly
106
133
  else:
107
134
  self.logger.warning(f"EasyOCR detect returned unexpected format: {bboxes_tuple}")
108
- return [] # Return empty list on unexpected format
135
+ return [] # Return empty list on unexpected format
109
136
  else:
110
137
  return self._model.readtext(image, **readtext_args)
111
138
 
112
- def _standardize_results(self, raw_results: Any, min_confidence: float, detect_only: bool) -> List[TextRegion]:
139
+ def _standardize_results(
140
+ self, raw_results: Any, min_confidence: float, detect_only: bool
141
+ ) -> List[TextRegion]:
113
142
  """Convert EasyOCR results to standardized TextRegion objects."""
114
143
  standardized_regions = []
115
-
144
+
116
145
  if detect_only:
146
+ results = raw_results[0]
117
147
  # In detect_only mode, raw_results is already a list of bounding boxes
118
148
  # Each bbox is in [x_min, x_max, y_min, y_max] format
119
- if isinstance(raw_results, list):
120
- for detection in raw_results:
149
+ if isinstance(results, list):
150
+ for detection in results:
121
151
  try:
152
+ # This block expects 'detection' to be a list/tuple of 4 numbers
122
153
  if isinstance(detection, (list, tuple)) and len(detection) == 4:
123
- x_min, x_max, y_min, y_max = detection
124
- # Convert to standardized (x0, y0, x1, y1) format
125
- try:
126
- bbox = (float(x_min), float(y_min), float(x_max), float(y_max))
127
- standardized_regions.append(TextRegion(bbox, text=None, confidence=None))
128
- except (ValueError, TypeError) as e:
129
- raise ValueError(f"Invalid number format in EasyOCR detect bbox: {detection}") from e
154
+ x_min, x_max, y_min, y_max = detection
155
+ # Convert to standardized (x0, y0, x1, y1) format
156
+ try:
157
+ bbox = (float(x_min), float(y_min), float(x_max), float(y_max))
158
+ standardized_regions.append(
159
+ TextRegion(bbox, text=None, confidence=None)
160
+ )
161
+ except (ValueError, TypeError) as e:
162
+ raise ValueError(
163
+ f"Invalid number format in EasyOCR detect bbox: {detection}"
164
+ ) from e
130
165
  else:
166
+ # This is where the error is raised if 'detection' is not a list/tuple of 4 numbers
131
167
  raise ValueError(f"Invalid detection format from EasyOCR: {detection}")
132
168
  except ValueError as e:
133
169
  # Re-raise any value errors from standardization or format checks
134
170
  raise e
135
171
  except Exception as e:
136
172
  # Catch other potential processing errors
137
- raise ValueError(f"Error processing EasyOCR detection item: {detection}") from e
173
+ raise ValueError(
174
+ f"Error processing EasyOCR detection item: {detection}"
175
+ ) from e
138
176
  else:
139
- raise ValueError(f"Expected list of bounding boxes in detect_only mode, got: {raw_results}")
140
-
177
+ raise ValueError(
178
+ f"Expected list of bounding boxes in detect_only mode, got: {type(raw_results)}"
179
+ )
180
+
141
181
  return standardized_regions
142
-
182
+
143
183
  # Full OCR mode (readtext results)
144
184
  for detection in raw_results:
145
185
  try:
146
186
  # Detail mode (list/tuple result)
147
187
  if isinstance(detection, (list, tuple)) and len(detection) >= 3:
148
- bbox_raw = detection[0] # This is usually a polygon [[x1,y1],...]
188
+ bbox_raw = detection[0] # This is usually a polygon [[x1,y1],...]
149
189
  text = str(detection[1])
150
190
  confidence = float(detection[2])
151
-
191
+
152
192
  if confidence >= min_confidence:
153
193
  try:
154
194
  # Use the standard helper for polygons
155
195
  bbox = self._standardize_bbox(bbox_raw)
156
196
  standardized_regions.append(TextRegion(bbox, text, confidence))
157
197
  except ValueError as e:
158
- raise ValueError(f"Could not standardize bounding box from EasyOCR readtext: {bbox_raw}") from e
159
-
198
+ raise ValueError(
199
+ f"Could not standardize bounding box from EasyOCR readtext: {bbox_raw}"
200
+ ) from e
201
+
160
202
  # Simple mode (string result)
161
203
  elif isinstance(detection, str):
162
204
  if 0.0 >= min_confidence: # Always include if min_confidence is 0
163
205
  standardized_regions.append(TextRegion((0, 0, 0, 0), detection, 1.0))
164
206
  else:
165
207
  # Handle unexpected format in OCR mode
166
- raise ValueError(f"Invalid OCR detection format from EasyOCR readtext: {detection}")
167
-
208
+ raise ValueError(
209
+ f"Invalid OCR detection format from EasyOCR readtext: {detection}"
210
+ )
211
+
168
212
  except ValueError as e:
169
213
  # Re-raise any value errors from standardization or format checks
170
214
  raise e
171
215
  except Exception as e:
172
216
  # Catch other potential processing errors
173
217
  raise ValueError(f"Error processing EasyOCR detection item: {detection}") from e
174
-
218
+
175
219
  return standardized_regions
@@ -27,40 +27,43 @@ class PaddleOCREngine(OCREngine):
27
27
  paddleocr_installed = importlib.util.find_spec("paddleocr") is not None
28
28
  return paddle_installed and paddleocr_installed
29
29
 
30
- def _initialize_model(self, languages: List[str], device: str, options: Optional[BaseOCROptions]):
30
+ def _initialize_model(
31
+ self, languages: List[str], device: str, options: Optional[BaseOCROptions]
32
+ ):
31
33
  """Initialize the PaddleOCR model."""
32
34
  try:
33
- import paddleocr
35
+ import paddleocr
36
+
34
37
  self.logger.info("PaddleOCR module imported successfully.")
35
38
  except ImportError as e:
36
- self.logger.error(f"Failed to import PaddleOCR/PaddlePaddle: {e}")
37
- raise
38
-
39
+ self.logger.error(f"Failed to import PaddleOCR/PaddlePaddle: {e}")
40
+ raise
41
+
39
42
  # Cast to PaddleOCROptions if possible
40
43
  paddle_options = options if isinstance(options, PaddleOCROptions) else PaddleOCROptions()
41
-
44
+
42
45
  # Determine parameters
43
46
  primary_lang = languages[0] if languages else "en"
44
47
  use_gpu = "cuda" in str(device).lower()
45
-
48
+
46
49
  # Create constructor arguments
47
50
  constructor_args = {
48
51
  "lang": primary_lang,
49
52
  "use_gpu": use_gpu,
50
53
  "use_angle_cls": paddle_options.use_angle_cls,
51
- "det": True,
52
- "rec": True # We'll control recognition at process time
54
+ "det": True,
55
+ "rec": True, # We'll control recognition at process time
53
56
  }
54
-
57
+
55
58
  # Add optional parameters if available
56
59
  for param in ["det_model_dir", "rec_model_dir", "cls_model_dir", "show_log", "use_onnx"]:
57
60
  if hasattr(paddle_options, param):
58
61
  val = getattr(paddle_options, param)
59
62
  if val is not None:
60
63
  constructor_args[param] = val
61
-
64
+
62
65
  self.logger.debug(f"PaddleOCR constructor args: {constructor_args}")
63
-
66
+
64
67
  # Create the model
65
68
  try:
66
69
  self._model = paddleocr.PaddleOCR(**constructor_args)
@@ -78,31 +81,35 @@ class PaddleOCREngine(OCREngine):
78
81
  img_array_bgr = img_array_rgb[:, :, ::-1] # Convert RGB to BGR
79
82
  return img_array_bgr
80
83
 
81
- def _process_single_image(self, image: np.ndarray, detect_only: bool, options: Optional[PaddleOCROptions]) -> Any:
84
+ def _process_single_image(
85
+ self, image: np.ndarray, detect_only: bool, options: Optional[PaddleOCROptions]
86
+ ) -> Any:
82
87
  """Process a single image with PaddleOCR."""
83
88
  if self._model is None:
84
89
  raise RuntimeError("PaddleOCR model not initialized")
85
-
90
+
86
91
  # Prepare OCR arguments
87
92
  ocr_args = {}
88
93
  if options and isinstance(options, PaddleOCROptions):
89
94
  ocr_args["cls"] = options.cls if options.cls is not None else options.use_angle_cls
90
95
  ocr_args["det"] = options.det
91
96
  ocr_args["rec"] = not detect_only # Control recognition based on detect_only flag
92
-
97
+
93
98
  # Run OCR
94
99
  raw_results = self._model.ocr(image, **ocr_args)
95
100
  return raw_results
96
101
 
97
- def _standardize_results(self, raw_results: Any, min_confidence: float, detect_only: bool) -> List[TextRegion]:
102
+ def _standardize_results(
103
+ self, raw_results: Any, min_confidence: float, detect_only: bool
104
+ ) -> List[TextRegion]:
98
105
  """Convert PaddleOCR results to standardized TextRegion objects."""
99
106
  standardized_regions = []
100
-
107
+
101
108
  if not raw_results or not isinstance(raw_results, list) or len(raw_results) == 0:
102
109
  return standardized_regions
103
-
110
+
104
111
  page_results = raw_results[0] if raw_results[0] is not None else []
105
-
112
+
106
113
  for detection in page_results:
107
114
  # Initialize text and confidence
108
115
  text = None
@@ -111,20 +118,22 @@ class PaddleOCREngine(OCREngine):
111
118
 
112
119
  # Paddle always seems to return the tuple structure [bbox, (text, conf)]
113
120
  # even if rec=False. We need to parse this structure regardless.
114
- if len(detection) == 4: # Handle potential alternative format?
115
- detection = [detection, ('', 1.0)] # Treat as bbox + dummy text/conf
121
+ if len(detection) == 4: # Handle potential alternative format?
122
+ detection = [detection, ("", 1.0)] # Treat as bbox + dummy text/conf
116
123
 
117
124
  if not isinstance(detection, (list, tuple)) or len(detection) < 2:
118
125
  raise ValueError(f"Invalid detection format from PaddleOCR: {detection}")
119
-
126
+
120
127
  bbox_raw = detection[0]
121
128
  text_confidence = detection[1]
122
-
129
+
123
130
  if not isinstance(text_confidence, tuple) or len(text_confidence) < 2:
124
- # Even if detect_only, we expect the (text, conf) structure,
131
+ # Even if detect_only, we expect the (text, conf) structure,
125
132
  # it might just contain dummy values.
126
- raise ValueError(f"Invalid text/confidence structure from PaddleOCR: {text_confidence}")
127
-
133
+ raise ValueError(
134
+ f"Invalid text/confidence structure from PaddleOCR: {text_confidence}"
135
+ )
136
+
128
137
  # Extract text/conf only if not detect_only
129
138
  if not detect_only:
130
139
  text = str(text_confidence[0])
@@ -134,7 +143,9 @@ class PaddleOCREngine(OCREngine):
134
143
  try:
135
144
  bbox = self._standardize_bbox(bbox_raw)
136
145
  except ValueError as e:
137
- raise ValueError(f"Could not standardize bounding box from PaddleOCR: {bbox_raw}") from e
146
+ raise ValueError(
147
+ f"Could not standardize bounding box from PaddleOCR: {bbox_raw}"
148
+ ) from e
138
149
 
139
150
  # Append based on mode
140
151
  if detect_only:
@@ -143,5 +154,5 @@ class PaddleOCREngine(OCREngine):
143
154
  elif confidence >= min_confidence:
144
155
  # Only append if confidence meets threshold in full OCR mode
145
156
  standardized_regions.append(TextRegion(bbox, text, confidence))
146
-
157
+
147
158
  return standardized_regions
@@ -20,14 +20,16 @@ class SuryaOCREngine(OCREngine):
20
20
  self._surya_recognition = None
21
21
  self._surya_detection = None
22
22
 
23
- def _initialize_model(self, languages: List[str], device: str, options: Optional[BaseOCROptions]):
23
+ def _initialize_model(
24
+ self, languages: List[str], device: str, options: Optional[BaseOCROptions]
25
+ ):
24
26
  """Initialize Surya predictors."""
25
27
  if not self.is_available():
26
28
  raise ImportError("Surya OCR library is not installed or available.")
27
29
 
28
30
  # Store languages for use in _process_single_image
29
31
  self._langs = languages
30
-
32
+
31
33
  from surya.detection import DetectionPredictor
32
34
  from surya.recognition import RecognitionPredictor
33
35
 
@@ -41,21 +43,27 @@ class SuryaOCREngine(OCREngine):
41
43
  self._detection_predictor = self._surya_detection(**predictor_args)
42
44
  self.logger.info("Instantiating Surya RecognitionPredictor...")
43
45
  self._recognition_predictor = self._surya_recognition(**predictor_args)
44
-
46
+
45
47
  self.logger.info("Surya predictors initialized.")
46
48
 
47
49
  def _preprocess_image(self, image: Image.Image) -> Image.Image:
48
50
  """Surya uses PIL images directly, so just return the image."""
49
51
  return image
50
52
 
51
- def _process_single_image(self, image: Image.Image, detect_only: bool, options: Optional[SuryaOCROptions]) -> Any:
53
+ def _process_single_image(
54
+ self, image: Image.Image, detect_only: bool, options: Optional[SuryaOCROptions]
55
+ ) -> Any:
52
56
  """Process a single image with Surya OCR."""
53
57
  if not self._recognition_predictor or not self._detection_predictor:
54
58
  raise RuntimeError("Surya predictors are not initialized.")
55
59
 
56
60
  # Store languages instance variable during initialization to use here
57
- langs = [[lang] for lang in self._langs] if hasattr(self, '_langs') else [[self.DEFAULT_LANGUAGES[0]]]
58
-
61
+ langs = (
62
+ [[lang] for lang in self._langs]
63
+ if hasattr(self, "_langs")
64
+ else [[self.DEFAULT_LANGUAGES[0]]]
65
+ )
66
+
59
67
  # Surya expects lists of images, so we need to wrap our single image
60
68
  if detect_only:
61
69
  results = self._detection_predictor(images=[image])
@@ -63,33 +71,41 @@ class SuryaOCREngine(OCREngine):
63
71
  results = self._recognition_predictor(
64
72
  images=[image],
65
73
  langs=langs, # Use the languages set during initialization
66
- det_predictor=self._detection_predictor
74
+ det_predictor=self._detection_predictor,
67
75
  )
68
-
76
+
69
77
  # Surya may return a list with one result per image or a single result object
70
78
  # Return the result as-is and handle the extraction in _standardize_results
71
79
  return results
72
80
 
73
- def _standardize_results(self, raw_results: Any, min_confidence: float, detect_only: bool) -> List[TextRegion]:
81
+ def _standardize_results(
82
+ self, raw_results: Any, min_confidence: float, detect_only: bool
83
+ ) -> List[TextRegion]:
74
84
  """Convert Surya results to standardized TextRegion objects."""
75
85
  standardized_regions = []
76
-
86
+
77
87
  raw_result = raw_results
78
88
  if isinstance(raw_results, list) and len(raw_results) > 0:
79
89
  raw_result = raw_results[0]
80
-
81
- results = raw_result.text_lines if hasattr(raw_result, "text_lines") and not detect_only else raw_result.bboxes
90
+
91
+ results = (
92
+ raw_result.text_lines
93
+ if hasattr(raw_result, "text_lines") and not detect_only
94
+ else raw_result.bboxes
95
+ )
82
96
 
83
97
  for line in results:
84
98
  # Always extract bbox first
85
99
  try:
86
100
  # Prioritize line.bbox, fallback to line.polygon
87
- bbox_raw = line.bbox if hasattr(line, 'bbox') else getattr(line, 'polygon', None)
101
+ bbox_raw = line.bbox if hasattr(line, "bbox") else getattr(line, "polygon", None)
88
102
  if bbox_raw is None:
89
- raise ValueError("Missing bbox/polygon data")
103
+ raise ValueError("Missing bbox/polygon data")
90
104
  bbox = self._standardize_bbox(bbox_raw)
91
105
  except ValueError as e:
92
- raise ValueError(f"Could not standardize bounding box from Surya result: {bbox_raw}") from e
106
+ raise ValueError(
107
+ f"Could not standardize bounding box from Surya result: {bbox_raw}"
108
+ ) from e
93
109
 
94
110
  if detect_only:
95
111
  # For detect_only, text and confidence are None
@@ -100,7 +116,7 @@ class SuryaOCREngine(OCREngine):
100
116
  confidence = line.confidence
101
117
  if confidence >= min_confidence:
102
118
  standardized_regions.append(TextRegion(bbox, text, confidence))
103
-
119
+
104
120
  return standardized_regions
105
121
 
106
122
  def is_available(self) -> bool:
@@ -13,14 +13,14 @@ class OCRFactory:
13
13
  @staticmethod
14
14
  def create_engine(engine_type: str, **kwargs) -> OCREngine:
15
15
  """Create and return an OCR engine instance.
16
-
16
+
17
17
  Args:
18
18
  engine_type: One of 'surya', 'easyocr', 'paddle'
19
19
  **kwargs: Arguments to pass to the engine constructor
20
-
20
+
21
21
  Returns:
22
22
  An initialized OCR engine
23
-
23
+
24
24
  Raises:
25
25
  ImportError: If the required dependencies aren't installed
26
26
  ValueError: If the engine_type is unknown
@@ -28,72 +28,83 @@ class OCRFactory:
28
28
  if engine_type == "surya":
29
29
  try:
30
30
  from .engine_surya import SuryaOCREngine
31
+
31
32
  return SuryaOCREngine(**kwargs)
32
33
  except ImportError:
33
- raise ImportError("Surya engine requires the 'surya' package. "
34
- "Install with: pip install surya")
34
+ raise ImportError(
35
+ "Surya engine requires the 'surya' package. " "Install with: pip install surya"
36
+ )
35
37
  elif engine_type == "easyocr":
36
38
  try:
37
39
  from .engine_easyocr import EasyOCREngine
40
+
38
41
  return EasyOCREngine(**kwargs)
39
42
  except ImportError:
40
- raise ImportError("EasyOCR engine requires the 'easyocr' package. "
41
- "Install with: pip install easyocr")
43
+ raise ImportError(
44
+ "EasyOCR engine requires the 'easyocr' package. "
45
+ "Install with: pip install easyocr"
46
+ )
42
47
  elif engine_type == "paddle":
43
48
  try:
44
49
  from .engine_paddle import PaddleOCREngine
50
+
45
51
  return PaddleOCREngine(**kwargs)
46
52
  except ImportError:
47
- raise ImportError("PaddleOCR engine requires 'paddleocr' and 'paddlepaddle'. "
48
- "Install with: pip install paddleocr paddlepaddle")
53
+ raise ImportError(
54
+ "PaddleOCR engine requires 'paddleocr' and 'paddlepaddle'. "
55
+ "Install with: pip install paddleocr paddlepaddle"
56
+ )
49
57
  else:
50
58
  raise ValueError(f"Unknown engine type: {engine_type}")
51
-
59
+
52
60
  @staticmethod
53
61
  def list_available_engines() -> Dict[str, bool]:
54
62
  """Returns a dictionary of engine names and their availability status."""
55
63
  engines = {}
56
-
64
+
57
65
  # Check Surya
58
66
  try:
59
67
  engines["surya"] = importlib.util.find_spec("surya") is not None
60
68
  except ImportError:
61
69
  engines["surya"] = False
62
-
70
+
63
71
  # Check EasyOCR
64
72
  try:
65
73
  engines["easyocr"] = importlib.util.find_spec("easyocr") is not None
66
74
  except ImportError:
67
75
  engines["easyocr"] = False
68
-
76
+
69
77
  # Check PaddleOCR
70
78
  try:
71
- paddle = importlib.util.find_spec("paddle") is not None or importlib.util.find_spec("paddlepaddle") is not None
79
+ paddle = (
80
+ importlib.util.find_spec("paddle") is not None
81
+ or importlib.util.find_spec("paddlepaddle") is not None
82
+ )
72
83
  paddleocr = importlib.util.find_spec("paddleocr") is not None
73
84
  engines["paddle"] = paddle and paddleocr
74
85
  except ImportError:
75
86
  engines["paddle"] = False
76
-
87
+
77
88
  return engines
78
-
89
+
79
90
  @staticmethod
80
91
  def get_recommended_engine(**kwargs) -> OCREngine:
81
92
  """Returns the best available OCR engine based on what's installed.
82
-
93
+
83
94
  First tries engines in order of preference: EasyOCR, Paddle, Surya.
84
95
  If none are available, raises ImportError with installation instructions.
85
-
96
+
86
97
  Args:
87
98
  **kwargs: Arguments to pass to the engine constructor
88
-
99
+
89
100
  Returns:
90
101
  The best available OCR engine instance
91
-
102
+
92
103
  Raises:
93
104
  ImportError: If no engines are available
94
105
  """
95
106
  available = OCRFactory.list_available_engines()
96
-
107
+
97
108
  # Try engines in order of recommendation
98
109
  if available.get("easyocr", False):
99
110
  logger.info("Using EasyOCR engine (recommended)")
@@ -104,11 +115,11 @@ class OCRFactory:
104
115
  elif available.get("surya", False):
105
116
  logger.info("Using Surya OCR engine")
106
117
  return OCRFactory.create_engine("surya", **kwargs)
107
-
118
+
108
119
  # If we get here, no engines are available
109
120
  raise ImportError(
110
121
  "No OCR engines available. Please install at least one of: \n"
111
122
  "- EasyOCR (recommended): pip install easyocr\n"
112
123
  "- PaddleOCR: pip install paddleocr paddlepaddle\n"
113
124
  "- Surya OCR: pip install surya"
114
- )
125
+ )