natural-pdf 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. docs/finetuning/index.md +176 -0
  2. docs/tutorials/01-loading-and-extraction.ipynb +34 -1550
  3. natural_pdf/__init__.py +1 -0
  4. natural_pdf/analyzers/layout/gemini.py +63 -47
  5. natural_pdf/collections/pdf_collection.py +5 -2
  6. natural_pdf/core/element_manager.py +6 -4
  7. natural_pdf/core/page.py +36 -27
  8. natural_pdf/core/pdf.py +25 -16
  9. natural_pdf/elements/base.py +1 -3
  10. natural_pdf/elements/collections.py +13 -14
  11. natural_pdf/elements/region.py +7 -6
  12. natural_pdf/exporters/__init__.py +4 -0
  13. natural_pdf/exporters/base.py +61 -0
  14. natural_pdf/exporters/paddleocr.py +345 -0
  15. natural_pdf/ocr/__init__.py +16 -8
  16. natural_pdf/ocr/engine.py +46 -30
  17. natural_pdf/ocr/engine_easyocr.py +81 -40
  18. natural_pdf/ocr/engine_paddle.py +39 -28
  19. natural_pdf/ocr/engine_surya.py +32 -16
  20. natural_pdf/ocr/ocr_factory.py +34 -23
  21. natural_pdf/ocr/ocr_manager.py +15 -11
  22. natural_pdf/ocr/ocr_options.py +5 -0
  23. natural_pdf/ocr/utils.py +46 -31
  24. natural_pdf/templates/finetune/fine_tune_paddleocr.md +415 -0
  25. natural_pdf/utils/debug.py +4 -2
  26. natural_pdf/utils/identifiers.py +9 -5
  27. natural_pdf/utils/packaging.py +172 -105
  28. natural_pdf/utils/text_extraction.py +44 -64
  29. natural_pdf/utils/visualization.py +1 -1
  30. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/METADATA +5 -3
  31. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/RECORD +34 -30
  32. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/WHEEL +0 -0
  33. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/licenses/LICENSE +0 -0
  34. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/top_level.txt +0 -0
@@ -11,7 +11,7 @@ from natural_pdf.elements.base import DirectionalMixin
11
11
  # Import new utils
12
12
  from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
13
13
 
14
- from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements # Import utility
14
+ from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements # Import utility
15
15
 
16
16
  if TYPE_CHECKING:
17
17
  from natural_pdf.core.page import Page
@@ -1108,7 +1108,7 @@ class Region(DirectionalMixin):
1108
1108
 
1109
1109
  # Determine rendering resolution from parameters
1110
1110
  final_resolution = ocr_params.get("resolution")
1111
- if final_resolution is None and hasattr(self.page, '_parent') and self.page._parent:
1111
+ if final_resolution is None and hasattr(self.page, "_parent") and self.page._parent:
1112
1112
  final_resolution = getattr(self.page._parent, "_config", {}).get("resolution", 150)
1113
1113
  elif final_resolution is None:
1114
1114
  final_resolution = 150
@@ -1191,6 +1191,7 @@ class Region(DirectionalMixin):
1191
1191
  ocr_char_dict.setdefault("adv", ocr_char_dict.get("width", 0))
1192
1192
  element_data["_char_dicts"] = [ocr_char_dict]
1193
1193
  from natural_pdf.elements.text import TextElement
1194
+
1194
1195
  elem = TextElement(element_data, self.page)
1195
1196
  created_elements.append(elem)
1196
1197
  self.page._element_mgr.add_element(elem, element_type="words")
@@ -1692,7 +1693,7 @@ class Region(DirectionalMixin):
1692
1693
  def correct_ocr(
1693
1694
  self,
1694
1695
  correction_callback: Callable[[Any], Optional[str]],
1695
- ) -> "Region": # Return self for chaining
1696
+ ) -> "Region": # Return self for chaining
1696
1697
  """
1697
1698
  Applies corrections to OCR-generated text elements within this region
1698
1699
  using a user-provided callback function.
@@ -1722,9 +1723,9 @@ class Region(DirectionalMixin):
1722
1723
 
1723
1724
  # Delegate to the utility function
1724
1725
  _apply_ocr_correction_to_elements(
1725
- elements=target_elements, # Pass the ElementCollection directly
1726
+ elements=target_elements, # Pass the ElementCollection directly
1726
1727
  correction_callback=correction_callback,
1727
- caller_info=f"Region({self.bbox})", # Pass caller info
1728
+ caller_info=f"Region({self.bbox})", # Pass caller info
1728
1729
  )
1729
1730
 
1730
- return self # Return self for chaining
1731
+ return self # Return self for chaining
@@ -0,0 +1,4 @@
1
# Public API of the exporters subpackage: re-export the abstract base and
# the concrete PaddleOCR recognition exporter.
from .base import FinetuneExporter
from .paddleocr import PaddleOCRRecognitionExporter

__all__ = ["FinetuneExporter", "PaddleOCRRecognitionExporter"]
@@ -0,0 +1,61 @@
1
import abc
import logging
from typing import TYPE_CHECKING, List, Union

if TYPE_CHECKING:
    from natural_pdf.core.pdf import PDF
    from natural_pdf.collections.pdf_collection import PDFCollection

logger = logging.getLogger(__name__)


class FinetuneExporter(abc.ABC):
    """
    Abstract base class for exporters that produce fine-tuning datasets
    from one or more PDF documents.

    Concrete subclasses implement ``__init__`` (format-specific options)
    and ``export`` (writing the dataset to disk).
    """

    @abc.abstractmethod
    def __init__(self, **kwargs):
        """
        Initialize the exporter with format-specific options.
        """
        pass

    @abc.abstractmethod
    def export(self, source: Union["PDF", "PDFCollection", List["PDF"]], output_dir: str, **kwargs):
        """
        Exports the data from the source PDF(s) to the specified output directory
        in a format suitable for fine-tuning a specific model type.

        Args:
            source: The PDF object, PDFCollection, or list of PDF objects to process.
            output_dir: The path to the directory where the exported files will be saved.
            **kwargs: Additional export-time arguments.
        """
        pass

    def _resolve_source_pdfs(
        self, source: Union["PDF", "PDFCollection", List["PDF"]]
    ) -> List["PDF"]:
        """
        Normalize *source* (single PDF, PDFCollection, or list of PDFs)
        into a plain list of PDF objects.

        Raises:
            TypeError: If *source* is not one of the supported types.
        """
        # Imported here, not at module level, to avoid a circular import.
        from natural_pdf.core.pdf import PDF
        from natural_pdf.collections.pdf_collection import PDFCollection

        if isinstance(source, PDF):
            resolved: List["PDF"] = [source]
        elif isinstance(source, PDFCollection):
            resolved = source.pdfs
        elif isinstance(source, list) and all(isinstance(p, PDF) for p in source):
            resolved = source
        else:
            raise TypeError(
                f"Unsupported source type: {type(source)}. Must be PDF, PDFCollection, or List[PDF]."
            )

        # An empty list is not an error here; callers decide how to react.
        if not resolved:
            logger.warning("No PDF documents provided in the source.")

        return resolved
@@ -0,0 +1,345 @@
1
+ import os
2
+ import logging
3
+ import random
4
+ import shutil
5
+ from typing import Union, List, Optional, TYPE_CHECKING, Set, Tuple
6
+ from tqdm import tqdm
7
+
8
+ from natural_pdf.exporters.base import FinetuneExporter
9
+
10
+ # Need to import this utility
11
+ from natural_pdf.utils.identifiers import generate_short_path_hash
12
+
13
+ if TYPE_CHECKING:
14
+ from natural_pdf.core.pdf import PDF
15
+ from natural_pdf.collections.pdf_collection import PDFCollection
16
+ from natural_pdf.elements.text import TextElement
17
+
18
logger = logging.getLogger(__name__)

# Default selector used when corrected_only=True: matches text elements whose
# source attribute starts with "manifest" (i.e. imported from a correction manifest).
DEFAULT_SELECTOR_CORRECTED = "text[source^=manifest]"  # Match manifest-import etc.


class PaddleOCRRecognitionExporter(FinetuneExporter):
    """
    Exports data for fine-tuning a PaddleOCR text recognition model.

    Creates a directory structure with cropped text images and label files
    (`train.txt`, `val.txt`, or `label.txt`) suitable for PaddleOCR training.
    Optionally includes a Jupyter Notebook guide for fine-tuning on Colab.
    """

    def __init__(
        self,
        resolution: int = 150,
        padding: int = 2,
        selector: Optional[str] = None,
        corrected_only: bool = False,
        split_ratio: Optional[float] = 0.9,
        include_guide: bool = True,
        random_seed: Optional[int] = 42,
    ):
        """
        Initialize the PaddleOCR Recognition Exporter.

        Args:
            resolution: DPI resolution for rendering text region images (default: 150).
            padding: Padding (in points) to add around text element bbox before cropping (default: 2).
            selector: CSS-like selector to filter which TextElements to export.
                      If None and corrected_only is False, all 'text' elements are considered.
            corrected_only: If True, overrides selector and exports only elements likely
                            originating from a correction manifest (selector="text[source^=manifest]").
                            (default: False).
            split_ratio: Ratio for splitting data into training/validation sets (e.g., 0.9 for 90% train).
                         If None, creates a single `label.txt` file (default: 0.9).
            include_guide: If True, includes a template Jupyter Notebook guide for fine-tuning
                           in the output directory (default: True).
            random_seed: Seed for the random number generator used for train/val split shuffling,
                         ensuring reproducibility (default: 42).
        """
        # corrected_only takes precedence over an explicit selector; warn so the
        # caller knows their selector was ignored.
        if corrected_only and selector:
            logger.warning(
                f"Both 'corrected_only=True' and 'selector=\"{selector}\"' were provided. "
                f"Using corrected_only=True (selector='{DEFAULT_SELECTOR_CORRECTED}')."
            )
            self.selector = DEFAULT_SELECTOR_CORRECTED
        elif corrected_only:
            self.selector = DEFAULT_SELECTOR_CORRECTED
        elif selector:
            self.selector = selector
        else:
            self.selector = "text"  # Default to all text elements if nothing else specified

        self.resolution = resolution
        self.padding = padding
        self.split_ratio = split_ratio
        self.include_guide = include_guide
        self.random_seed = random_seed

        logger.info(
            f"Initialized PaddleOCRRecognitionExporter: selector='{self.selector}', resolution={resolution}, "
            f"padding={padding}, split_ratio={split_ratio}, include_guide={include_guide}"
        )

    def export(
        self,
        source: Union["PDF", "PDFCollection", List["PDF"]],
        output_dir: str,
        **kwargs,  # Allow for potential future args
    ):
        """
        Exports text elements from the source PDF(s) to the specified output directory
        in PaddleOCR text recognition format.

        Args:
            source: The PDF object, PDFCollection, or list of PDF objects to process.
            output_dir: The path to the directory where the exported files will be saved.
                        The directory will be created if it doesn't exist.
            **kwargs: Optional keyword arguments (currently unused).
        """
        # --- 1. Setup and Validation ---
        pdfs_to_process = self._resolve_source_pdfs(source)
        if not pdfs_to_process:
            logger.error("No valid PDF sources found. Aborting export.")
            return

        try:
            os.makedirs(output_dir, exist_ok=True)
            images_dir = os.path.join(output_dir, "images")
            os.makedirs(images_dir, exist_ok=True)
        except OSError as e:
            logger.error(f"Failed to create output directory '{output_dir}': {e}", exc_info=True)
            raise

        # --- 2. Collect Elements and Render Images ---
        labels: List[Tuple[str, str]] = []  # List of (relative_image_path, text_label)
        char_set: Set[str] = set()
        elements_processed = 0
        elements_skipped = 0

        logger.info(
            f"Processing {len(pdfs_to_process)} PDF(s) to find elements matching selector: '{self.selector}'"
        )

        for pdf in tqdm(pdfs_to_process, desc="Processing PDFs"):
            # Need to ensure pdf.path exists and is string
            if not hasattr(pdf, "path") or not isinstance(pdf.path, str):
                logger.warning(f"Skipping PDF object without a valid path attribute: {pdf}")
                continue
            pdf_hash = generate_short_path_hash(pdf.path)
            try:
                # Find elements using the specified selector
                # Need to check if pdf has find_all method
                if not hasattr(pdf, "find_all"):
                    logger.warning(
                        f"PDF object {pdf.path} does not have find_all method. Skipping."
                    )
                    continue

                elements = pdf.find_all(
                    self.selector, apply_exclusions=False
                )  # Usually want all text, even if excluded
                if not elements:
                    logger.debug(f"No elements matching '{self.selector}' found in {pdf.path}")
                    continue

                for i, element in enumerate(
                    tqdm(
                        elements,
                        desc=f"Exporting '{os.path.basename(pdf.path)}'",
                        leave=False,
                        position=1,
                    )
                ):
                    # Ensure it's a TextElement with necessary methods/attributes
                    # Removed check for to_image as it's called after expand()
                    if not (
                        hasattr(element, "page")
                        and hasattr(element, "text")
                        and hasattr(element, "expand")
                    ):
                        logger.warning(f"Skipping invalid/non-text element {i} in {pdf.path}")
                        elements_skipped += 1
                        continue

                    element_text = element.text
                    # Skip elements with no text, non-string text, newlines, or tabs.
                    # Tabs must be excluded because the label files are tab-delimited
                    # (image_path\tlabel); a tab inside the label would corrupt the
                    # column structure of train.txt/val.txt/label.txt.
                    if (
                        not element_text
                        or not isinstance(element_text, str)
                        or "\n" in element_text
                        or "\t" in element_text
                    ):
                        text_str = str(element_text)
                        if "\n" in text_str:
                            reason = "contains newline"
                        elif "\t" in text_str:
                            reason = "contains tab (conflicts with label file delimiter)"
                        elif not element_text:
                            reason = "empty text"
                        else:
                            reason = "invalid text type"
                        logger.debug(
                            f"Skipping element {i} in {pdf.path} page {getattr(element.page, 'number', 'N/A')} because {reason}."
                        )
                        elements_skipped += 1
                        continue

                    # Use page index if available, otherwise fallback or skip? Fallback to 0 for now.
                    page_index = getattr(element.page, "index", 0)
                    image_filename = f"{pdf_hash}_p{page_index}_e{i}.png"
                    relative_image_path = os.path.join("images", image_filename)
                    absolute_image_path = os.path.join(output_dir, relative_image_path)

                    try:
                        # Expand region, render, and save image
                        region = element.expand(self.padding)
                        img = region.to_image(
                            resolution=self.resolution, crop_only=True, include_highlights=False
                        )
                        img.save(absolute_image_path, "PNG")

                        # Add to labels and character set
                        labels.append(
                            (relative_image_path.replace(os.path.sep, "/"), element_text)
                        )  # Use forward slashes for labels
                        char_set.update(element_text)
                        elements_processed += 1

                    except Exception as e:
                        page_num_str = getattr(
                            element.page, "number", "N/A"
                        )  # Get page number safely
                        logger.error(
                            f"Failed to process/save image for element {i} in {pdf.path} page {page_num_str}: {e}",
                            exc_info=False,  # Keep log cleaner
                        )
                        elements_skipped += 1

            except Exception as e:
                logger.error(f"Failed to process PDF {pdf.path}: {e}", exc_info=True)
                # Continue with other PDFs if possible

        if elements_processed == 0:
            logger.error(
                f"No text elements were successfully processed and exported matching '{self.selector}'. Aborting."
            )
            # Clean up potentially created directories? Or leave them empty? Let's leave them.
            return

        logger.info(f"Processed {elements_processed} text elements, skipped {elements_skipped}.")

        # --- 3. Generate Dictionary File (`dict.txt`) ---
        dict_path = os.path.join(output_dir, "dict.txt")
        try:
            # Log the character set before sorting/writing
            logger.debug(f"Exporter final char_set before sorting: {repr(char_set)}")
            # PaddleOCR typically doesn't require special tokens like <UNK> or <BLK> in the dict
            # for recognition models, but this might depend on the specific base model.
            # Start with just the characters found.
            # NOTE(review): characters are written in *descending* order; confirm that
            # reverse ordering is intentional for the target base model's dict format.
            sorted_chars = sorted(list(char_set), reverse=True)
            with open(dict_path, "w", encoding="utf-8") as f_dict:
                for char in sorted_chars:
                    # Ensure we don't write empty strings or just newlines as dictionary entries
                    if char and char != "\n":
                        f_dict.write(char + "\n")
            logger.info(f"Created dictionary file with {len(sorted_chars)} characters: {dict_path}")
        except Exception as e:
            logger.error(f"Failed to write dictionary file '{dict_path}': {e}", exc_info=True)
            raise  # Re-raise as this is critical

        # --- 4. Generate Label Files (`train.txt`, `val.txt` or `label.txt`) ---
        if self.split_ratio is not None and 0 < self.split_ratio < 1:
            # Shuffle deterministically (when a seed is set) before splitting.
            if self.random_seed is not None:
                random.seed(self.random_seed)
            random.shuffle(labels)
            split_index = int(len(labels) * self.split_ratio)
            train_labels = labels[:split_index]
            val_labels = labels[split_index:]

            try:
                train_path = os.path.join(output_dir, "train.txt")
                with open(train_path, "w", encoding="utf-8") as f_train:
                    for img_path, text in train_labels:
                        f_train.write(f"{img_path}\t{text}\n")  # Use literal tabs and newlines
                logger.info(
                    f"Created training label file with {len(train_labels)} entries: {train_path}"
                )

                val_path = os.path.join(output_dir, "val.txt")
                with open(val_path, "w", encoding="utf-8") as f_val:
                    for img_path, text in val_labels:
                        f_val.write(f"{img_path}\t{text}\n")  # Use literal tabs and newlines
                logger.info(
                    f"Created validation label file with {len(val_labels)} entries: {val_path}"
                )
            except Exception as e:
                logger.error(f"Failed to write train/validation label files: {e}", exc_info=True)
                raise
        else:
            # Create a single label file
            label_path = os.path.join(output_dir, "label.txt")
            try:
                with open(label_path, "w", encoding="utf-8") as f_label:
                    for img_path, text in labels:
                        f_label.write(f"{img_path}\t{text}\n")  # Use literal tabs and newlines
                logger.info(f"Created single label file with {len(labels)} entries: {label_path}")
            except Exception as e:
                logger.error(f"Failed to write label file '{label_path}': {e}", exc_info=True)
                raise

        # --- 5. Include Guide Notebook ---
        if self.include_guide:
            self._copy_guide_notebook(output_dir)

        logger.info(f"PaddleOCR recognition data export completed successfully to '{output_dir}'.")

    def _copy_guide_notebook(self, output_dir: str):
        """Locates, converts (md->ipynb), and copies the guide notebook."""
        try:
            # Try importing conversion library
            import jupytext
            from nbformat import write as write_notebook
        except ImportError:
            # Guide generation is optional; degrade gracefully if deps are absent.
            logger.warning(
                "Could not import 'jupytext' or 'nbformat'. Skipping guide notebook generation. "
                "Install with 'pip install natural-pdf[dev]' or 'pip install jupytext nbformat'."
            )
            return

        try:
            # Locate the template .md file relative to this script
            exporter_dir = os.path.dirname(os.path.abspath(__file__))
            # Go up two levels (exporters -> natural_pdf) then down to templates/finetune
            template_dir = os.path.abspath(
                os.path.join(exporter_dir, "..", "templates", "finetune")
            )
            template_md_path = os.path.join(template_dir, "fine_tune_paddleocr.md")
            output_ipynb_path = os.path.join(output_dir, "fine_tune_paddleocr.ipynb")

            if not os.path.exists(template_md_path):
                logger.error(
                    f"Guide template not found at expected location: {template_md_path}. Trying alternate path."
                )
                # Try path relative to workspace root as fallback if run from project root
                alt_template_path = os.path.abspath(
                    os.path.join("natural_pdf", "templates", "finetune", "fine_tune_paddleocr.md")
                )
                if os.path.exists(alt_template_path):
                    template_md_path = alt_template_path
                    logger.info(f"Found guide template at alternate path: {template_md_path}")
                else:
                    logger.error(
                        f"Guide template also not found at: {alt_template_path}. Cannot copy guide."
                    )
                    return

            # Convert Markdown to Notebook object using jupytext
            logger.debug(f"Reading guide template from: {template_md_path}")
            notebook = jupytext.read(template_md_path)  # Reads md and returns NotebookNode

            # Write the Notebook object to the output .ipynb file
            logger.debug(f"Writing guide notebook to: {output_ipynb_path}")
            with open(output_ipynb_path, "w", encoding="utf-8") as f_nb:
                write_notebook(notebook, f_nb)

            logger.info(f"Copied and converted fine-tuning guide notebook to: {output_ipynb_path}")

        except Exception as e:
            logger.error(f"Failed to copy/convert guide notebook: {e}", exc_info=True)
@@ -11,7 +11,13 @@ logger = logging.getLogger("natural_pdf.ocr")
11
11
 
12
12
  # Import the base classes that are always available
13
13
  from .engine import OCREngine
14
- from .ocr_options import OCROptions, BaseOCROptions, EasyOCROptions, PaddleOCROptions, SuryaOCROptions
14
+ from .ocr_options import (
15
+ OCROptions,
16
+ BaseOCROptions,
17
+ EasyOCROptions,
18
+ PaddleOCROptions,
19
+ SuryaOCROptions,
20
+ )
15
21
  from .ocr_manager import OCRManager
16
22
  from .ocr_factory import OCRFactory
17
23
 
@@ -22,13 +28,14 @@ __all__ = [
22
28
  "OCROptions",
23
29
  "BaseOCROptions",
24
30
  "EasyOCROptions",
25
- "PaddleOCROptions",
31
+ "PaddleOCROptions",
26
32
  "SuryaOCROptions",
27
33
  "OCRFactory",
28
34
  "get_engine",
29
- "list_available_engines"
35
+ "list_available_engines",
30
36
  ]
31
37
 
38
+
32
39
  def get_engine(engine_name=None, **kwargs):
33
40
  """
34
41
  Get OCR engine by name with graceful handling of missing dependencies.
@@ -40,27 +47,27 @@ def get_engine(engine_name=None, **kwargs):
40
47
 
41
48
  Returns:
42
49
  OCREngine instance
43
-
50
+
44
51
  Raises:
45
52
  ImportError: If the requested engine's dependencies aren't installed
46
53
  ValueError: If the engine_name is unknown
47
54
  """
48
55
  logger.debug(f"Initializing OCR engine: {engine_name or 'best available'}")
49
-
56
+
50
57
  try:
51
58
  if engine_name is None or engine_name == "default":
52
59
  # Use the factory to get the best available engine
53
60
  engine = OCRFactory.get_recommended_engine(**kwargs)
54
61
  logger.info(f"Using recommended OCR engine: {engine.__class__.__name__}")
55
62
  return engine
56
-
63
+
57
64
  # Use the factory to create a specific engine
58
65
  normalized_name = engine_name.lower()
59
66
  if normalized_name in ["easyocr", "paddle", "surya"]:
60
67
  return OCRFactory.create_engine(normalized_name, **kwargs)
61
68
  else:
62
69
  raise ValueError(f"Unknown OCR engine: {engine_name}")
63
-
70
+
64
71
  except ImportError as e:
65
72
  logger.error(f"OCR engine dependency error: {e}")
66
73
  raise
@@ -68,10 +75,11 @@ def get_engine(engine_name=None, **kwargs):
68
75
  logger.error(f"Error initializing OCR engine: {e}")
69
76
  raise
70
77
 
78
+
71
79
  def list_available_engines():
72
80
  """
73
81
  List all available OCR engines.
74
-
82
+
75
83
  Returns:
76
84
  Dict[str, bool]: Dictionary mapping engine names to availability status
77
85
  """