natural-pdf 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. natural_pdf/__init__.py +1 -0
  2. natural_pdf/analyzers/layout/base.py +1 -5
  3. natural_pdf/analyzers/layout/gemini.py +61 -51
  4. natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
  5. natural_pdf/analyzers/layout/layout_manager.py +26 -84
  6. natural_pdf/analyzers/layout/layout_options.py +7 -0
  7. natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
  8. natural_pdf/analyzers/layout/surya.py +46 -123
  9. natural_pdf/analyzers/layout/tatr.py +51 -4
  10. natural_pdf/analyzers/text_structure.py +3 -5
  11. natural_pdf/analyzers/utils.py +3 -3
  12. natural_pdf/classification/manager.py +241 -158
  13. natural_pdf/classification/mixin.py +52 -38
  14. natural_pdf/classification/results.py +71 -45
  15. natural_pdf/collections/mixins.py +85 -20
  16. natural_pdf/collections/pdf_collection.py +245 -100
  17. natural_pdf/core/element_manager.py +30 -14
  18. natural_pdf/core/highlighting_service.py +13 -22
  19. natural_pdf/core/page.py +423 -101
  20. natural_pdf/core/pdf.py +694 -195
  21. natural_pdf/elements/base.py +134 -40
  22. natural_pdf/elements/collections.py +610 -134
  23. natural_pdf/elements/region.py +659 -90
  24. natural_pdf/elements/text.py +1 -1
  25. natural_pdf/export/mixin.py +137 -0
  26. natural_pdf/exporters/base.py +3 -3
  27. natural_pdf/exporters/paddleocr.py +4 -3
  28. natural_pdf/extraction/manager.py +50 -49
  29. natural_pdf/extraction/mixin.py +90 -57
  30. natural_pdf/extraction/result.py +9 -23
  31. natural_pdf/ocr/__init__.py +5 -5
  32. natural_pdf/ocr/engine_doctr.py +346 -0
  33. natural_pdf/ocr/ocr_factory.py +24 -4
  34. natural_pdf/ocr/ocr_manager.py +61 -25
  35. natural_pdf/ocr/ocr_options.py +70 -10
  36. natural_pdf/ocr/utils.py +6 -4
  37. natural_pdf/search/__init__.py +20 -34
  38. natural_pdf/search/haystack_search_service.py +309 -265
  39. natural_pdf/search/haystack_utils.py +99 -75
  40. natural_pdf/search/search_service_protocol.py +11 -12
  41. natural_pdf/selectors/parser.py +219 -143
  42. natural_pdf/utils/debug.py +3 -3
  43. natural_pdf/utils/identifiers.py +1 -1
  44. natural_pdf/utils/locks.py +1 -1
  45. natural_pdf/utils/packaging.py +8 -6
  46. natural_pdf/utils/text_extraction.py +24 -16
  47. natural_pdf/utils/tqdm_utils.py +18 -10
  48. natural_pdf/utils/visualization.py +18 -0
  49. natural_pdf/widgets/viewer.py +4 -25
  50. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/METADATA +12 -3
  51. natural_pdf-0.1.10.dist-info/RECORD +80 -0
  52. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/WHEEL +1 -1
  53. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/top_level.txt +0 -2
  54. docs/api/index.md +0 -386
  55. docs/assets/favicon.png +0 -3
  56. docs/assets/favicon.svg +0 -3
  57. docs/assets/javascripts/custom.js +0 -17
  58. docs/assets/logo.svg +0 -3
  59. docs/assets/sample-screen.png +0 -0
  60. docs/assets/social-preview.png +0 -17
  61. docs/assets/social-preview.svg +0 -17
  62. docs/assets/stylesheets/custom.css +0 -65
  63. docs/categorizing-documents/index.md +0 -168
  64. docs/data-extraction/index.md +0 -87
  65. docs/document-qa/index.ipynb +0 -435
  66. docs/document-qa/index.md +0 -79
  67. docs/element-selection/index.ipynb +0 -969
  68. docs/element-selection/index.md +0 -249
  69. docs/finetuning/index.md +0 -176
  70. docs/index.md +0 -189
  71. docs/installation/index.md +0 -69
  72. docs/interactive-widget/index.ipynb +0 -962
  73. docs/interactive-widget/index.md +0 -12
  74. docs/layout-analysis/index.ipynb +0 -818
  75. docs/layout-analysis/index.md +0 -185
  76. docs/ocr/index.md +0 -256
  77. docs/pdf-navigation/index.ipynb +0 -314
  78. docs/pdf-navigation/index.md +0 -97
  79. docs/regions/index.ipynb +0 -816
  80. docs/regions/index.md +0 -294
  81. docs/tables/index.ipynb +0 -658
  82. docs/tables/index.md +0 -144
  83. docs/text-analysis/index.ipynb +0 -370
  84. docs/text-analysis/index.md +0 -105
  85. docs/text-extraction/index.ipynb +0 -1478
  86. docs/text-extraction/index.md +0 -292
  87. docs/tutorials/01-loading-and-extraction.ipynb +0 -1873
  88. docs/tutorials/01-loading-and-extraction.md +0 -95
  89. docs/tutorials/02-finding-elements.ipynb +0 -417
  90. docs/tutorials/02-finding-elements.md +0 -149
  91. docs/tutorials/03-extracting-blocks.ipynb +0 -152
  92. docs/tutorials/03-extracting-blocks.md +0 -48
  93. docs/tutorials/04-table-extraction.ipynb +0 -119
  94. docs/tutorials/04-table-extraction.md +0 -50
  95. docs/tutorials/05-excluding-content.ipynb +0 -275
  96. docs/tutorials/05-excluding-content.md +0 -109
  97. docs/tutorials/06-document-qa.ipynb +0 -337
  98. docs/tutorials/06-document-qa.md +0 -91
  99. docs/tutorials/07-layout-analysis.ipynb +0 -293
  100. docs/tutorials/07-layout-analysis.md +0 -66
  101. docs/tutorials/07-working-with-regions.ipynb +0 -414
  102. docs/tutorials/07-working-with-regions.md +0 -151
  103. docs/tutorials/08-spatial-navigation.ipynb +0 -513
  104. docs/tutorials/08-spatial-navigation.md +0 -190
  105. docs/tutorials/09-section-extraction.ipynb +0 -2439
  106. docs/tutorials/09-section-extraction.md +0 -256
  107. docs/tutorials/10-form-field-extraction.ipynb +0 -517
  108. docs/tutorials/10-form-field-extraction.md +0 -201
  109. docs/tutorials/11-enhanced-table-processing.ipynb +0 -59
  110. docs/tutorials/11-enhanced-table-processing.md +0 -9
  111. docs/tutorials/12-ocr-integration.ipynb +0 -3712
  112. docs/tutorials/12-ocr-integration.md +0 -137
  113. docs/tutorials/13-semantic-search.ipynb +0 -1718
  114. docs/tutorials/13-semantic-search.md +0 -77
  115. docs/visual-debugging/index.ipynb +0 -2970
  116. docs/visual-debugging/index.md +0 -157
  117. docs/visual-debugging/region.png +0 -0
  118. natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -420
  119. natural_pdf/templates/spa/css/style.css +0 -334
  120. natural_pdf/templates/spa/index.html +0 -31
  121. natural_pdf/templates/spa/js/app.js +0 -472
  122. natural_pdf/templates/spa/words.txt +0 -235976
  123. natural_pdf/widgets/frontend/viewer.js +0 -88
  124. natural_pdf-0.1.8.dist-info/RECORD +0 -156
  125. notebooks/Examples.ipynb +0 -1293
  126. pdfs/.gitkeep +0 -0
  127. pdfs/01-practice.pdf +0 -543
  128. pdfs/0500000US42001.pdf +0 -0
  129. pdfs/0500000US42007.pdf +0 -0
  130. pdfs/2014 Statistics.pdf +0 -0
  131. pdfs/2019 Statistics.pdf +0 -0
  132. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  133. pdfs/needs-ocr.pdf +0 -0
  134. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/licenses/LICENSE +0 -0
natural_pdf/ocr/__init__.py

@@ -11,15 +11,15 @@ logger = logging.getLogger("natural_pdf.ocr")
 
 # Import the base classes that are always available
 from .engine import OCREngine
+from .ocr_factory import OCRFactory
+from .ocr_manager import OCRManager
 from .ocr_options import (
-    OCROptions,
     BaseOCROptions,
     EasyOCROptions,
+    OCROptions,
     PaddleOCROptions,
     SuryaOCROptions,
 )
-from .ocr_manager import OCRManager
-from .ocr_factory import OCRFactory
 
 # Add all public symbols that should be available when importing this module
 __all__ = [
@@ -41,7 +41,7 @@ def get_engine(engine_name=None, **kwargs):
     Get OCR engine by name with graceful handling of missing dependencies.
 
     Args:
-        engine_name: Name of the engine to use ('easyocr', 'paddle', 'surya')
+        engine_name: Name of the engine to use ('easyocr', 'paddle', 'surya', 'doctr')
            If None, the best available engine is used
        **kwargs: Additional arguments to pass to the engine constructor
 
@@ -63,7 +63,7 @@ def get_engine(engine_name=None, **kwargs):
 
     # Use the factory to create a specific engine
     normalized_name = engine_name.lower()
-    if normalized_name in ["easyocr", "paddle", "surya"]:
+    if normalized_name in ["easyocr", "paddle", "surya", "doctr"]:
         return OCRFactory.create_engine(normalized_name, **kwargs)
     else:
         raise ValueError(f"Unknown OCR engine: {engine_name}")
natural_pdf/ocr/engine_doctr.py (new file)

@@ -0,0 +1,346 @@
+# natural_pdf/ocr/engine_doctr.py
+import importlib.util
+import logging
+from typing import Any, List, Optional
+
+import numpy as np
+from PIL import Image
+
+from .engine import OCREngine, TextRegion
+from .ocr_options import BaseOCROptions, DoctrOCROptions
+
+logger = logging.getLogger(__name__)
+
+
+class DoctrOCREngine(OCREngine):
+    """docTR engine implementation."""
+
+    def __init__(self):
+        super().__init__()
+        self._model = None  # Will hold the doctr ocr_predictor
+        self._detection_model = None  # Will hold detection_predictor if detect_only is used
+        self._orientation_model = None  # Will hold page_orientation_predictor if enabled
+
+    def is_available(self) -> bool:
+        """Check if doctr is installed."""
+        return importlib.util.find_spec("doctr") is not None
+
+    def _initialize_model(
+        self, languages: List[str], device: str, options: Optional[BaseOCROptions]
+    ):
+        """Initialize the doctr model."""
+        if not self.is_available():
+            raise ImportError(
+                "Doctr engine requires the 'python-doctr' package. "
+                "Install with: pip install python-doctr[torch] or python-doctr[tf]"
+            )
+
+        try:
+            import doctr.models
+
+            self.logger.info("doctr.models imported successfully.")
+        except ImportError as e:
+            self.logger.error(f"Failed to import doctr: {e}")
+            raise
+
+        # Cast to DoctrOCROptions or use default
+        doctr_opts = options if isinstance(options, DoctrOCROptions) else DoctrOCROptions()
+
+        # Check if CUDA is available in device string
+        use_cuda = device.lower().startswith("cuda") if device else False
+
+        # Prepare OCR predictor arguments
+        predictor_args = {
+            "det_arch": doctr_opts.det_arch,
+            "reco_arch": doctr_opts.reco_arch,
+            "pretrained": doctr_opts.pretrained,
+            "assume_straight_pages": doctr_opts.assume_straight_pages,
+            "export_as_straight_boxes": doctr_opts.export_as_straight_boxes,
+        }
+        # Filter out None values
+        predictor_args = {k: v for k, v in predictor_args.items() if v is not None}
+
+        self.logger.debug(f"doctr ocr_predictor constructor args: {predictor_args}")
+        try:
+            # Create the main OCR predictor (doesn't accept batch_size)
+            self._model = doctr.models.ocr_predictor(**predictor_args)
+
+            # Apply CUDA if available
+            if use_cuda:
+                self._model = self._model.cuda()
+
+            self.logger.info("doctr ocr_predictor created successfully")
+
+            # Now initialize the detection-only model
+            try:
+                detection_args = {
+                    "arch": doctr_opts.det_arch,
+                    "pretrained": doctr_opts.pretrained,
+                    "assume_straight_pages": doctr_opts.assume_straight_pages,
+                    "symmetric_pad": doctr_opts.symmetric_pad,
+                    "preserve_aspect_ratio": doctr_opts.preserve_aspect_ratio,
+                    "batch_size": doctr_opts.batch_size,
+                }
+                self._detection_model = doctr.models.detection_predictor(**detection_args)
+
+                # Apply CUDA if available
+                if use_cuda:
+                    self._detection_model = self._detection_model.cuda()
+
+                # Configure postprocessing parameters if provided
+                if doctr_opts.bin_thresh is not None:
+                    self._detection_model.model.postprocessor.bin_thresh = doctr_opts.bin_thresh
+                if doctr_opts.box_thresh is not None:
+                    self._detection_model.model.postprocessor.box_thresh = doctr_opts.box_thresh
+
+                self.logger.info("doctr detection_predictor created successfully")
+            except Exception as e:
+                self.logger.error(f"Failed to create detection_predictor: {e}")
+                self._detection_model = None
+
+            # Initialize orientation predictor if enabled
+            if doctr_opts.use_orientation_predictor:
+                try:
+                    self._orientation_model = doctr.models.page_orientation_predictor(
+                        pretrained=True, batch_size=doctr_opts.batch_size
+                    )
+                    if use_cuda:
+                        self._orientation_model = self._orientation_model.cuda()
+                    self.logger.info("doctr page_orientation_predictor created successfully")
+                except Exception as e:
+                    self.logger.error(f"Failed to create page_orientation_predictor: {e}")
+                    self._orientation_model = None
+
+        except Exception as e:
+            self.logger.error(f"Failed to create doctr models: {e}")
+            raise
+
+        # Doctr doesn't explicitly use language list in ocr_predictor initialization
+        if languages and languages != [self.DEFAULT_LANGUAGES[0]]:
+            logger.warning(
+                f"Doctr engine currently doesn't support language selection during initialization. Using its default language capabilities for model: {doctr_opts.reco_arch}"
+            )
+
+    def _preprocess_image(self, image: Image.Image) -> np.ndarray:
+        """Convert PIL Image to RGB numpy array for doctr."""
+        # Ensure the image is in RGB mode
+        if image.mode != "RGB":
+            image = image.convert("RGB")
+        # Convert to numpy array
+        return np.array(image)
+
+    def _process_single_image(
+        self, image: np.ndarray, detect_only: bool, options: Optional[DoctrOCROptions]
+    ) -> Any:
+        """Process a single image with doctr."""
+        if self._model is None:
+            raise RuntimeError("Doctr model not initialized")
+
+        # Capture image dimensions for denormalization
+        height, width = image.shape[:2]
+
+        # Cast options to DoctrOCROptions or use default
+        doctr_opts = options if isinstance(options, DoctrOCROptions) else DoctrOCROptions()
+
+        # Check if we need to detect orientation first
+        if self._orientation_model is not None and options and options.use_orientation_predictor:
+            try:
+                # Process with orientation predictor
+                # For orientation predictor, we need to pass a batch of images
+                orientations = self._orientation_model([image])
+                orientation = orientations[1][0]  # Get the orientation angle
+                logger.info(f"Detected page orientation: {orientation} degrees")
+                # Note: doctr handles rotation internally for detection/recognition
+            except Exception as e:
+                logger.error(f"Error detecting orientation: {e}")
+
+        # Process differently based on detect_only flag
+        if detect_only and self._detection_model is not None:
+            try:
+                # Apply threshold settings at runtime for this detection
+                if doctr_opts.bin_thresh is not None:
+                    original_bin_thresh = self._detection_model.model.postprocessor.bin_thresh
+                    self._detection_model.model.postprocessor.bin_thresh = doctr_opts.bin_thresh
+                    logger.debug(f"Temporarily set bin_thresh to {doctr_opts.bin_thresh}")
+
+                if doctr_opts.box_thresh is not None:
+                    original_box_thresh = self._detection_model.model.postprocessor.box_thresh
+                    self._detection_model.model.postprocessor.box_thresh = doctr_opts.box_thresh
+                    logger.debug(f"Temporarily set box_thresh to {doctr_opts.box_thresh}")
+
+                # Use the dedicated detection model with a list of numpy arrays
+                result = self._detection_model([image])
+
+                # Restore original thresholds
+                if doctr_opts.bin_thresh is not None:
+                    self._detection_model.model.postprocessor.bin_thresh = original_bin_thresh
+
+                if doctr_opts.box_thresh is not None:
+                    self._detection_model.model.postprocessor.box_thresh = original_box_thresh
+
+                # Return tuple of (result, dimensions)
+                return (result, (height, width))
+            except Exception as e:
+                logger.error(f"Error in detection_predictor: {e}")
+                # Fall back to OCR predictor if detection fails
+                logger.warning("Falling back to OCR predictor for detection")
+
+        # Process with full OCR model, passing a list of numpy arrays directly
+        try:
+            # For full OCR, we should also apply the thresholds
+            if (
+                detect_only
+                and doctr_opts.bin_thresh is not None
+                and hasattr(self._model.det_predictor.model.postprocessor, "bin_thresh")
+            ):
+                original_bin_thresh = self._model.det_predictor.model.postprocessor.bin_thresh
+                self._model.det_predictor.model.postprocessor.bin_thresh = doctr_opts.bin_thresh
+
+            if (
+                detect_only
+                and doctr_opts.box_thresh is not None
+                and hasattr(self._model.det_predictor.model.postprocessor, "box_thresh")
+            ):
+                original_box_thresh = self._model.det_predictor.model.postprocessor.box_thresh
+                self._model.det_predictor.model.postprocessor.box_thresh = doctr_opts.box_thresh
+
+            result = self._model([image])
+
+            # Restore original thresholds
+            if (
+                detect_only
+                and doctr_opts.bin_thresh is not None
+                and hasattr(self._model.det_predictor.model.postprocessor, "bin_thresh")
+            ):
+                self._model.det_predictor.model.postprocessor.bin_thresh = original_bin_thresh
+
+            if (
+                detect_only
+                and doctr_opts.box_thresh is not None
+                and hasattr(self._model.det_predictor.model.postprocessor, "box_thresh")
+            ):
+                self._model.det_predictor.model.postprocessor.box_thresh = original_box_thresh
+
+            # Return tuple of (result, dimensions)
+            return (result, (height, width))
+        except Exception as e:
+            logger.error(f"Error in OCR prediction: {e}")
+            raise
+
+    def _standardize_results(
+        self, raw_results: Any, min_confidence: float, detect_only: bool
+    ) -> List[TextRegion]:
+        """Convert doctr results to standardized TextRegion objects."""
+        standardized_regions = []
+
+        # Extract results and dimensions
+        if isinstance(raw_results, tuple) and len(raw_results) == 2:
+            results, dimensions = raw_results
+            image_height, image_width = dimensions
+        else:
+            # Fallback if dimensions aren't provided
+            results = raw_results
+            image_width = 1
+            image_height = 1
+            logger.warning("Image dimensions not provided, using normalized coordinates")
+
+        # Handle detection-only results differently
+        if detect_only and self._detection_model is not None and not hasattr(results, "pages"):
+            # Import doctr utils for detach_scores if needed
+            try:
+                from doctr.utils.geometry import detach_scores
+            except ImportError:
+                logger.error("Failed to import doctr.utils.geometry.detach_scores")
+                return standardized_regions
+
+            # Extract coordinates and scores from detection results
+            for result in results:
+                # Detection results structure is different from ocr_predictor
+                if "words" in result:
+                    try:
+                        # Detach the coordinates and scores
+                        detached_coords, prob_scores = detach_scores([result.get("words")])
+
+                        for i, coords in enumerate(detached_coords[0]):
+                            score = (
+                                prob_scores[0][i]
+                                if prob_scores and len(prob_scores[0]) > i
+                                else 0.0
+                            )
+
+                            if score >= min_confidence:
+                                try:
+                                    # Handle both straight and rotated boxes
+                                    if coords.shape == (
+                                        4,
+                                    ):  # Straight box as [xmin, ymin, xmax, ymax]
+                                        xmin, ymin, xmax, ymax = coords.tolist()
+                                        # Denormalize coordinates
+                                        bbox = (
+                                            float(xmin * image_width),
+                                            float(ymin * image_height),
+                                            float(xmax * image_width),
+                                            float(ymax * image_height),
+                                        )
+                                    else:  # Polygon points
+                                        # Get bounding box from polygon
+                                        coords_list = coords.tolist()
+                                        x_coords = [p[0] * image_width for p in coords_list]
+                                        y_coords = [p[1] * image_height for p in coords_list]
+                                        bbox = (
+                                            float(min(x_coords)),
+                                            float(min(y_coords)),
+                                            float(max(x_coords)),
+                                            float(max(y_coords)),
+                                        )
+
+                                    # In detection mode, we don't have text or confidence score
+                                    standardized_regions.append(TextRegion(bbox, None, score))
+                                except Exception as e:
+                                    logger.error(f"Error processing detection result: {e}")
+                    except Exception as e:
+                        logger.error(f"Error detaching scores: {e}")
+
+            return standardized_regions
+
+        # Process standard OCR results
+        if not hasattr(results, "pages") or not results.pages:
+            logger.warning("Doctr result object does not contain pages.")
+            return standardized_regions
+
+        # Process results page by page (we typically process one image at a time)
+        for page in results.pages:
+            # Extract information from blocks, lines, words
+            for block in page.blocks:
+                for line in block.lines:
+                    for word in line.words:
+                        if word.confidence >= min_confidence:
+                            try:
+                                # doctr geometry is ((x_min, y_min), (x_max, y_max)) as relative coordinates
+                                x_min, y_min = word.geometry[0]
+                                x_max, y_max = word.geometry[1]
+
+                                # Denormalize coordinates to absolute pixel values
+                                bbox = (
+                                    float(x_min * image_width),
+                                    float(y_min * image_height),
+                                    float(x_max * image_width),
+                                    float(y_max * image_height),
+                                )
+
+                                # Skip text content if detect_only is True
+                                text = None if detect_only else word.value
+                                confidence = None if detect_only else word.confidence
+
+                                standardized_regions.append(TextRegion(bbox, text, confidence))
+                            except (ValueError, TypeError, IndexError) as e:
+                                logger.error(
+                                    f"Could not standardize bounding box/word from doctr result: {word}"
+                                )
+                                logger.error(f"Error: {e}")
+
+        return standardized_regions
+
+    def get_default_options(self) -> DoctrOCROptions:
+        """Return the default options specific to this engine."""
+        return DoctrOCROptions()
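The new engine reads its configuration from DoctrOCROptions (detection/recognition architectures, postprocessing thresholds, orientation handling). A usage sketch with assumed example values; the field names are the ones engine_doctr.py accesses, and the snippet assumes DoctrOCROptions accepts them as keyword arguments in the same way as the other *OCROptions classes:

    from natural_pdf.ocr.ocr_options import DoctrOCROptions

    opts = DoctrOCROptions(
        det_arch="db_resnet50",          # docTR detection architecture (example value)
        reco_arch="crnn_vgg16_bn",       # docTR recognition architecture (example value)
        bin_thresh=0.3,                  # lower binarization threshold -> more candidate pixels
        box_thresh=0.1,                  # keep lower-scoring candidate boxes
        use_orientation_predictor=True,  # also run docTR's page orientation predictor
    )

Note that bin_thresh and box_thresh are applied to the detection postprocessor at call time and restored afterwards, so one options object can be reused across differently tuned runs.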
natural_pdf/ocr/ocr_factory.py

@@ -1,6 +1,6 @@
-import logging
 import importlib.util
-from typing import Dict, Any, Optional, Type, Union, List
+import logging
+from typing import Any, Dict, List, Optional, Type, Union
 
 from .engine import OCREngine
 
@@ -15,7 +15,7 @@ class OCRFactory:
         """Create and return an OCR engine instance.
 
         Args:
-            engine_type: One of 'surya', 'easyocr', 'paddle'
+            engine_type: One of 'surya', 'easyocr', 'paddle', 'doctr'
            **kwargs: Arguments to pass to the engine constructor
 
        Returns:
@@ -54,6 +54,16 @@ class OCRFactory:
                 "PaddleOCR engine requires 'paddleocr' and 'paddlepaddle'. "
                 "Install with: pip install paddleocr paddlepaddle"
             )
+        elif engine_type == "doctr":
+            try:
+                from .engine_doctr import DoctrOCREngine
+
+                return DoctrOCREngine(**kwargs)
+            except ImportError:
+                raise ImportError(
+                    "Doctr engine requires the 'python-doctr' package. "
+                    "Install with: pip install python-doctr[torch] or python-doctr[tf]"
+                )
         else:
             raise ValueError(f"Unknown engine type: {engine_type}")
 
@@ -85,13 +95,19 @@ class OCRFactory:
         except ImportError:
             engines["paddle"] = False
 
+        # Check Doctr
+        try:
+            engines["doctr"] = importlib.util.find_spec("doctr") is not None
+        except ImportError:
+            engines["doctr"] = False
+
         return engines
 
     @staticmethod
     def get_recommended_engine(**kwargs) -> OCREngine:
         """Returns the best available OCR engine based on what's installed.
 
-        First tries engines in order of preference: EasyOCR, Paddle, Surya.
+        First tries engines in order of preference: EasyOCR, Doctr, Paddle, Surya.
         If none are available, raises ImportError with installation instructions.
 
         Args:
@@ -109,6 +125,9 @@ class OCRFactory:
         if available.get("easyocr", False):
             logger.info("Using EasyOCR engine (recommended)")
             return OCRFactory.create_engine("easyocr", **kwargs)
+        elif available.get("doctr", False):
+            logger.info("Using Doctr engine")
+            return OCRFactory.create_engine("doctr", **kwargs)
         elif available.get("paddle", False):
             logger.info("Using PaddleOCR engine")
             return OCRFactory.create_engine("paddle", **kwargs)
@@ -120,6 +139,7 @@ class OCRFactory:
         raise ImportError(
             "No OCR engines available. Please install at least one of: \n"
             "- EasyOCR (recommended): pip install easyocr\n"
+            "- Doctr: pip install python-doctr[torch] or python-doctr[tf]\n"
            "- PaddleOCR: pip install paddleocr paddlepaddle\n"
            "- Surya OCR: pip install surya"
        )
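For callers that bypass the top-level helper, OCRFactory exposes the same choice directly. A short sketch using only names visible in this diff (assuming natural-pdf 0.1.10 is installed):

    from natural_pdf.ocr import OCRFactory

    # Ask for docTR explicitly; a missing dependency raises ImportError with install hints.
    doctr_engine = OCRFactory.create_engine("doctr")

    # Or let the factory pick by its preference order: EasyOCR, Doctr, Paddle, Surya.
    engine = OCRFactory.get_recommended_engine()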
natural_pdf/ocr/ocr_manager.py

@@ -1,19 +1,26 @@
 # ocr_manager.py
 import copy  # For deep copying options
 import logging
+import threading  # Import threading for lock
+import time  # Import time for timing
 from typing import Any, Dict, List, Optional, Type, Union
-import threading  # Import threading for lock
-import time  # Import time for timing
 
 from PIL import Image
 
 # Import engine classes and options
 from .engine import OCREngine
+from .engine_doctr import DoctrOCREngine
 from .engine_easyocr import EasyOCREngine
 from .engine_paddle import PaddleOCREngine
 from .engine_surya import SuryaOCREngine
-from .ocr_options import OCROptions
-from .ocr_options import BaseOCROptions, EasyOCROptions, PaddleOCROptions, SuryaOCROptions
+from .ocr_options import (
+    BaseOCROptions,
+    DoctrOCROptions,
+    EasyOCROptions,
+    OCROptions,
+    PaddleOCROptions,
+    SuryaOCROptions,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -25,15 +32,20 @@ class OCRManager:
     ENGINE_REGISTRY: Dict[str, Dict[str, Any]] = {
         "easyocr": {"class": EasyOCREngine, "options_class": EasyOCROptions},
         "paddle": {"class": PaddleOCREngine, "options_class": PaddleOCROptions},
-        "surya": {"class": SuryaOCREngine, "options_class": SuryaOCROptions},  # <-- Add Surya
+        "surya": {"class": SuryaOCREngine, "options_class": SuryaOCROptions},
+        "doctr": {"class": DoctrOCREngine, "options_class": DoctrOCROptions},
         # Add other engines here
     }
 
     def __init__(self):
         """Initializes the OCR Manager."""
         self._engine_instances: Dict[str, OCREngine] = {}  # Cache for engine instances
-        self._engine_locks: Dict[str, threading.Lock] = {}  # Lock per engine type for initialization
-        self._engine_inference_locks: Dict[str, threading.Lock] = {}  # Lock per engine type for inference
+        self._engine_locks: Dict[str, threading.Lock] = (
+            {}
+        )  # Lock per engine type for initialization
+        self._engine_inference_locks: Dict[str, threading.Lock] = (
+            {}
+        )  # Lock per engine type for inference
         logger.info("OCRManager initialized.")
 
     def _get_engine_instance(self, engine_name: str) -> OCREngine:
@@ -51,7 +63,7 @@ class OCRManager:
         # Get or create the lock for this engine type
         if engine_name not in self._engine_locks:
             self._engine_locks[engine_name] = threading.Lock()
-
+
         engine_init_lock = self._engine_locks[engine_name]
 
         # Acquire lock to safely check and potentially initialize the engine
@@ -61,9 +73,11 @@ class OCRManager:
                 return self._engine_instances[engine_name]
 
            # If still not initialized, create it now under the lock
-            logger.info(f"[{threading.current_thread().name}] Creating shared instance of engine: {engine_name}")
+            logger.info(
+                f"[{threading.current_thread().name}] Creating shared instance of engine: {engine_name}"
+            )
            engine_class = self.ENGINE_REGISTRY[engine_name]["class"]
-            start_time = time.monotonic() # Optional: time initialization
+            start_time = time.monotonic()  # Optional: time initialization
            try:
                engine_instance = engine_class()  # Instantiate first
                if not engine_instance.is_available():
@@ -75,14 +89,20 @@ class OCRManager:
                 # Store the shared instance
                 self._engine_instances[engine_name] = engine_instance
                 end_time = time.monotonic()
-                logger.info(f"[{threading.current_thread().name}] Shared instance of {engine_name} created successfully (Duration: {end_time - start_time:.2f}s).")
+                logger.info(
+                    f"[{threading.current_thread().name}] Shared instance of {engine_name} created successfully (Duration: {end_time - start_time:.2f}s)."
+                )
                 return engine_instance
             except Exception as e:
-                # Ensure we don't leave a partial state if init fails
-                logger.error(f"[{threading.current_thread().name}] Failed to create shared instance of {engine_name}: {e}", exc_info=True)
-                # Remove potentially partial entry if exists
-                if engine_name in self._engine_instances: del self._engine_instances[engine_name]
-                raise # Re-raise the exception after logging
+                # Ensure we don't leave a partial state if init fails
+                logger.error(
+                    f"[{threading.current_thread().name}] Failed to create shared instance of {engine_name}: {e}",
+                    exc_info=True,
+                )
+                # Remove potentially partial entry if exists
+                if engine_name in self._engine_instances:
+                    del self._engine_instances[engine_name]
+                raise  # Re-raise the exception after logging
 
     def _get_engine_inference_lock(self, engine_name: str) -> threading.Lock:
         """Gets or creates the inference lock for a given engine type."""
@@ -112,7 +132,7 @@ class OCRManager:
 
         Args:
             images: A single PIL Image or a list of PIL Images to process.
-            engine: Name of the engine (e.g., 'easyocr', 'paddle', 'surya').
+            engine: Name of the engine (e.g., 'easyocr', 'paddle', 'surya', 'doctr').
                    Defaults to 'easyocr' if not specified.
            languages: List of language codes (e.g., ['en', 'fr'], ['en', 'german']).
                       **Passed directly to the engine.** Must be codes understood
@@ -169,27 +189,41 @@ class OCRManager:
         processing_mode = "batch" if is_batch else "single image"
         # Log thread name for clarity during parallel calls
         thread_id = threading.current_thread().name
-        logger.info(f"[{thread_id}] Processing {processing_mode} using shared engine instance '{selected_engine_name}'...")
+        logger.info(
+            f"[{thread_id}] Processing {processing_mode} using shared engine instance '{selected_engine_name}'..."
+        )
         logger.debug(
             f" Engine Args: languages={languages}, min_confidence={min_confidence}, device={device}, options={final_options}"
         )
 
         # Log image dimensions before processing
         if is_batch:
-            image_dims = [f"{img.width}x{img.height}" for img in images if hasattr(img, 'width') and hasattr(img, 'height')]
-            logger.debug(f"[{thread_id}] Processing batch of {len(images)} images with dimensions: {image_dims}")
-        elif hasattr(images, 'width') and hasattr(images, 'height'):
-            logger.debug(f"[{thread_id}] Processing single image with dimensions: {images.width}x{images.height}")
+            image_dims = [
+                f"{img.width}x{img.height}"
+                for img in images
+                if hasattr(img, "width") and hasattr(img, "height")
+            ]
+            logger.debug(
+                f"[{thread_id}] Processing batch of {len(images)} images with dimensions: {image_dims}"
+            )
+        elif hasattr(images, "width") and hasattr(images, "height"):
+            logger.debug(
+                f"[{thread_id}] Processing single image with dimensions: {images.width}x{images.height}"
+            )
         else:
             logger.warning(f"[{thread_id}] Could not determine dimensions of input image(s).")
 
         # Acquire lock specifically for the inference call
         inference_lock = self._get_engine_inference_lock(selected_engine_name)
-        logger.debug(f"[{thread_id}] Attempting to acquire inference lock for {selected_engine_name}...")
+        logger.debug(
+            f"[{thread_id}] Attempting to acquire inference lock for {selected_engine_name}..."
+        )
         inference_wait_start = time.monotonic()
         with inference_lock:
             inference_acquired_time = time.monotonic()
-            logger.debug(f"[{thread_id}] Acquired inference lock for {selected_engine_name} (waited {inference_acquired_time - inference_wait_start:.2f}s). Calling process_image...")
+            logger.debug(
+                f"[{thread_id}] Acquired inference lock for {selected_engine_name} (waited {inference_acquired_time - inference_wait_start:.2f}s). Calling process_image..."
+            )
             inference_start_time = time.monotonic()
 
             results = engine_instance.process_image(
@@ -201,7 +235,9 @@ class OCRManager:
                 options=final_options,
             )
             inference_end_time = time.monotonic()
-            logger.debug(f"[{thread_id}] process_image call finished for {selected_engine_name} (Duration: {inference_end_time - inference_start_time:.2f}s). Releasing lock.")
+            logger.debug(
+                f"[{thread_id}] process_image call finished for {selected_engine_name} (Duration: {inference_end_time - inference_start_time:.2f}s). Releasing lock."
+            )
 
         # Log result summary based on mode
         if is_batch:
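Most of the remaining churn in ocr_manager.py is black re-wrapping, but the structure it preserves is a double-checked locking scheme: one per-engine lock guards one-time construction of the shared engine instance, and a separate per-engine lock serializes inference calls into engines that are not thread-safe. A self-contained sketch of the same construction idea, using generic hypothetical names rather than the natural_pdf classes:

    import threading
    from typing import Callable, Dict

    _instances: Dict[str, object] = {}
    _locks: Dict[str, threading.Lock] = {}
    _registry_lock = threading.Lock()

    def get_shared_instance(name: str, factory: Callable[[], object]) -> object:
        """Run factory() at most once per name, even with concurrent callers."""
        if name in _instances:  # fast path: already built, no locking needed
            return _instances[name]
        with _registry_lock:  # create the per-name lock exactly once
            lock = _locks.setdefault(name, threading.Lock())
        with lock:  # slow path: re-check before building
            if name not in _instances:
                _instances[name] = factory()
            return _instances[name]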