natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. natural_pdf/__init__.py +3 -0
  2. natural_pdf/analyzers/layout/base.py +1 -5
  3. natural_pdf/analyzers/layout/gemini.py +61 -51
  4. natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
  5. natural_pdf/analyzers/layout/layout_manager.py +26 -84
  6. natural_pdf/analyzers/layout/layout_options.py +7 -0
  7. natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
  8. natural_pdf/analyzers/layout/surya.py +46 -123
  9. natural_pdf/analyzers/layout/tatr.py +51 -4
  10. natural_pdf/analyzers/text_structure.py +3 -5
  11. natural_pdf/analyzers/utils.py +3 -3
  12. natural_pdf/classification/manager.py +422 -0
  13. natural_pdf/classification/mixin.py +163 -0
  14. natural_pdf/classification/results.py +80 -0
  15. natural_pdf/collections/mixins.py +111 -0
  16. natural_pdf/collections/pdf_collection.py +434 -15
  17. natural_pdf/core/element_manager.py +83 -0
  18. natural_pdf/core/highlighting_service.py +13 -22
  19. natural_pdf/core/page.py +578 -93
  20. natural_pdf/core/pdf.py +912 -460
  21. natural_pdf/elements/base.py +134 -40
  22. natural_pdf/elements/collections.py +712 -109
  23. natural_pdf/elements/region.py +722 -69
  24. natural_pdf/elements/text.py +4 -1
  25. natural_pdf/export/mixin.py +137 -0
  26. natural_pdf/exporters/base.py +3 -3
  27. natural_pdf/exporters/paddleocr.py +5 -4
  28. natural_pdf/extraction/manager.py +135 -0
  29. natural_pdf/extraction/mixin.py +279 -0
  30. natural_pdf/extraction/result.py +23 -0
  31. natural_pdf/ocr/__init__.py +5 -5
  32. natural_pdf/ocr/engine_doctr.py +346 -0
  33. natural_pdf/ocr/engine_easyocr.py +6 -3
  34. natural_pdf/ocr/ocr_factory.py +24 -4
  35. natural_pdf/ocr/ocr_manager.py +122 -26
  36. natural_pdf/ocr/ocr_options.py +94 -11
  37. natural_pdf/ocr/utils.py +19 -6
  38. natural_pdf/qa/document_qa.py +0 -4
  39. natural_pdf/search/__init__.py +20 -34
  40. natural_pdf/search/haystack_search_service.py +309 -265
  41. natural_pdf/search/haystack_utils.py +99 -75
  42. natural_pdf/search/search_service_protocol.py +11 -12
  43. natural_pdf/selectors/parser.py +431 -230
  44. natural_pdf/utils/debug.py +3 -3
  45. natural_pdf/utils/identifiers.py +1 -1
  46. natural_pdf/utils/locks.py +8 -0
  47. natural_pdf/utils/packaging.py +8 -6
  48. natural_pdf/utils/text_extraction.py +60 -1
  49. natural_pdf/utils/tqdm_utils.py +51 -0
  50. natural_pdf/utils/visualization.py +18 -0
  51. natural_pdf/widgets/viewer.py +4 -25
  52. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
  53. natural_pdf-0.1.9.dist-info/RECORD +80 -0
  54. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
  55. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
  56. docs/api/index.md +0 -386
  57. docs/assets/favicon.png +0 -3
  58. docs/assets/favicon.svg +0 -3
  59. docs/assets/javascripts/custom.js +0 -17
  60. docs/assets/logo.svg +0 -3
  61. docs/assets/sample-screen.png +0 -0
  62. docs/assets/social-preview.png +0 -17
  63. docs/assets/social-preview.svg +0 -17
  64. docs/assets/stylesheets/custom.css +0 -65
  65. docs/document-qa/index.ipynb +0 -435
  66. docs/document-qa/index.md +0 -79
  67. docs/element-selection/index.ipynb +0 -915
  68. docs/element-selection/index.md +0 -229
  69. docs/finetuning/index.md +0 -176
  70. docs/index.md +0 -170
  71. docs/installation/index.md +0 -69
  72. docs/interactive-widget/index.ipynb +0 -962
  73. docs/interactive-widget/index.md +0 -12
  74. docs/layout-analysis/index.ipynb +0 -818
  75. docs/layout-analysis/index.md +0 -185
  76. docs/ocr/index.md +0 -209
  77. docs/pdf-navigation/index.ipynb +0 -314
  78. docs/pdf-navigation/index.md +0 -97
  79. docs/regions/index.ipynb +0 -816
  80. docs/regions/index.md +0 -294
  81. docs/tables/index.ipynb +0 -658
  82. docs/tables/index.md +0 -144
  83. docs/text-analysis/index.ipynb +0 -370
  84. docs/text-analysis/index.md +0 -105
  85. docs/text-extraction/index.ipynb +0 -1478
  86. docs/text-extraction/index.md +0 -292
  87. docs/tutorials/01-loading-and-extraction.ipynb +0 -194
  88. docs/tutorials/01-loading-and-extraction.md +0 -95
  89. docs/tutorials/02-finding-elements.ipynb +0 -340
  90. docs/tutorials/02-finding-elements.md +0 -149
  91. docs/tutorials/03-extracting-blocks.ipynb +0 -147
  92. docs/tutorials/03-extracting-blocks.md +0 -48
  93. docs/tutorials/04-table-extraction.ipynb +0 -114
  94. docs/tutorials/04-table-extraction.md +0 -50
  95. docs/tutorials/05-excluding-content.ipynb +0 -270
  96. docs/tutorials/05-excluding-content.md +0 -109
  97. docs/tutorials/06-document-qa.ipynb +0 -332
  98. docs/tutorials/06-document-qa.md +0 -91
  99. docs/tutorials/07-layout-analysis.ipynb +0 -288
  100. docs/tutorials/07-layout-analysis.md +0 -66
  101. docs/tutorials/07-working-with-regions.ipynb +0 -413
  102. docs/tutorials/07-working-with-regions.md +0 -151
  103. docs/tutorials/08-spatial-navigation.ipynb +0 -508
  104. docs/tutorials/08-spatial-navigation.md +0 -190
  105. docs/tutorials/09-section-extraction.ipynb +0 -2434
  106. docs/tutorials/09-section-extraction.md +0 -256
  107. docs/tutorials/10-form-field-extraction.ipynb +0 -512
  108. docs/tutorials/10-form-field-extraction.md +0 -201
  109. docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
  110. docs/tutorials/11-enhanced-table-processing.md +0 -9
  111. docs/tutorials/12-ocr-integration.ipynb +0 -604
  112. docs/tutorials/12-ocr-integration.md +0 -175
  113. docs/tutorials/13-semantic-search.ipynb +0 -1328
  114. docs/tutorials/13-semantic-search.md +0 -77
  115. docs/visual-debugging/index.ipynb +0 -2970
  116. docs/visual-debugging/index.md +0 -157
  117. docs/visual-debugging/region.png +0 -0
  118. natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
  119. natural_pdf/templates/spa/css/style.css +0 -334
  120. natural_pdf/templates/spa/index.html +0 -31
  121. natural_pdf/templates/spa/js/app.js +0 -472
  122. natural_pdf/templates/spa/words.txt +0 -235976
  123. natural_pdf/widgets/frontend/viewer.js +0 -88
  124. natural_pdf-0.1.7.dist-info/RECORD +0 -145
  125. notebooks/Examples.ipynb +0 -1293
  126. pdfs/.gitkeep +0 -0
  127. pdfs/01-practice.pdf +0 -543
  128. pdfs/0500000US42001.pdf +0 -0
  129. pdfs/0500000US42007.pdf +0 -0
  130. pdfs/2014 Statistics.pdf +0 -0
  131. pdfs/2019 Statistics.pdf +0 -0
  132. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  133. pdfs/needs-ocr.pdf +0 -0
  134. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,346 @@
1
+ # natural_pdf/ocr/engine_doctr.py
2
+ import importlib.util
3
+ import logging
4
+ from typing import Any, List, Optional
5
+
6
+ import numpy as np
7
+ from PIL import Image
8
+
9
+ from .engine import OCREngine, TextRegion
10
+ from .ocr_options import BaseOCROptions, DoctrOCROptions
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class DoctrOCREngine(OCREngine):
    """OCR engine backed by docTR (``python-doctr``).

    Up to three docTR predictors are held by one engine instance:

    * ``_model`` -- the full ``ocr_predictor`` (detection + recognition).
    * ``_detection_model`` -- a standalone ``detection_predictor`` used for
      ``detect_only`` requests.
    * ``_orientation_model`` -- an optional ``page_orientation_predictor``.
    """

    # Postprocessor attributes that DoctrOCROptions may override per call.
    _THRESHOLD_NAMES = ("bin_thresh", "box_thresh")

    def __init__(self):
        """Create an engine with no predictors loaded yet."""
        super().__init__()
        self._model = None  # Will hold the doctr ocr_predictor
        self._detection_model = None  # Will hold detection_predictor if detect_only is used
        self._orientation_model = None  # Will hold page_orientation_predictor if enabled

    def is_available(self) -> bool:
        """Check if doctr is installed (without importing it)."""
        return importlib.util.find_spec("doctr") is not None

    def _initialize_model(
        self, languages: List[str], device: str, options: Optional[BaseOCROptions]
    ):
        """Create the docTR predictors described by *options*.

        Args:
            languages: Requested language codes. They are not passed to docTR;
                anything other than the engine's default language only
                produces a warning.
            device: Device string. Values starting with ``"cuda"`` move every
                predictor to the GPU via ``.cuda()``.
            options: Engine options. Anything that is not a
                ``DoctrOCROptions`` is replaced by ``DoctrOCROptions()``.

        Raises:
            ImportError: If docTR is not installed or fails to import.
            Exception: Re-raised unchanged when the main ``ocr_predictor``
                cannot be created. Failures of the detection or orientation
                predictors are logged and leave that predictor as ``None``.
        """
        if not self.is_available():
            raise ImportError(
                "Doctr engine requires the 'python-doctr' package. "
                "Install with: pip install python-doctr[torch] or python-doctr[tf]"
            )

        try:
            import doctr.models

            self.logger.info("doctr.models imported successfully.")
        except ImportError as e:
            self.logger.error(f"Failed to import doctr: {e}")
            raise

        # Cast to DoctrOCROptions or use default
        doctr_opts = options if isinstance(options, DoctrOCROptions) else DoctrOCROptions()

        # NOTE(review): only the "cuda" prefix is honoured; a device index such
        # as "cuda:1" is not forwarded to .cuda() -- confirm this is intended.
        use_cuda = device.lower().startswith("cuda") if device else False

        # Prepare OCR predictor arguments, dropping unset (None) options so
        # docTR's own defaults apply.
        predictor_args = {
            "det_arch": doctr_opts.det_arch,
            "reco_arch": doctr_opts.reco_arch,
            "pretrained": doctr_opts.pretrained,
            "assume_straight_pages": doctr_opts.assume_straight_pages,
            "export_as_straight_boxes": doctr_opts.export_as_straight_boxes,
        }
        predictor_args = {k: v for k, v in predictor_args.items() if v is not None}

        self.logger.debug(f"doctr ocr_predictor constructor args: {predictor_args}")
        try:
            # Create the main OCR predictor (doesn't accept batch_size)
            self._model = doctr.models.ocr_predictor(**predictor_args)

            # Apply CUDA if available
            if use_cuda:
                self._model = self._model.cuda()

            self.logger.info("doctr ocr_predictor created successfully")

            # Now initialize the detection-only model. A failure here is not
            # fatal: detect_only requests fall back to the full OCR predictor.
            try:
                detection_args = {
                    "arch": doctr_opts.det_arch,
                    "pretrained": doctr_opts.pretrained,
                    "assume_straight_pages": doctr_opts.assume_straight_pages,
                    "symmetric_pad": doctr_opts.symmetric_pad,
                    "preserve_aspect_ratio": doctr_opts.preserve_aspect_ratio,
                    "batch_size": doctr_opts.batch_size,
                }
                # Same None-filtering as for the OCR predictor above, so an
                # unset option never reaches docTR as an explicit None.
                detection_args = {k: v for k, v in detection_args.items() if v is not None}
                self._detection_model = doctr.models.detection_predictor(**detection_args)

                # Apply CUDA if available
                if use_cuda:
                    self._detection_model = self._detection_model.cuda()

                # Configure postprocessing parameters if provided
                if doctr_opts.bin_thresh is not None:
                    self._detection_model.model.postprocessor.bin_thresh = doctr_opts.bin_thresh
                if doctr_opts.box_thresh is not None:
                    self._detection_model.model.postprocessor.box_thresh = doctr_opts.box_thresh

                self.logger.info("doctr detection_predictor created successfully")
            except Exception as e:
                self.logger.error(f"Failed to create detection_predictor: {e}")
                self._detection_model = None

            # Initialize orientation predictor if enabled
            if doctr_opts.use_orientation_predictor:
                try:
                    self._orientation_model = doctr.models.page_orientation_predictor(
                        pretrained=True, batch_size=doctr_opts.batch_size
                    )
                    if use_cuda:
                        self._orientation_model = self._orientation_model.cuda()
                    self.logger.info("doctr page_orientation_predictor created successfully")
                except Exception as e:
                    self.logger.error(f"Failed to create page_orientation_predictor: {e}")
                    self._orientation_model = None

        except Exception as e:
            self.logger.error(f"Failed to create doctr models: {e}")
            raise

        # Doctr doesn't explicitly use language list in ocr_predictor initialization
        if languages and languages != [self.DEFAULT_LANGUAGES[0]]:
            logger.warning(
                f"Doctr engine currently doesn't support language selection during initialization. Using its default language capabilities for model: {doctr_opts.reco_arch}"
            )

    def _preprocess_image(self, image: Image.Image) -> np.ndarray:
        """Convert PIL Image to RGB numpy array for doctr."""
        # Ensure the image is in RGB mode
        if image.mode != "RGB":
            image = image.convert("RGB")
        # Convert to numpy array
        return np.array(image)

    @classmethod
    def _override_thresholds(cls, postprocessor: Any, doctr_opts: Any) -> dict:
        """Apply non-None thresholds from *doctr_opts*; return the values replaced."""
        saved = {}
        for name in cls._THRESHOLD_NAMES:
            value = getattr(doctr_opts, name)
            if value is not None and hasattr(postprocessor, name):
                saved[name] = getattr(postprocessor, name)
                setattr(postprocessor, name, value)
                logger.debug(f"Temporarily set {name} to {value}")
        return saved

    @staticmethod
    def _restore_thresholds(postprocessor: Any, saved: dict) -> None:
        """Put back the threshold values captured by ``_override_thresholds``."""
        for name, value in saved.items():
            setattr(postprocessor, name, value)

    def _process_single_image(
        self, image: np.ndarray, detect_only: bool, options: Optional[DoctrOCROptions]
    ) -> Any:
        """Run docTR on one image.

        Args:
            image: Image array; its first two axes are read as
                ``(height, width)``.
            detect_only: Prefer the standalone detection predictor; if it is
                missing or fails, the full OCR predictor is used instead.
            options: Per-call options. Anything that is not a
                ``DoctrOCROptions`` falls back to ``DoctrOCROptions()``.

        Returns:
            ``(result, (height, width))`` where ``result`` is whatever the
            predictor returned and the dimensions allow later denormalization.

        Raises:
            RuntimeError: If the OCR predictor has not been initialized.
            Exception: Re-raised from the full OCR predictor after logging.
        """
        if self._model is None:
            raise RuntimeError("Doctr model not initialized")

        # Capture image dimensions for denormalization
        height, width = image.shape[:2]

        # Cast options to DoctrOCROptions or use default
        doctr_opts = options if isinstance(options, DoctrOCROptions) else DoctrOCROptions()

        # Orientation is only detected when the caller's own options ask for
        # it. getattr keeps option objects without the attribute from raising.
        if self._orientation_model is not None and getattr(
            options, "use_orientation_predictor", False
        ):
            try:
                # The orientation predictor expects a batch of images
                orientations = self._orientation_model([image])
                orientation = orientations[1][0]  # Get the orientation angle
                # The angle is only logged; the image is not rotated here.
                logger.info(f"Detected page orientation: {orientation} degrees")
            except Exception as e:
                logger.error(f"Error detecting orientation: {e}")

        # Process differently based on detect_only flag
        if detect_only and self._detection_model is not None:
            try:
                postprocessor = self._detection_model.model.postprocessor
                saved = self._override_thresholds(postprocessor, doctr_opts)
                try:
                    # Use the dedicated detection model with a list of numpy arrays
                    result = self._detection_model([image])
                finally:
                    # Restore even on failure: the predictor is shared state.
                    self._restore_thresholds(postprocessor, saved)

                # Return tuple of (result, dimensions)
                return (result, (height, width))
            except Exception as e:
                logger.error(f"Error in detection_predictor: {e}")
                # Fall back to OCR predictor if detection fails
                logger.warning("Falling back to OCR predictor for detection")

        # Process with full OCR model, passing a list of numpy arrays directly
        try:
            postprocessor = None
            saved = {}
            # NOTE(review): thresholds are overridden on the full predictor only
            # for detect_only calls (i.e. the fallback path); plain OCR calls
            # keep the predictor's own thresholds -- confirm this is intended.
            if detect_only and (
                doctr_opts.bin_thresh is not None or doctr_opts.box_thresh is not None
            ):
                postprocessor = self._model.det_predictor.model.postprocessor
                saved = self._override_thresholds(postprocessor, doctr_opts)

            try:
                result = self._model([image])
            finally:
                if postprocessor is not None:
                    self._restore_thresholds(postprocessor, saved)

            # Return tuple of (result, dimensions)
            return (result, (height, width))
        except Exception as e:
            logger.error(f"Error in OCR prediction: {e}")
            raise

    @staticmethod
    def _denormalize_bbox(geometry: Any, image_width: float, image_height: float) -> tuple:
        """Turn relative docTR geometry into an absolute ``(x0, y0, x1, y1)`` box.

        Accepts either a flat ``[xmin, ymin, xmax, ymax]`` sequence or a
        sequence of ``(x, y)`` points (two corners or a polygon); for points
        the axis-aligned bounding box of all of them is returned.

        Raises:
            ValueError: If a flat sequence does not have exactly 4 values or
                the point list is empty.
            IndexError: If the points do not have two coordinates each.
        """
        points = np.asarray(geometry, dtype=float)
        if points.ndim == 1:
            if points.shape != (4,):
                raise ValueError(f"Unsupported box geometry: {geometry}")
            x0, y0, x1, y1 = points.tolist()
        else:
            xs, ys = points[:, 0], points[:, 1]
            x0, y0, x1, y1 = float(xs.min()), float(ys.min()), float(xs.max()), float(ys.max())
        return (
            float(x0 * image_width),
            float(y0 * image_height),
            float(x1 * image_width),
            float(y1 * image_height),
        )

    def _standardize_detection_results(
        self, results: Any, min_confidence: float, image_width: float, image_height: float
    ) -> List[TextRegion]:
        """Convert ``detection_predictor`` output into text-less TextRegions."""
        regions: List[TextRegion] = []

        # Imported lazily so the module loads without docTR installed.
        try:
            from doctr.utils.geometry import detach_scores
        except ImportError:
            logger.error("Failed to import doctr.utils.geometry.detach_scores")
            return regions

        for result in results:
            # Detection results structure is different from ocr_predictor
            if "words" not in result:
                continue
            try:
                # Detach the coordinates and scores
                detached_coords, prob_scores = detach_scores([result.get("words")])

                for i, coords in enumerate(detached_coords[0]):
                    score = prob_scores[0][i] if prob_scores and len(prob_scores[0]) > i else 0.0

                    if score >= min_confidence:
                        try:
                            # Handles both straight boxes and rotated polygons
                            bbox = self._denormalize_bbox(coords, image_width, image_height)
                            # Detection yields no text; the detection score is
                            # kept as the region's confidence.
                            regions.append(TextRegion(bbox, None, float(score)))
                        except Exception as e:
                            logger.error(f"Error processing detection result: {e}")
            except Exception as e:
                logger.error(f"Error detaching scores: {e}")

        return regions

    def _standardize_results(
        self, raw_results: Any, min_confidence: float, detect_only: bool
    ) -> List[TextRegion]:
        """Convert doctr results to standardized TextRegion objects.

        Args:
            raw_results: Normally the ``(result, (height, width))`` tuple from
                ``_process_single_image``. A bare result is accepted too, in
                which case coordinates stay normalized (width = height = 1).
            min_confidence: Words/boxes scoring below this are dropped.
            detect_only: When true, regions carry no text. Results from the
                detection predictor keep their detection score; results that
                came from the full OCR predictor get ``confidence=None``.

        Returns:
            A list of TextRegion objects with absolute ``(x0, y0, x1, y1)``
            boxes; empty when the result has no pages.
        """
        standardized_regions: List[TextRegion] = []

        # Extract results and dimensions
        if isinstance(raw_results, tuple) and len(raw_results) == 2:
            results, dimensions = raw_results
            image_height, image_width = dimensions
        else:
            # Fallback if dimensions aren't provided
            results = raw_results
            image_width = 1
            image_height = 1
            logger.warning("Image dimensions not provided, using normalized coordinates")

        # Detection-predictor output has no ``pages`` attribute; anything with
        # pages came from the full OCR predictor (e.g. the fallback path).
        if detect_only and self._detection_model is not None and not hasattr(results, "pages"):
            return self._standardize_detection_results(
                results, min_confidence, image_width, image_height
            )

        # Process standard OCR results
        if not hasattr(results, "pages") or not results.pages:
            logger.warning("Doctr result object does not contain pages.")
            return standardized_regions

        # Process results page by page (we typically process one image at a time)
        for page in results.pages:
            for block in page.blocks:
                for line in block.lines:
                    for word in line.words:
                        if word.confidence >= min_confidence:
                            try:
                                # Geometry is relative; it may be two corners
                                # or a 4-point polygon, so take the bounding
                                # box of all points.
                                bbox = self._denormalize_bbox(
                                    word.geometry, image_width, image_height
                                )

                                # Skip text content if detect_only is True
                                text = None if detect_only else word.value
                                confidence = None if detect_only else word.confidence

                                standardized_regions.append(TextRegion(bbox, text, confidence))
                            except (ValueError, TypeError, IndexError) as e:
                                logger.error(
                                    f"Could not standardize bounding box/word from doctr result: {word}"
                                )
                                logger.error(f"Error: {e}")

        return standardized_regions

    def get_default_options(self) -> DoctrOCROptions:
        """Return the default options specific to this engine."""
        return DoctrOCROptions()
@@ -143,11 +143,13 @@ class EasyOCREngine(OCREngine):
143
143
  standardized_regions = []
144
144
 
145
145
  if detect_only:
146
+ results = raw_results[0]
146
147
  # In detect_only mode, raw_results is already a list of bounding boxes
147
148
  # Each bbox is in [x_min, x_max, y_min, y_max] format
148
- if isinstance(raw_results, list):
149
- for detection in raw_results:
149
+ if isinstance(results, list):
150
+ for detection in results:
150
151
  try:
152
+ # This block expects 'detection' to be a list/tuple of 4 numbers
151
153
  if isinstance(detection, (list, tuple)) and len(detection) == 4:
152
154
  x_min, x_max, y_min, y_max = detection
153
155
  # Convert to standardized (x0, y0, x1, y1) format
@@ -161,6 +163,7 @@ class EasyOCREngine(OCREngine):
161
163
  f"Invalid number format in EasyOCR detect bbox: {detection}"
162
164
  ) from e
163
165
  else:
166
+ # This is where the error is raised if 'detection' is not a list/tuple of 4 numbers
164
167
  raise ValueError(f"Invalid detection format from EasyOCR: {detection}")
165
168
  except ValueError as e:
166
169
  # Re-raise any value errors from standardization or format checks
@@ -172,7 +175,7 @@ class EasyOCREngine(OCREngine):
172
175
  ) from e
173
176
  else:
174
177
  raise ValueError(
175
- f"Expected list of bounding boxes in detect_only mode, got: {raw_results}"
178
+ f"Expected list of bounding boxes in detect_only mode, got: {type(raw_results)}"
176
179
  )
177
180
 
178
181
  return standardized_regions
@@ -1,6 +1,6 @@
1
- import logging
2
1
  import importlib.util
3
- from typing import Dict, Any, Optional, Type, Union, List
2
+ import logging
3
+ from typing import Any, Dict, List, Optional, Type, Union
4
4
 
5
5
  from .engine import OCREngine
6
6
 
@@ -15,7 +15,7 @@ class OCRFactory:
15
15
  """Create and return an OCR engine instance.
16
16
 
17
17
  Args:
18
- engine_type: One of 'surya', 'easyocr', 'paddle'
18
+ engine_type: One of 'surya', 'easyocr', 'paddle', 'doctr'
19
19
  **kwargs: Arguments to pass to the engine constructor
20
20
 
21
21
  Returns:
@@ -54,6 +54,16 @@ class OCRFactory:
54
54
  "PaddleOCR engine requires 'paddleocr' and 'paddlepaddle'. "
55
55
  "Install with: pip install paddleocr paddlepaddle"
56
56
  )
57
+ elif engine_type == "doctr":
58
+ try:
59
+ from .engine_doctr import DoctrOCREngine
60
+
61
+ return DoctrOCREngine(**kwargs)
62
+ except ImportError:
63
+ raise ImportError(
64
+ "Doctr engine requires the 'python-doctr' package. "
65
+ "Install with: pip install python-doctr[torch] or python-doctr[tf]"
66
+ )
57
67
  else:
58
68
  raise ValueError(f"Unknown engine type: {engine_type}")
59
69
 
@@ -85,13 +95,19 @@ class OCRFactory:
85
95
  except ImportError:
86
96
  engines["paddle"] = False
87
97
 
98
+ # Check Doctr
99
+ try:
100
+ engines["doctr"] = importlib.util.find_spec("doctr") is not None
101
+ except ImportError:
102
+ engines["doctr"] = False
103
+
88
104
  return engines
89
105
 
90
106
  @staticmethod
91
107
  def get_recommended_engine(**kwargs) -> OCREngine:
92
108
  """Returns the best available OCR engine based on what's installed.
93
109
 
94
- First tries engines in order of preference: EasyOCR, Paddle, Surya.
110
+ First tries engines in order of preference: EasyOCR, Doctr, Paddle, Surya.
95
111
  If none are available, raises ImportError with installation instructions.
96
112
 
97
113
  Args:
@@ -109,6 +125,9 @@ class OCRFactory:
109
125
  if available.get("easyocr", False):
110
126
  logger.info("Using EasyOCR engine (recommended)")
111
127
  return OCRFactory.create_engine("easyocr", **kwargs)
128
+ elif available.get("doctr", False):
129
+ logger.info("Using Doctr engine")
130
+ return OCRFactory.create_engine("doctr", **kwargs)
112
131
  elif available.get("paddle", False):
113
132
  logger.info("Using PaddleOCR engine")
114
133
  return OCRFactory.create_engine("paddle", **kwargs)
@@ -120,6 +139,7 @@ class OCRFactory:
120
139
  raise ImportError(
121
140
  "No OCR engines available. Please install at least one of: \n"
122
141
  "- EasyOCR (recommended): pip install easyocr\n"
142
+ "- Doctr: pip install python-doctr[torch] or python-doctr[tf]\n"
123
143
  "- PaddleOCR: pip install paddleocr paddlepaddle\n"
124
144
  "- Surya OCR: pip install surya"
125
145
  )
@@ -1,17 +1,26 @@
1
1
  # ocr_manager.py
2
2
  import copy # For deep copying options
3
3
  import logging
4
+ import threading # Import threading for lock
5
+ import time # Import time for timing
4
6
  from typing import Any, Dict, List, Optional, Type, Union
5
7
 
6
8
  from PIL import Image
7
9
 
8
10
  # Import engine classes and options
9
11
  from .engine import OCREngine
12
+ from .engine_doctr import DoctrOCREngine
10
13
  from .engine_easyocr import EasyOCREngine
11
14
  from .engine_paddle import PaddleOCREngine
12
15
  from .engine_surya import SuryaOCREngine
13
- from .ocr_options import OCROptions
14
- from .ocr_options import BaseOCROptions, EasyOCROptions, PaddleOCROptions, SuryaOCROptions
16
+ from .ocr_options import (
17
+ BaseOCROptions,
18
+ DoctrOCROptions,
19
+ EasyOCROptions,
20
+ OCROptions,
21
+ PaddleOCROptions,
22
+ SuryaOCROptions,
23
+ )
15
24
 
16
25
  logger = logging.getLogger(__name__)
17
26
 
@@ -23,37 +32,88 @@ class OCRManager:
23
32
  ENGINE_REGISTRY: Dict[str, Dict[str, Any]] = {
24
33
  "easyocr": {"class": EasyOCREngine, "options_class": EasyOCROptions},
25
34
  "paddle": {"class": PaddleOCREngine, "options_class": PaddleOCROptions},
26
- "surya": {"class": SuryaOCREngine, "options_class": SuryaOCROptions}, # <-- Add Surya
35
+ "surya": {"class": SuryaOCREngine, "options_class": SuryaOCROptions},
36
+ "doctr": {"class": DoctrOCREngine, "options_class": DoctrOCROptions},
27
37
  # Add other engines here
28
38
  }
29
39
 
30
40
  def __init__(self):
31
41
  """Initializes the OCR Manager."""
32
42
  self._engine_instances: Dict[str, OCREngine] = {} # Cache for engine instances
43
+ self._engine_locks: Dict[str, threading.Lock] = (
44
+ {}
45
+ ) # Lock per engine type for initialization
46
+ self._engine_inference_locks: Dict[str, threading.Lock] = (
47
+ {}
48
+ ) # Lock per engine type for inference
33
49
  logger.info("OCRManager initialized.")
34
50
 
35
51
  def _get_engine_instance(self, engine_name: str) -> OCREngine:
36
- """Retrieves or creates an instance of the specified OCR engine."""
52
+ """Retrieves or creates an instance of the specified OCR engine, ensuring thread-safe initialization."""
37
53
  engine_name = engine_name.lower()
38
54
  if engine_name not in self.ENGINE_REGISTRY:
39
55
  raise ValueError(
40
56
  f"Unknown OCR engine: '{engine_name}'. Available: {list(self.ENGINE_REGISTRY.keys())}"
41
57
  )
42
58
 
43
- if engine_name not in self._engine_instances:
44
- logger.info(f"Creating instance of engine: {engine_name}")
59
+ # Quick check if instance already exists (avoid lock contention)
60
+ if engine_name in self._engine_instances:
61
+ return self._engine_instances[engine_name]
62
+
63
+ # Get or create the lock for this engine type
64
+ if engine_name not in self._engine_locks:
65
+ self._engine_locks[engine_name] = threading.Lock()
66
+
67
+ engine_init_lock = self._engine_locks[engine_name]
68
+
69
+ # Acquire lock to safely check and potentially initialize the engine
70
+ with engine_init_lock:
71
+ # Double-check if another thread initialized it while we waited for the lock
72
+ if engine_name in self._engine_instances:
73
+ return self._engine_instances[engine_name]
74
+
75
+ # If still not initialized, create it now under the lock
76
+ logger.info(
77
+ f"[{threading.current_thread().name}] Creating shared instance of engine: {engine_name}"
78
+ )
45
79
  engine_class = self.ENGINE_REGISTRY[engine_name]["class"]
46
- engine_instance = engine_class() # Instantiate first
47
- if not engine_instance.is_available():
48
- # Check availability before storing
49
- # Construct helpful error message with install hint
50
- install_hint = f"pip install 'natural-pdf[{engine_name}]'"
51
- raise RuntimeError(
52
- f"Engine '{engine_name}' is not available. Please install the required dependencies: {install_hint}"
80
+ start_time = time.monotonic() # Optional: time initialization
81
+ try:
82
+ engine_instance = engine_class() # Instantiate first
83
+ if not engine_instance.is_available():
84
+ # Check availability before storing
85
+ install_hint = f"pip install 'natural-pdf[{engine_name}]'"
86
+ raise RuntimeError(
87
+ f"Engine '{engine_name}' is not available. Please install the required dependencies: {install_hint}"
88
+ )
89
+ # Store the shared instance
90
+ self._engine_instances[engine_name] = engine_instance
91
+ end_time = time.monotonic()
92
+ logger.info(
93
+ f"[{threading.current_thread().name}] Shared instance of {engine_name} created successfully (Duration: {end_time - start_time:.2f}s)."
53
94
  )
54
- self._engine_instances[engine_name] = engine_instance # Store if available
95
+ return engine_instance
96
+ except Exception as e:
97
+ # Ensure we don't leave a partial state if init fails
98
+ logger.error(
99
+ f"[{threading.current_thread().name}] Failed to create shared instance of {engine_name}: {e}",
100
+ exc_info=True,
101
+ )
102
+ # Remove potentially partial entry if exists
103
+ if engine_name in self._engine_instances:
104
+ del self._engine_instances[engine_name]
105
+ raise # Re-raise the exception after logging
55
106
 
56
- return self._engine_instances[engine_name]
107
+ def _get_engine_inference_lock(self, engine_name: str) -> threading.Lock:
108
+ """Gets or creates the inference lock for a given engine type."""
109
+ engine_name = engine_name.lower()
110
+ # Assume engine_name is valid as it's checked before this would be called
111
+ if engine_name not in self._engine_inference_locks:
112
+ # Create lock if it doesn't exist (basic thread safety for dict access)
113
+ # A more robust approach might lock around this check/creation too,
114
+ # but contention here is less critical than for engine init or inference itself.
115
+ self._engine_inference_locks[engine_name] = threading.Lock()
116
+ return self._engine_inference_locks[engine_name]
57
117
 
58
118
  def apply_ocr(
59
119
  self,
@@ -72,7 +132,7 @@ class OCRManager:
72
132
 
73
133
  Args:
74
134
  images: A single PIL Image or a list of PIL Images to process.
75
- engine: Name of the engine (e.g., 'easyocr', 'paddle', 'surya').
135
+ engine: Name of the engine (e.g., 'easyocr', 'paddle', 'surya', 'doctr').
76
136
  Defaults to 'easyocr' if not specified.
77
137
  languages: List of language codes (e.g., ['en', 'fr'], ['en', 'german']).
78
138
  **Passed directly to the engine.** Must be codes understood
@@ -127,21 +187,57 @@ class OCRManager:
127
187
  try:
128
188
  engine_instance = self._get_engine_instance(selected_engine_name)
129
189
  processing_mode = "batch" if is_batch else "single image"
130
- logger.info(f"Processing {processing_mode} with engine '{selected_engine_name}'...")
190
+ # Log thread name for clarity during parallel calls
191
+ thread_id = threading.current_thread().name
192
+ logger.info(
193
+ f"[{thread_id}] Processing {processing_mode} using shared engine instance '{selected_engine_name}'..."
194
+ )
131
195
  logger.debug(
132
196
  f" Engine Args: languages={languages}, min_confidence={min_confidence}, device={device}, options={final_options}"
133
197
  )
134
198
 
135
- # Call the engine's process_image, passing common args and options object
136
- # **ASSUMPTION**: Engine process_image signatures are updated to accept these common args.
137
- results = engine_instance.process_image(
138
- images=images,
139
- languages=languages,
140
- min_confidence=min_confidence,
141
- device=device,
142
- detect_only=detect_only,
143
- options=final_options,
199
+ # Log image dimensions before processing
200
+ if is_batch:
201
+ image_dims = [
202
+ f"{img.width}x{img.height}"
203
+ for img in images
204
+ if hasattr(img, "width") and hasattr(img, "height")
205
+ ]
206
+ logger.debug(
207
+ f"[{thread_id}] Processing batch of {len(images)} images with dimensions: {image_dims}"
208
+ )
209
+ elif hasattr(images, "width") and hasattr(images, "height"):
210
+ logger.debug(
211
+ f"[{thread_id}] Processing single image with dimensions: {images.width}x{images.height}"
212
+ )
213
+ else:
214
+ logger.warning(f"[{thread_id}] Could not determine dimensions of input image(s).")
215
+
216
+ # Acquire lock specifically for the inference call
217
+ inference_lock = self._get_engine_inference_lock(selected_engine_name)
218
+ logger.debug(
219
+ f"[{thread_id}] Attempting to acquire inference lock for {selected_engine_name}..."
144
220
  )
221
+ inference_wait_start = time.monotonic()
222
+ with inference_lock:
223
+ inference_acquired_time = time.monotonic()
224
+ logger.debug(
225
+ f"[{thread_id}] Acquired inference lock for {selected_engine_name} (waited {inference_acquired_time - inference_wait_start:.2f}s). Calling process_image..."
226
+ )
227
+ inference_start_time = time.monotonic()
228
+
229
+ results = engine_instance.process_image(
230
+ images=images,
231
+ languages=languages,
232
+ min_confidence=min_confidence,
233
+ device=device,
234
+ detect_only=detect_only,
235
+ options=final_options,
236
+ )
237
+ inference_end_time = time.monotonic()
238
+ logger.debug(
239
+ f"[{thread_id}] process_image call finished for {selected_engine_name} (Duration: {inference_end_time - inference_start_time:.2f}s). Releasing lock."
240
+ )
145
241
 
146
242
  # Log result summary based on mode
147
243
  if is_batch: