natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +3 -0
- natural_pdf/analyzers/layout/base.py +1 -5
- natural_pdf/analyzers/layout/gemini.py +61 -51
- natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
- natural_pdf/analyzers/layout/layout_manager.py +26 -84
- natural_pdf/analyzers/layout/layout_options.py +7 -0
- natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
- natural_pdf/analyzers/layout/surya.py +46 -123
- natural_pdf/analyzers/layout/tatr.py +51 -4
- natural_pdf/analyzers/text_structure.py +3 -5
- natural_pdf/analyzers/utils.py +3 -3
- natural_pdf/classification/manager.py +422 -0
- natural_pdf/classification/mixin.py +163 -0
- natural_pdf/classification/results.py +80 -0
- natural_pdf/collections/mixins.py +111 -0
- natural_pdf/collections/pdf_collection.py +434 -15
- natural_pdf/core/element_manager.py +83 -0
- natural_pdf/core/highlighting_service.py +13 -22
- natural_pdf/core/page.py +578 -93
- natural_pdf/core/pdf.py +912 -460
- natural_pdf/elements/base.py +134 -40
- natural_pdf/elements/collections.py +712 -109
- natural_pdf/elements/region.py +722 -69
- natural_pdf/elements/text.py +4 -1
- natural_pdf/export/mixin.py +137 -0
- natural_pdf/exporters/base.py +3 -3
- natural_pdf/exporters/paddleocr.py +5 -4
- natural_pdf/extraction/manager.py +135 -0
- natural_pdf/extraction/mixin.py +279 -0
- natural_pdf/extraction/result.py +23 -0
- natural_pdf/ocr/__init__.py +5 -5
- natural_pdf/ocr/engine_doctr.py +346 -0
- natural_pdf/ocr/engine_easyocr.py +6 -3
- natural_pdf/ocr/ocr_factory.py +24 -4
- natural_pdf/ocr/ocr_manager.py +122 -26
- natural_pdf/ocr/ocr_options.py +94 -11
- natural_pdf/ocr/utils.py +19 -6
- natural_pdf/qa/document_qa.py +0 -4
- natural_pdf/search/__init__.py +20 -34
- natural_pdf/search/haystack_search_service.py +309 -265
- natural_pdf/search/haystack_utils.py +99 -75
- natural_pdf/search/search_service_protocol.py +11 -12
- natural_pdf/selectors/parser.py +431 -230
- natural_pdf/utils/debug.py +3 -3
- natural_pdf/utils/identifiers.py +1 -1
- natural_pdf/utils/locks.py +8 -0
- natural_pdf/utils/packaging.py +8 -6
- natural_pdf/utils/text_extraction.py +60 -1
- natural_pdf/utils/tqdm_utils.py +51 -0
- natural_pdf/utils/visualization.py +18 -0
- natural_pdf/widgets/viewer.py +4 -25
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
- natural_pdf-0.1.9.dist-info/RECORD +80 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
- docs/api/index.md +0 -386
- docs/assets/favicon.png +0 -3
- docs/assets/favicon.svg +0 -3
- docs/assets/javascripts/custom.js +0 -17
- docs/assets/logo.svg +0 -3
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +0 -17
- docs/assets/social-preview.svg +0 -17
- docs/assets/stylesheets/custom.css +0 -65
- docs/document-qa/index.ipynb +0 -435
- docs/document-qa/index.md +0 -79
- docs/element-selection/index.ipynb +0 -915
- docs/element-selection/index.md +0 -229
- docs/finetuning/index.md +0 -176
- docs/index.md +0 -170
- docs/installation/index.md +0 -69
- docs/interactive-widget/index.ipynb +0 -962
- docs/interactive-widget/index.md +0 -12
- docs/layout-analysis/index.ipynb +0 -818
- docs/layout-analysis/index.md +0 -185
- docs/ocr/index.md +0 -209
- docs/pdf-navigation/index.ipynb +0 -314
- docs/pdf-navigation/index.md +0 -97
- docs/regions/index.ipynb +0 -816
- docs/regions/index.md +0 -294
- docs/tables/index.ipynb +0 -658
- docs/tables/index.md +0 -144
- docs/text-analysis/index.ipynb +0 -370
- docs/text-analysis/index.md +0 -105
- docs/text-extraction/index.ipynb +0 -1478
- docs/text-extraction/index.md +0 -292
- docs/tutorials/01-loading-and-extraction.ipynb +0 -194
- docs/tutorials/01-loading-and-extraction.md +0 -95
- docs/tutorials/02-finding-elements.ipynb +0 -340
- docs/tutorials/02-finding-elements.md +0 -149
- docs/tutorials/03-extracting-blocks.ipynb +0 -147
- docs/tutorials/03-extracting-blocks.md +0 -48
- docs/tutorials/04-table-extraction.ipynb +0 -114
- docs/tutorials/04-table-extraction.md +0 -50
- docs/tutorials/05-excluding-content.ipynb +0 -270
- docs/tutorials/05-excluding-content.md +0 -109
- docs/tutorials/06-document-qa.ipynb +0 -332
- docs/tutorials/06-document-qa.md +0 -91
- docs/tutorials/07-layout-analysis.ipynb +0 -288
- docs/tutorials/07-layout-analysis.md +0 -66
- docs/tutorials/07-working-with-regions.ipynb +0 -413
- docs/tutorials/07-working-with-regions.md +0 -151
- docs/tutorials/08-spatial-navigation.ipynb +0 -508
- docs/tutorials/08-spatial-navigation.md +0 -190
- docs/tutorials/09-section-extraction.ipynb +0 -2434
- docs/tutorials/09-section-extraction.md +0 -256
- docs/tutorials/10-form-field-extraction.ipynb +0 -512
- docs/tutorials/10-form-field-extraction.md +0 -201
- docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
- docs/tutorials/11-enhanced-table-processing.md +0 -9
- docs/tutorials/12-ocr-integration.ipynb +0 -604
- docs/tutorials/12-ocr-integration.md +0 -175
- docs/tutorials/13-semantic-search.ipynb +0 -1328
- docs/tutorials/13-semantic-search.md +0 -77
- docs/visual-debugging/index.ipynb +0 -2970
- docs/visual-debugging/index.md +0 -157
- docs/visual-debugging/region.png +0 -0
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
- natural_pdf/templates/spa/css/style.css +0 -334
- natural_pdf/templates/spa/index.html +0 -31
- natural_pdf/templates/spa/js/app.js +0 -472
- natural_pdf/templates/spa/words.txt +0 -235976
- natural_pdf/widgets/frontend/viewer.js +0 -88
- natural_pdf-0.1.7.dist-info/RECORD +0 -145
- notebooks/Examples.ipynb +0 -1293
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +0 -543
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
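The hunks reproduced below are the OCR-related ones: a new docTR engine, its registration in the OCR factory and manager, and thread-safety work in `OCRManager`. As a rough orientation before the file-by-file hunks, here is a minimal sketch of driving the new engine through `OCRManager`; the image path and printed summary are illustrative, while the `apply_ocr` keyword arguments are the ones visible in the `ocr_manager.py` hunks further down.

```python
from PIL import Image

from natural_pdf.ocr.ocr_manager import OCRManager

manager = OCRManager()
page_image = Image.open("page.png")  # placeholder input image

# "doctr" is the newly registered engine name; languages, min_confidence,
# device, detect_only and options are the keywords shown in apply_ocr below.
results = manager.apply_ocr(
    images=page_image,
    engine="doctr",
    languages=["en"],
    min_confidence=0.5,
)
# Assuming a list of standardized text regions is returned for a single image.
print(f"OCR returned {len(results)} regions")
```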
natural_pdf/ocr/engine_doctr.py ADDED

@@ -0,0 +1,346 @@
```python
# natural_pdf/ocr/engine_doctr.py
import importlib.util
import logging
from typing import Any, List, Optional

import numpy as np
from PIL import Image

from .engine import OCREngine, TextRegion
from .ocr_options import BaseOCROptions, DoctrOCROptions

logger = logging.getLogger(__name__)


class DoctrOCREngine(OCREngine):
    """docTR engine implementation."""

    def __init__(self):
        super().__init__()
        self._model = None  # Will hold the doctr ocr_predictor
        self._detection_model = None  # Will hold detection_predictor if detect_only is used
        self._orientation_model = None  # Will hold page_orientation_predictor if enabled

    def is_available(self) -> bool:
        """Check if doctr is installed."""
        return importlib.util.find_spec("doctr") is not None

    def _initialize_model(
        self, languages: List[str], device: str, options: Optional[BaseOCROptions]
    ):
        """Initialize the doctr model."""
        if not self.is_available():
            raise ImportError(
                "Doctr engine requires the 'python-doctr' package. "
                "Install with: pip install python-doctr[torch] or python-doctr[tf]"
            )

        try:
            import doctr.models

            self.logger.info("doctr.models imported successfully.")
        except ImportError as e:
            self.logger.error(f"Failed to import doctr: {e}")
            raise

        # Cast to DoctrOCROptions or use default
        doctr_opts = options if isinstance(options, DoctrOCROptions) else DoctrOCROptions()

        # Check if CUDA is available in device string
        use_cuda = device.lower().startswith("cuda") if device else False

        # Prepare OCR predictor arguments
        predictor_args = {
            "det_arch": doctr_opts.det_arch,
            "reco_arch": doctr_opts.reco_arch,
            "pretrained": doctr_opts.pretrained,
            "assume_straight_pages": doctr_opts.assume_straight_pages,
            "export_as_straight_boxes": doctr_opts.export_as_straight_boxes,
        }
        # Filter out None values
        predictor_args = {k: v for k, v in predictor_args.items() if v is not None}

        self.logger.debug(f"doctr ocr_predictor constructor args: {predictor_args}")
        try:
            # Create the main OCR predictor (doesn't accept batch_size)
            self._model = doctr.models.ocr_predictor(**predictor_args)

            # Apply CUDA if available
            if use_cuda:
                self._model = self._model.cuda()

            self.logger.info("doctr ocr_predictor created successfully")

            # Now initialize the detection-only model
            try:
                detection_args = {
                    "arch": doctr_opts.det_arch,
                    "pretrained": doctr_opts.pretrained,
                    "assume_straight_pages": doctr_opts.assume_straight_pages,
                    "symmetric_pad": doctr_opts.symmetric_pad,
                    "preserve_aspect_ratio": doctr_opts.preserve_aspect_ratio,
                    "batch_size": doctr_opts.batch_size,
                }
                self._detection_model = doctr.models.detection_predictor(**detection_args)

                # Apply CUDA if available
                if use_cuda:
                    self._detection_model = self._detection_model.cuda()

                # Configure postprocessing parameters if provided
                if doctr_opts.bin_thresh is not None:
                    self._detection_model.model.postprocessor.bin_thresh = doctr_opts.bin_thresh
                if doctr_opts.box_thresh is not None:
                    self._detection_model.model.postprocessor.box_thresh = doctr_opts.box_thresh

                self.logger.info("doctr detection_predictor created successfully")
            except Exception as e:
                self.logger.error(f"Failed to create detection_predictor: {e}")
                self._detection_model = None

            # Initialize orientation predictor if enabled
            if doctr_opts.use_orientation_predictor:
                try:
                    self._orientation_model = doctr.models.page_orientation_predictor(
                        pretrained=True, batch_size=doctr_opts.batch_size
                    )
                    if use_cuda:
                        self._orientation_model = self._orientation_model.cuda()
                    self.logger.info("doctr page_orientation_predictor created successfully")
                except Exception as e:
                    self.logger.error(f"Failed to create page_orientation_predictor: {e}")
                    self._orientation_model = None

        except Exception as e:
            self.logger.error(f"Failed to create doctr models: {e}")
            raise

        # Doctr doesn't explicitly use language list in ocr_predictor initialization
        if languages and languages != [self.DEFAULT_LANGUAGES[0]]:
            logger.warning(
                f"Doctr engine currently doesn't support language selection during initialization. Using its default language capabilities for model: {doctr_opts.reco_arch}"
            )

    def _preprocess_image(self, image: Image.Image) -> np.ndarray:
        """Convert PIL Image to RGB numpy array for doctr."""
        # Ensure the image is in RGB mode
        if image.mode != "RGB":
            image = image.convert("RGB")
        # Convert to numpy array
        return np.array(image)

    def _process_single_image(
        self, image: np.ndarray, detect_only: bool, options: Optional[DoctrOCROptions]
    ) -> Any:
        """Process a single image with doctr."""
        if self._model is None:
            raise RuntimeError("Doctr model not initialized")

        # Capture image dimensions for denormalization
        height, width = image.shape[:2]

        # Cast options to DoctrOCROptions or use default
        doctr_opts = options if isinstance(options, DoctrOCROptions) else DoctrOCROptions()

        # Check if we need to detect orientation first
        if self._orientation_model is not None and options and options.use_orientation_predictor:
            try:
                # Process with orientation predictor
                # For orientation predictor, we need to pass a batch of images
                orientations = self._orientation_model([image])
                orientation = orientations[1][0]  # Get the orientation angle
                logger.info(f"Detected page orientation: {orientation} degrees")
                # Note: doctr handles rotation internally for detection/recognition
            except Exception as e:
                logger.error(f"Error detecting orientation: {e}")

        # Process differently based on detect_only flag
        if detect_only and self._detection_model is not None:
            try:
                # Apply threshold settings at runtime for this detection
                if doctr_opts.bin_thresh is not None:
                    original_bin_thresh = self._detection_model.model.postprocessor.bin_thresh
                    self._detection_model.model.postprocessor.bin_thresh = doctr_opts.bin_thresh
                    logger.debug(f"Temporarily set bin_thresh to {doctr_opts.bin_thresh}")

                if doctr_opts.box_thresh is not None:
                    original_box_thresh = self._detection_model.model.postprocessor.box_thresh
                    self._detection_model.model.postprocessor.box_thresh = doctr_opts.box_thresh
                    logger.debug(f"Temporarily set box_thresh to {doctr_opts.box_thresh}")

                # Use the dedicated detection model with a list of numpy arrays
                result = self._detection_model([image])

                # Restore original thresholds
                if doctr_opts.bin_thresh is not None:
                    self._detection_model.model.postprocessor.bin_thresh = original_bin_thresh

                if doctr_opts.box_thresh is not None:
                    self._detection_model.model.postprocessor.box_thresh = original_box_thresh

                # Return tuple of (result, dimensions)
                return (result, (height, width))
            except Exception as e:
                logger.error(f"Error in detection_predictor: {e}")
                # Fall back to OCR predictor if detection fails
                logger.warning("Falling back to OCR predictor for detection")

        # Process with full OCR model, passing a list of numpy arrays directly
        try:
            # For full OCR, we should also apply the thresholds
            if (
                detect_only
                and doctr_opts.bin_thresh is not None
                and hasattr(self._model.det_predictor.model.postprocessor, "bin_thresh")
            ):
                original_bin_thresh = self._model.det_predictor.model.postprocessor.bin_thresh
                self._model.det_predictor.model.postprocessor.bin_thresh = doctr_opts.bin_thresh

            if (
                detect_only
                and doctr_opts.box_thresh is not None
                and hasattr(self._model.det_predictor.model.postprocessor, "box_thresh")
            ):
                original_box_thresh = self._model.det_predictor.model.postprocessor.box_thresh
                self._model.det_predictor.model.postprocessor.box_thresh = doctr_opts.box_thresh

            result = self._model([image])

            # Restore original thresholds
            if (
                detect_only
                and doctr_opts.bin_thresh is not None
                and hasattr(self._model.det_predictor.model.postprocessor, "bin_thresh")
            ):
                self._model.det_predictor.model.postprocessor.bin_thresh = original_bin_thresh

            if (
                detect_only
                and doctr_opts.box_thresh is not None
                and hasattr(self._model.det_predictor.model.postprocessor, "box_thresh")
            ):
                self._model.det_predictor.model.postprocessor.box_thresh = original_box_thresh

            # Return tuple of (result, dimensions)
            return (result, (height, width))
        except Exception as e:
            logger.error(f"Error in OCR prediction: {e}")
            raise

    def _standardize_results(
        self, raw_results: Any, min_confidence: float, detect_only: bool
    ) -> List[TextRegion]:
        """Convert doctr results to standardized TextRegion objects."""
        standardized_regions = []

        # Extract results and dimensions
        if isinstance(raw_results, tuple) and len(raw_results) == 2:
            results, dimensions = raw_results
            image_height, image_width = dimensions
        else:
            # Fallback if dimensions aren't provided
            results = raw_results
            image_width = 1
            image_height = 1
            logger.warning("Image dimensions not provided, using normalized coordinates")

        # Handle detection-only results differently
        if detect_only and self._detection_model is not None and not hasattr(results, "pages"):
            # Import doctr utils for detach_scores if needed
            try:
                from doctr.utils.geometry import detach_scores
            except ImportError:
                logger.error("Failed to import doctr.utils.geometry.detach_scores")
                return standardized_regions

            # Extract coordinates and scores from detection results
            for result in results:
                # Detection results structure is different from ocr_predictor
                if "words" in result:
                    try:
                        # Detach the coordinates and scores
                        detached_coords, prob_scores = detach_scores([result.get("words")])

                        for i, coords in enumerate(detached_coords[0]):
                            score = (
                                prob_scores[0][i]
                                if prob_scores and len(prob_scores[0]) > i
                                else 0.0
                            )

                            if score >= min_confidence:
                                try:
                                    # Handle both straight and rotated boxes
                                    if coords.shape == (4,):  # Straight box as [xmin, ymin, xmax, ymax]
                                        xmin, ymin, xmax, ymax = coords.tolist()
                                        # Denormalize coordinates
                                        bbox = (
                                            float(xmin * image_width),
                                            float(ymin * image_height),
                                            float(xmax * image_width),
                                            float(ymax * image_height),
                                        )
                                    else:  # Polygon points
                                        # Get bounding box from polygon
                                        coords_list = coords.tolist()
                                        x_coords = [p[0] * image_width for p in coords_list]
                                        y_coords = [p[1] * image_height for p in coords_list]
                                        bbox = (
                                            float(min(x_coords)),
                                            float(min(y_coords)),
                                            float(max(x_coords)),
                                            float(max(y_coords)),
                                        )

                                    # In detection mode, we don't have text or confidence score
                                    standardized_regions.append(TextRegion(bbox, None, score))
                                except Exception as e:
                                    logger.error(f"Error processing detection result: {e}")
                    except Exception as e:
                        logger.error(f"Error detaching scores: {e}")

            return standardized_regions

        # Process standard OCR results
        if not hasattr(results, "pages") or not results.pages:
            logger.warning("Doctr result object does not contain pages.")
            return standardized_regions

        # Process results page by page (we typically process one image at a time)
        for page in results.pages:
            # Extract information from blocks, lines, words
            for block in page.blocks:
                for line in block.lines:
                    for word in line.words:
                        if word.confidence >= min_confidence:
                            try:
                                # doctr geometry is ((x_min, y_min), (x_max, y_max)) as relative coordinates
                                x_min, y_min = word.geometry[0]
                                x_max, y_max = word.geometry[1]

                                # Denormalize coordinates to absolute pixel values
                                bbox = (
                                    float(x_min * image_width),
                                    float(y_min * image_height),
                                    float(x_max * image_width),
                                    float(y_max * image_height),
                                )

                                # Skip text content if detect_only is True
                                text = None if detect_only else word.value
                                confidence = None if detect_only else word.confidence

                                standardized_regions.append(TextRegion(bbox, text, confidence))
                            except (ValueError, TypeError, IndexError) as e:
                                logger.error(
                                    f"Could not standardize bounding box/word from doctr result: {word}"
                                )
                                logger.error(f"Error: {e}")

        return standardized_regions

    def get_default_options(self) -> DoctrOCROptions:
        """Return the default options specific to this engine."""
        return DoctrOCROptions()
```
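The options referenced above (`det_arch`, `reco_arch`, `bin_thresh`, `box_thresh`, `use_orientation_predictor`, ...) live on `DoctrOCROptions` in `ocr_options.py`, which this release also extends. A hedged sketch of exercising the detection-only path, assuming those fields are constructor keywords of the options class:

```python
from PIL import Image

from natural_pdf.ocr.ocr_manager import OCRManager
from natural_pdf.ocr.ocr_options import DoctrOCROptions

# bin_thresh/box_thresh feed the detection postprocessor, as handled in
# _process_single_image above; the values here are illustrative only.
opts = DoctrOCROptions(bin_thresh=0.3, box_thresh=0.1)

manager = OCRManager()
regions = manager.apply_ocr(
    images=Image.open("scan.png"),  # placeholder path
    engine="doctr",
    detect_only=True,  # exercises the dedicated detection_predictor branch
    options=opts,
)
# In detect-only mode each standardized region carries a bbox and score but no text.
```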
natural_pdf/ocr/engine_easyocr.py CHANGED

```diff
@@ -143,11 +143,13 @@ class EasyOCREngine(OCREngine):
         standardized_regions = []
 
         if detect_only:
+            results = raw_results[0]
             # In detect_only mode, raw_results is already a list of bounding boxes
             # Each bbox is in [x_min, x_max, y_min, y_max] format
-            if isinstance(raw_results, list):
-                for detection in raw_results:
+            if isinstance(results, list):
+                for detection in results:
                     try:
+                        # This block expects 'detection' to be a list/tuple of 4 numbers
                         if isinstance(detection, (list, tuple)) and len(detection) == 4:
                             x_min, x_max, y_min, y_max = detection
                             # Convert to standardized (x0, y0, x1, y1) format
@@ -161,6 +163,7 @@ class EasyOCREngine(OCREngine):
                                 f"Invalid number format in EasyOCR detect bbox: {detection}"
                             ) from e
                         else:
+                            # This is where the error is raised if 'detection' is not a list/tuple of 4 numbers
                             raise ValueError(f"Invalid detection format from EasyOCR: {detection}")
                     except ValueError as e:
                         # Re-raise any value errors from standardization or format checks
@@ -172,7 +175,7 @@ class EasyOCREngine(OCREngine):
                         ) from e
             else:
                 raise ValueError(
-                    f"Expected list of bounding boxes in detect_only mode, got: {raw_results}"
+                    f"Expected list of bounding boxes in detect_only mode, got: {type(raw_results)}"
                 )
 
         return standardized_regions
```
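The comments in this hunk describe the coordinate reshuffle: EasyOCR's detect-only output boxes are `[x_min, x_max, y_min, y_max]`, while the standardized bbox is `(x0, y0, x1, y1)`. A standalone illustration of that conversion (not library code):

```python
def easyocr_detect_bbox_to_xyxy(detection):
    """Reorder EasyOCR's [x_min, x_max, y_min, y_max] into (x0, y0, x1, y1)."""
    x_min, x_max, y_min, y_max = detection
    return (float(x_min), float(y_min), float(x_max), float(y_max))


assert easyocr_detect_bbox_to_xyxy([10, 50, 20, 40]) == (10.0, 20.0, 50.0, 40.0)
```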
natural_pdf/ocr/ocr_factory.py CHANGED

```diff
@@ -1,6 +1,6 @@
-import logging
 import importlib.util
-
+import logging
+from typing import Any, Dict, List, Optional, Type, Union
 
 from .engine import OCREngine
 
@@ -15,7 +15,7 @@ class OCRFactory:
         """Create and return an OCR engine instance.
 
         Args:
-            engine_type: One of 'surya', 'easyocr', 'paddle'
+            engine_type: One of 'surya', 'easyocr', 'paddle', 'doctr'
            **kwargs: Arguments to pass to the engine constructor
 
         Returns:
@@ -54,6 +54,16 @@ class OCRFactory:
                 "PaddleOCR engine requires 'paddleocr' and 'paddlepaddle'. "
                 "Install with: pip install paddleocr paddlepaddle"
             )
+        elif engine_type == "doctr":
+            try:
+                from .engine_doctr import DoctrOCREngine
+
+                return DoctrOCREngine(**kwargs)
+            except ImportError:
+                raise ImportError(
+                    "Doctr engine requires the 'python-doctr' package. "
+                    "Install with: pip install python-doctr[torch] or python-doctr[tf]"
+                )
         else:
             raise ValueError(f"Unknown engine type: {engine_type}")
 
@@ -85,13 +95,19 @@ class OCRFactory:
         except ImportError:
             engines["paddle"] = False
 
+        # Check Doctr
+        try:
+            engines["doctr"] = importlib.util.find_spec("doctr") is not None
+        except ImportError:
+            engines["doctr"] = False
+
         return engines
 
     @staticmethod
     def get_recommended_engine(**kwargs) -> OCREngine:
         """Returns the best available OCR engine based on what's installed.
 
-        First tries engines in order of preference: EasyOCR, Paddle, Surya.
+        First tries engines in order of preference: EasyOCR, Doctr, Paddle, Surya.
         If none are available, raises ImportError with installation instructions.
 
         Args:
@@ -109,6 +125,9 @@ class OCRFactory:
         if available.get("easyocr", False):
             logger.info("Using EasyOCR engine (recommended)")
             return OCRFactory.create_engine("easyocr", **kwargs)
+        elif available.get("doctr", False):
+            logger.info("Using Doctr engine")
+            return OCRFactory.create_engine("doctr", **kwargs)
         elif available.get("paddle", False):
             logger.info("Using PaddleOCR engine")
             return OCRFactory.create_engine("paddle", **kwargs)
@@ -120,6 +139,7 @@ class OCRFactory:
         raise ImportError(
             "No OCR engines available. Please install at least one of: \n"
             "- EasyOCR (recommended): pip install easyocr\n"
+            "- Doctr: pip install python-doctr[torch] or python-doctr[tf]\n"
             "- PaddleOCR: pip install paddleocr paddlepaddle\n"
             "- Surya OCR: pip install surya"
         )
```
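In practice the factory is the entry point for the new engine. A small sketch of both code paths touched above, using the module path from the file listing:

```python
from natural_pdf.ocr.ocr_factory import OCRFactory

# Request docTR explicitly; this raises ImportError with the install hint
# shown above if python-doctr is missing.
doctr_engine = OCRFactory.create_engine("doctr")

# Or let the factory pick, now preferring EasyOCR, then Doctr, then Paddle,
# then Surya, per the updated docstring.
engine = OCRFactory.get_recommended_engine()
```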
natural_pdf/ocr/ocr_manager.py CHANGED

```diff
@@ -1,17 +1,26 @@
 # ocr_manager.py
 import copy  # For deep copying options
 import logging
+import threading  # Import threading for lock
+import time  # Import time for timing
 from typing import Any, Dict, List, Optional, Type, Union
 
 from PIL import Image
 
 # Import engine classes and options
 from .engine import OCREngine
+from .engine_doctr import DoctrOCREngine
 from .engine_easyocr import EasyOCREngine
 from .engine_paddle import PaddleOCREngine
 from .engine_surya import SuryaOCREngine
-from .ocr_options import
-
+from .ocr_options import (
+    BaseOCROptions,
+    DoctrOCROptions,
+    EasyOCROptions,
+    OCROptions,
+    PaddleOCROptions,
+    SuryaOCROptions,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -23,37 +32,88 @@ class OCRManager:
     ENGINE_REGISTRY: Dict[str, Dict[str, Any]] = {
         "easyocr": {"class": EasyOCREngine, "options_class": EasyOCROptions},
         "paddle": {"class": PaddleOCREngine, "options_class": PaddleOCROptions},
-        "surya": {"class": SuryaOCREngine, "options_class": SuryaOCROptions},
+        "surya": {"class": SuryaOCREngine, "options_class": SuryaOCROptions},
+        "doctr": {"class": DoctrOCREngine, "options_class": DoctrOCROptions},
         # Add other engines here
     }
 
     def __init__(self):
         """Initializes the OCR Manager."""
         self._engine_instances: Dict[str, OCREngine] = {}  # Cache for engine instances
+        self._engine_locks: Dict[str, threading.Lock] = (
+            {}
+        )  # Lock per engine type for initialization
+        self._engine_inference_locks: Dict[str, threading.Lock] = (
+            {}
+        )  # Lock per engine type for inference
         logger.info("OCRManager initialized.")
 
     def _get_engine_instance(self, engine_name: str) -> OCREngine:
-        """Retrieves or creates an instance of the specified OCR engine."""
+        """Retrieves or creates an instance of the specified OCR engine, ensuring thread-safe initialization."""
         engine_name = engine_name.lower()
         if engine_name not in self.ENGINE_REGISTRY:
             raise ValueError(
                 f"Unknown OCR engine: '{engine_name}'. Available: {list(self.ENGINE_REGISTRY.keys())}"
             )
 
-        if
-
+        # Quick check if instance already exists (avoid lock contention)
+        if engine_name in self._engine_instances:
+            return self._engine_instances[engine_name]
+
+        # Get or create the lock for this engine type
+        if engine_name not in self._engine_locks:
+            self._engine_locks[engine_name] = threading.Lock()
+
+        engine_init_lock = self._engine_locks[engine_name]
+
+        # Acquire lock to safely check and potentially initialize the engine
+        with engine_init_lock:
+            # Double-check if another thread initialized it while we waited for the lock
+            if engine_name in self._engine_instances:
+                return self._engine_instances[engine_name]
+
+            # If still not initialized, create it now under the lock
+            logger.info(
+                f"[{threading.current_thread().name}] Creating shared instance of engine: {engine_name}"
+            )
             engine_class = self.ENGINE_REGISTRY[engine_name]["class"]
-
-
-
-
-
-
-
+            start_time = time.monotonic()  # Optional: time initialization
+            try:
+                engine_instance = engine_class()  # Instantiate first
+                if not engine_instance.is_available():
+                    # Check availability before storing
+                    install_hint = f"pip install 'natural-pdf[{engine_name}]'"
+                    raise RuntimeError(
+                        f"Engine '{engine_name}' is not available. Please install the required dependencies: {install_hint}"
+                    )
+                # Store the shared instance
+                self._engine_instances[engine_name] = engine_instance
+                end_time = time.monotonic()
+                logger.info(
+                    f"[{threading.current_thread().name}] Shared instance of {engine_name} created successfully (Duration: {end_time - start_time:.2f}s)."
                 )
-
+                return engine_instance
+            except Exception as e:
+                # Ensure we don't leave a partial state if init fails
+                logger.error(
+                    f"[{threading.current_thread().name}] Failed to create shared instance of {engine_name}: {e}",
+                    exc_info=True,
+                )
+                # Remove potentially partial entry if exists
+                if engine_name in self._engine_instances:
+                    del self._engine_instances[engine_name]
+                raise  # Re-raise the exception after logging
 
-
+    def _get_engine_inference_lock(self, engine_name: str) -> threading.Lock:
+        """Gets or creates the inference lock for a given engine type."""
+        engine_name = engine_name.lower()
+        # Assume engine_name is valid as it's checked before this would be called
+        if engine_name not in self._engine_inference_locks:
+            # Create lock if it doesn't exist (basic thread safety for dict access)
+            # A more robust approach might lock around this check/creation too,
+            # but contention here is less critical than for engine init or inference itself.
+            self._engine_inference_locks[engine_name] = threading.Lock()
+        return self._engine_inference_locks[engine_name]
 
     def apply_ocr(
         self,
@@ -72,7 +132,7 @@ class OCRManager:
 
         Args:
             images: A single PIL Image or a list of PIL Images to process.
-            engine: Name of the engine (e.g., 'easyocr', 'paddle', 'surya').
+            engine: Name of the engine (e.g., 'easyocr', 'paddle', 'surya', 'doctr').
                 Defaults to 'easyocr' if not specified.
             languages: List of language codes (e.g., ['en', 'fr'], ['en', 'german']).
                 **Passed directly to the engine.** Must be codes understood
@@ -127,21 +187,57 @@ class OCRManager:
         try:
             engine_instance = self._get_engine_instance(selected_engine_name)
             processing_mode = "batch" if is_batch else "single image"
-
+            # Log thread name for clarity during parallel calls
+            thread_id = threading.current_thread().name
+            logger.info(
+                f"[{thread_id}] Processing {processing_mode} using shared engine instance '{selected_engine_name}'..."
+            )
             logger.debug(
                 f"  Engine Args: languages={languages}, min_confidence={min_confidence}, device={device}, options={final_options}"
             )
 
-            #
-
-
-
-
-
-
-
+            # Log image dimensions before processing
+            if is_batch:
+                image_dims = [
+                    f"{img.width}x{img.height}"
+                    for img in images
+                    if hasattr(img, "width") and hasattr(img, "height")
+                ]
+                logger.debug(
+                    f"[{thread_id}] Processing batch of {len(images)} images with dimensions: {image_dims}"
+                )
+            elif hasattr(images, "width") and hasattr(images, "height"):
+                logger.debug(
+                    f"[{thread_id}] Processing single image with dimensions: {images.width}x{images.height}"
+                )
+            else:
+                logger.warning(f"[{thread_id}] Could not determine dimensions of input image(s).")
+
+            # Acquire lock specifically for the inference call
+            inference_lock = self._get_engine_inference_lock(selected_engine_name)
+            logger.debug(
+                f"[{thread_id}] Attempting to acquire inference lock for {selected_engine_name}..."
             )
+            inference_wait_start = time.monotonic()
+            with inference_lock:
+                inference_acquired_time = time.monotonic()
+                logger.debug(
+                    f"[{thread_id}] Acquired inference lock for {selected_engine_name} (waited {inference_acquired_time - inference_wait_start:.2f}s). Calling process_image..."
+                )
+                inference_start_time = time.monotonic()
+
+                results = engine_instance.process_image(
+                    images=images,
+                    languages=languages,
+                    min_confidence=min_confidence,
+                    device=device,
+                    detect_only=detect_only,
+                    options=final_options,
+                )
+                inference_end_time = time.monotonic()
+                logger.debug(
+                    f"[{thread_id}] process_image call finished for {selected_engine_name} (Duration: {inference_end_time - inference_start_time:.2f}s). Releasing lock."
+                )
 
             # Log result summary based on mode
             if is_batch:
```
if is_batch:
|