natural-pdf 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/categorizing-documents/index.md +168 -0
- docs/data-extraction/index.md +87 -0
- docs/element-selection/index.ipynb +218 -164
- docs/element-selection/index.md +20 -0
- docs/finetuning/index.md +176 -0
- docs/index.md +19 -0
- docs/ocr/index.md +63 -16
- docs/tutorials/01-loading-and-extraction.ipynb +411 -248
- docs/tutorials/02-finding-elements.ipynb +123 -46
- docs/tutorials/03-extracting-blocks.ipynb +24 -19
- docs/tutorials/04-table-extraction.ipynb +17 -12
- docs/tutorials/05-excluding-content.ipynb +37 -32
- docs/tutorials/06-document-qa.ipynb +36 -31
- docs/tutorials/07-layout-analysis.ipynb +45 -40
- docs/tutorials/07-working-with-regions.ipynb +61 -60
- docs/tutorials/08-spatial-navigation.ipynb +76 -71
- docs/tutorials/09-section-extraction.ipynb +160 -155
- docs/tutorials/10-form-field-extraction.ipynb +71 -66
- docs/tutorials/11-enhanced-table-processing.ipynb +11 -6
- docs/tutorials/12-ocr-integration.ipynb +3420 -312
- docs/tutorials/12-ocr-integration.md +68 -106
- docs/tutorials/13-semantic-search.ipynb +641 -251
- natural_pdf/__init__.py +3 -0
- natural_pdf/analyzers/layout/gemini.py +63 -47
- natural_pdf/classification/manager.py +343 -0
- natural_pdf/classification/mixin.py +149 -0
- natural_pdf/classification/results.py +62 -0
- natural_pdf/collections/mixins.py +63 -0
- natural_pdf/collections/pdf_collection.py +326 -17
- natural_pdf/core/element_manager.py +73 -4
- natural_pdf/core/page.py +255 -83
- natural_pdf/core/pdf.py +385 -367
- natural_pdf/elements/base.py +1 -3
- natural_pdf/elements/collections.py +279 -49
- natural_pdf/elements/region.py +106 -21
- natural_pdf/elements/text.py +5 -2
- natural_pdf/exporters/__init__.py +4 -0
- natural_pdf/exporters/base.py +61 -0
- natural_pdf/exporters/paddleocr.py +345 -0
- natural_pdf/extraction/manager.py +134 -0
- natural_pdf/extraction/mixin.py +246 -0
- natural_pdf/extraction/result.py +37 -0
- natural_pdf/ocr/__init__.py +16 -8
- natural_pdf/ocr/engine.py +46 -30
- natural_pdf/ocr/engine_easyocr.py +86 -42
- natural_pdf/ocr/engine_paddle.py +39 -28
- natural_pdf/ocr/engine_surya.py +32 -16
- natural_pdf/ocr/ocr_factory.py +34 -23
- natural_pdf/ocr/ocr_manager.py +98 -34
- natural_pdf/ocr/ocr_options.py +38 -10
- natural_pdf/ocr/utils.py +59 -33
- natural_pdf/qa/document_qa.py +0 -4
- natural_pdf/selectors/parser.py +363 -238
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +420 -0
- natural_pdf/utils/debug.py +4 -2
- natural_pdf/utils/identifiers.py +9 -5
- natural_pdf/utils/locks.py +8 -0
- natural_pdf/utils/packaging.py +172 -105
- natural_pdf/utils/text_extraction.py +96 -65
- natural_pdf/utils/tqdm_utils.py +43 -0
- natural_pdf/utils/visualization.py +1 -1
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/METADATA +10 -3
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/RECORD +66 -51
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/top_level.txt +0 -0
@@ -18,28 +18,31 @@ class EasyOCREngine(OCREngine):
|
|
18
18
|
def __init__(self):
|
19
19
|
super().__init__()
|
20
20
|
# No longer need _easyocr attribute
|
21
|
-
# self._easyocr = None
|
21
|
+
# self._easyocr = None
|
22
22
|
|
23
23
|
def is_available(self) -> bool:
|
24
24
|
"""Check if EasyOCR is installed."""
|
25
25
|
return importlib.util.find_spec("easyocr") is not None
|
26
26
|
|
27
|
-
def _initialize_model(
|
27
|
+
def _initialize_model(
|
28
|
+
self, languages: List[str], device: str, options: Optional[BaseOCROptions]
|
29
|
+
):
|
28
30
|
"""Initialize the EasyOCR model."""
|
29
31
|
# Import directly here
|
30
32
|
try:
|
31
33
|
import easyocr
|
34
|
+
|
32
35
|
self.logger.info("EasyOCR module imported successfully.")
|
33
36
|
except ImportError as e:
|
34
37
|
self.logger.error(f"Failed to import EasyOCR: {e}")
|
35
38
|
raise
|
36
|
-
|
39
|
+
|
37
40
|
# Cast to EasyOCROptions if possible, otherwise use default
|
38
41
|
easy_options = options if isinstance(options, EasyOCROptions) else EasyOCROptions()
|
39
|
-
|
42
|
+
|
40
43
|
# Prepare constructor arguments
|
41
44
|
use_gpu = "cuda" in device.lower() or "mps" in device.lower()
|
42
|
-
|
45
|
+
|
43
46
|
constructor_args = {
|
44
47
|
"lang_list": languages,
|
45
48
|
"gpu": use_gpu,
|
@@ -55,12 +58,12 @@ class EasyOCREngine(OCREngine):
|
|
55
58
|
"quantize": easy_options.quantize,
|
56
59
|
"cudnn_benchmark": easy_options.cudnn_benchmark,
|
57
60
|
}
|
58
|
-
|
61
|
+
|
59
62
|
# Filter out None values, as EasyOCR expects non-None or default behaviour
|
60
63
|
constructor_args = {k: v for k, v in constructor_args.items() if v is not None}
|
61
|
-
|
64
|
+
|
62
65
|
self.logger.debug(f"EasyOCR Reader constructor args: {constructor_args}")
|
63
|
-
|
66
|
+
|
64
67
|
# Create the reader
|
65
68
|
try:
|
66
69
|
self._model = easyocr.Reader(**constructor_args)
|
@@ -73,103 +76,144 @@ class EasyOCREngine(OCREngine):
|
|
73
76
|
"""Convert PIL Image to numpy array for EasyOCR."""
|
74
77
|
return np.array(image)
|
75
78
|
|
76
|
-
def _process_single_image(
|
79
|
+
def _process_single_image(
|
80
|
+
self, image: np.ndarray, detect_only: bool, options: Optional[EasyOCROptions]
|
81
|
+
) -> Any:
|
77
82
|
"""Process a single image with EasyOCR."""
|
78
83
|
if self._model is None:
|
79
84
|
raise RuntimeError("EasyOCR model not initialized")
|
80
|
-
|
85
|
+
|
81
86
|
# Cast options to proper type if provided
|
82
87
|
easy_options = options if isinstance(options, EasyOCROptions) else EasyOCROptions()
|
83
|
-
|
88
|
+
|
84
89
|
# Prepare readtext arguments (only needed if not detect_only)
|
85
90
|
readtext_args = {}
|
86
91
|
if not detect_only:
|
87
92
|
for param in [
|
88
|
-
"detail",
|
89
|
-
"
|
90
|
-
"
|
91
|
-
"
|
93
|
+
"detail",
|
94
|
+
"paragraph",
|
95
|
+
"min_size",
|
96
|
+
"contrast_ths",
|
97
|
+
"adjust_contrast",
|
98
|
+
"filter_ths",
|
99
|
+
"text_threshold",
|
100
|
+
"low_text",
|
101
|
+
"link_threshold",
|
102
|
+
"canvas_size",
|
103
|
+
"mag_ratio",
|
104
|
+
"slope_ths",
|
105
|
+
"ycenter_ths",
|
106
|
+
"height_ths",
|
107
|
+
"width_ths",
|
108
|
+
"y_ths",
|
109
|
+
"x_ths",
|
110
|
+
"add_margin",
|
111
|
+
"output_format",
|
92
112
|
]:
|
93
113
|
if hasattr(easy_options, param):
|
94
114
|
val = getattr(easy_options, param)
|
95
115
|
if val is not None:
|
96
116
|
readtext_args[param] = val
|
97
|
-
|
117
|
+
|
98
118
|
# Process differently based on detect_only flag
|
99
119
|
if detect_only:
|
100
120
|
# Returns tuple (horizontal_list, free_list)
|
101
121
|
# horizontal_list is a list containing one item: the list of boxes
|
102
122
|
# Each box is [[x1, y1], [x2, y1], [x2, y2], [x1, y2]]
|
103
|
-
bboxes_tuple = self._model.detect(
|
104
|
-
|
105
|
-
|
123
|
+
bboxes_tuple = self._model.detect(
|
124
|
+
image, **readtext_args
|
125
|
+
) # Pass args here too? Check EasyOCR docs if needed.
|
126
|
+
if (
|
127
|
+
bboxes_tuple
|
128
|
+
and isinstance(bboxes_tuple, tuple)
|
129
|
+
and len(bboxes_tuple) > 0
|
130
|
+
and isinstance(bboxes_tuple[0], list)
|
131
|
+
):
|
132
|
+
return bboxes_tuple[0] # Return the list of polygons directly
|
106
133
|
else:
|
107
134
|
self.logger.warning(f"EasyOCR detect returned unexpected format: {bboxes_tuple}")
|
108
|
-
return []
|
135
|
+
return [] # Return empty list on unexpected format
|
109
136
|
else:
|
110
137
|
return self._model.readtext(image, **readtext_args)
|
111
138
|
|
112
|
-
def _standardize_results(
|
139
|
+
def _standardize_results(
|
140
|
+
self, raw_results: Any, min_confidence: float, detect_only: bool
|
141
|
+
) -> List[TextRegion]:
|
113
142
|
"""Convert EasyOCR results to standardized TextRegion objects."""
|
114
143
|
standardized_regions = []
|
115
|
-
|
144
|
+
|
116
145
|
if detect_only:
|
146
|
+
results = raw_results[0]
|
117
147
|
# In detect_only mode, raw_results is already a list of bounding boxes
|
118
148
|
# Each bbox is in [x_min, x_max, y_min, y_max] format
|
119
|
-
if isinstance(raw_results, list):
|
120
|
-
for detection in raw_results:
|
149
|
+
if isinstance(results, list):
|
150
|
+
for detection in results:
|
121
151
|
try:
|
152
|
+
# This block expects 'detection' to be a list/tuple of 4 numbers
|
122
153
|
if isinstance(detection, (list, tuple)) and len(detection) == 4:
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
154
|
+
x_min, x_max, y_min, y_max = detection
|
155
|
+
# Convert to standardized (x0, y0, x1, y1) format
|
156
|
+
try:
|
157
|
+
bbox = (float(x_min), float(y_min), float(x_max), float(y_max))
|
158
|
+
standardized_regions.append(
|
159
|
+
TextRegion(bbox, text=None, confidence=None)
|
160
|
+
)
|
161
|
+
except (ValueError, TypeError) as e:
|
162
|
+
raise ValueError(
|
163
|
+
f"Invalid number format in EasyOCR detect bbox: {detection}"
|
164
|
+
) from e
|
130
165
|
else:
|
166
|
+
# This is where the error is raised if 'detection' is not a list/tuple of 4 numbers
|
131
167
|
raise ValueError(f"Invalid detection format from EasyOCR: {detection}")
|
132
168
|
except ValueError as e:
|
133
169
|
# Re-raise any value errors from standardization or format checks
|
134
170
|
raise e
|
135
171
|
except Exception as e:
|
136
172
|
# Catch other potential processing errors
|
137
|
-
raise ValueError(
|
173
|
+
raise ValueError(
|
174
|
+
f"Error processing EasyOCR detection item: {detection}"
|
175
|
+
) from e
|
138
176
|
else:
|
139
|
-
raise ValueError(
|
140
|
-
|
177
|
+
raise ValueError(
|
178
|
+
f"Expected list of bounding boxes in detect_only mode, got: {type(raw_results)}"
|
179
|
+
)
|
180
|
+
|
141
181
|
return standardized_regions
|
142
|
-
|
182
|
+
|
143
183
|
# Full OCR mode (readtext results)
|
144
184
|
for detection in raw_results:
|
145
185
|
try:
|
146
186
|
# Detail mode (list/tuple result)
|
147
187
|
if isinstance(detection, (list, tuple)) and len(detection) >= 3:
|
148
|
-
bbox_raw = detection[0]
|
188
|
+
bbox_raw = detection[0] # This is usually a polygon [[x1,y1],...]
|
149
189
|
text = str(detection[1])
|
150
190
|
confidence = float(detection[2])
|
151
|
-
|
191
|
+
|
152
192
|
if confidence >= min_confidence:
|
153
193
|
try:
|
154
194
|
# Use the standard helper for polygons
|
155
195
|
bbox = self._standardize_bbox(bbox_raw)
|
156
196
|
standardized_regions.append(TextRegion(bbox, text, confidence))
|
157
197
|
except ValueError as e:
|
158
|
-
raise ValueError(
|
159
|
-
|
198
|
+
raise ValueError(
|
199
|
+
f"Could not standardize bounding box from EasyOCR readtext: {bbox_raw}"
|
200
|
+
) from e
|
201
|
+
|
160
202
|
# Simple mode (string result)
|
161
203
|
elif isinstance(detection, str):
|
162
204
|
if 0.0 >= min_confidence: # Always include if min_confidence is 0
|
163
205
|
standardized_regions.append(TextRegion((0, 0, 0, 0), detection, 1.0))
|
164
206
|
else:
|
165
207
|
# Handle unexpected format in OCR mode
|
166
|
-
raise ValueError(
|
167
|
-
|
208
|
+
raise ValueError(
|
209
|
+
f"Invalid OCR detection format from EasyOCR readtext: {detection}"
|
210
|
+
)
|
211
|
+
|
168
212
|
except ValueError as e:
|
169
213
|
# Re-raise any value errors from standardization or format checks
|
170
214
|
raise e
|
171
215
|
except Exception as e:
|
172
216
|
# Catch other potential processing errors
|
173
217
|
raise ValueError(f"Error processing EasyOCR detection item: {detection}") from e
|
174
|
-
|
218
|
+
|
175
219
|
return standardized_regions
|
natural_pdf/ocr/engine_paddle.py
CHANGED
@@ -27,40 +27,43 @@ class PaddleOCREngine(OCREngine):
|
|
27
27
|
paddleocr_installed = importlib.util.find_spec("paddleocr") is not None
|
28
28
|
return paddle_installed and paddleocr_installed
|
29
29
|
|
30
|
-
def _initialize_model(
|
30
|
+
def _initialize_model(
|
31
|
+
self, languages: List[str], device: str, options: Optional[BaseOCROptions]
|
32
|
+
):
|
31
33
|
"""Initialize the PaddleOCR model."""
|
32
34
|
try:
|
33
|
-
import paddleocr
|
35
|
+
import paddleocr
|
36
|
+
|
34
37
|
self.logger.info("PaddleOCR module imported successfully.")
|
35
38
|
except ImportError as e:
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
+
self.logger.error(f"Failed to import PaddleOCR/PaddlePaddle: {e}")
|
40
|
+
raise
|
41
|
+
|
39
42
|
# Cast to PaddleOCROptions if possible
|
40
43
|
paddle_options = options if isinstance(options, PaddleOCROptions) else PaddleOCROptions()
|
41
|
-
|
44
|
+
|
42
45
|
# Determine parameters
|
43
46
|
primary_lang = languages[0] if languages else "en"
|
44
47
|
use_gpu = "cuda" in str(device).lower()
|
45
|
-
|
48
|
+
|
46
49
|
# Create constructor arguments
|
47
50
|
constructor_args = {
|
48
51
|
"lang": primary_lang,
|
49
52
|
"use_gpu": use_gpu,
|
50
53
|
"use_angle_cls": paddle_options.use_angle_cls,
|
51
|
-
"det": True,
|
52
|
-
"rec": True # We'll control recognition at process time
|
54
|
+
"det": True,
|
55
|
+
"rec": True, # We'll control recognition at process time
|
53
56
|
}
|
54
|
-
|
57
|
+
|
55
58
|
# Add optional parameters if available
|
56
59
|
for param in ["det_model_dir", "rec_model_dir", "cls_model_dir", "show_log", "use_onnx"]:
|
57
60
|
if hasattr(paddle_options, param):
|
58
61
|
val = getattr(paddle_options, param)
|
59
62
|
if val is not None:
|
60
63
|
constructor_args[param] = val
|
61
|
-
|
64
|
+
|
62
65
|
self.logger.debug(f"PaddleOCR constructor args: {constructor_args}")
|
63
|
-
|
66
|
+
|
64
67
|
# Create the model
|
65
68
|
try:
|
66
69
|
self._model = paddleocr.PaddleOCR(**constructor_args)
|
@@ -78,31 +81,35 @@ class PaddleOCREngine(OCREngine):
|
|
78
81
|
img_array_bgr = img_array_rgb[:, :, ::-1] # Convert RGB to BGR
|
79
82
|
return img_array_bgr
|
80
83
|
|
81
|
-
def _process_single_image(
|
84
|
+
def _process_single_image(
|
85
|
+
self, image: np.ndarray, detect_only: bool, options: Optional[PaddleOCROptions]
|
86
|
+
) -> Any:
|
82
87
|
"""Process a single image with PaddleOCR."""
|
83
88
|
if self._model is None:
|
84
89
|
raise RuntimeError("PaddleOCR model not initialized")
|
85
|
-
|
90
|
+
|
86
91
|
# Prepare OCR arguments
|
87
92
|
ocr_args = {}
|
88
93
|
if options and isinstance(options, PaddleOCROptions):
|
89
94
|
ocr_args["cls"] = options.cls if options.cls is not None else options.use_angle_cls
|
90
95
|
ocr_args["det"] = options.det
|
91
96
|
ocr_args["rec"] = not detect_only # Control recognition based on detect_only flag
|
92
|
-
|
97
|
+
|
93
98
|
# Run OCR
|
94
99
|
raw_results = self._model.ocr(image, **ocr_args)
|
95
100
|
return raw_results
|
96
101
|
|
97
|
-
def _standardize_results(
|
102
|
+
def _standardize_results(
|
103
|
+
self, raw_results: Any, min_confidence: float, detect_only: bool
|
104
|
+
) -> List[TextRegion]:
|
98
105
|
"""Convert PaddleOCR results to standardized TextRegion objects."""
|
99
106
|
standardized_regions = []
|
100
|
-
|
107
|
+
|
101
108
|
if not raw_results or not isinstance(raw_results, list) or len(raw_results) == 0:
|
102
109
|
return standardized_regions
|
103
|
-
|
110
|
+
|
104
111
|
page_results = raw_results[0] if raw_results[0] is not None else []
|
105
|
-
|
112
|
+
|
106
113
|
for detection in page_results:
|
107
114
|
# Initialize text and confidence
|
108
115
|
text = None
|
@@ -111,20 +118,22 @@ class PaddleOCREngine(OCREngine):
|
|
111
118
|
|
112
119
|
# Paddle always seems to return the tuple structure [bbox, (text, conf)]
|
113
120
|
# even if rec=False. We need to parse this structure regardless.
|
114
|
-
if len(detection) == 4:
|
115
|
-
|
121
|
+
if len(detection) == 4: # Handle potential alternative format?
|
122
|
+
detection = [detection, ("", 1.0)] # Treat as bbox + dummy text/conf
|
116
123
|
|
117
124
|
if not isinstance(detection, (list, tuple)) or len(detection) < 2:
|
118
125
|
raise ValueError(f"Invalid detection format from PaddleOCR: {detection}")
|
119
|
-
|
126
|
+
|
120
127
|
bbox_raw = detection[0]
|
121
128
|
text_confidence = detection[1]
|
122
|
-
|
129
|
+
|
123
130
|
if not isinstance(text_confidence, tuple) or len(text_confidence) < 2:
|
124
|
-
# Even if detect_only, we expect the (text, conf) structure,
|
131
|
+
# Even if detect_only, we expect the (text, conf) structure,
|
125
132
|
# it might just contain dummy values.
|
126
|
-
raise ValueError(
|
127
|
-
|
133
|
+
raise ValueError(
|
134
|
+
f"Invalid text/confidence structure from PaddleOCR: {text_confidence}"
|
135
|
+
)
|
136
|
+
|
128
137
|
# Extract text/conf only if not detect_only
|
129
138
|
if not detect_only:
|
130
139
|
text = str(text_confidence[0])
|
@@ -134,7 +143,9 @@ class PaddleOCREngine(OCREngine):
|
|
134
143
|
try:
|
135
144
|
bbox = self._standardize_bbox(bbox_raw)
|
136
145
|
except ValueError as e:
|
137
|
-
raise ValueError(
|
146
|
+
raise ValueError(
|
147
|
+
f"Could not standardize bounding box from PaddleOCR: {bbox_raw}"
|
148
|
+
) from e
|
138
149
|
|
139
150
|
# Append based on mode
|
140
151
|
if detect_only:
|
@@ -143,5 +154,5 @@ class PaddleOCREngine(OCREngine):
|
|
143
154
|
elif confidence >= min_confidence:
|
144
155
|
# Only append if confidence meets threshold in full OCR mode
|
145
156
|
standardized_regions.append(TextRegion(bbox, text, confidence))
|
146
|
-
|
157
|
+
|
147
158
|
return standardized_regions
|
natural_pdf/ocr/engine_surya.py
CHANGED
@@ -20,14 +20,16 @@ class SuryaOCREngine(OCREngine):
|
|
20
20
|
self._surya_recognition = None
|
21
21
|
self._surya_detection = None
|
22
22
|
|
23
|
-
def _initialize_model(
|
23
|
+
def _initialize_model(
|
24
|
+
self, languages: List[str], device: str, options: Optional[BaseOCROptions]
|
25
|
+
):
|
24
26
|
"""Initialize Surya predictors."""
|
25
27
|
if not self.is_available():
|
26
28
|
raise ImportError("Surya OCR library is not installed or available.")
|
27
29
|
|
28
30
|
# Store languages for use in _process_single_image
|
29
31
|
self._langs = languages
|
30
|
-
|
32
|
+
|
31
33
|
from surya.detection import DetectionPredictor
|
32
34
|
from surya.recognition import RecognitionPredictor
|
33
35
|
|
@@ -41,21 +43,27 @@ class SuryaOCREngine(OCREngine):
|
|
41
43
|
self._detection_predictor = self._surya_detection(**predictor_args)
|
42
44
|
self.logger.info("Instantiating Surya RecognitionPredictor...")
|
43
45
|
self._recognition_predictor = self._surya_recognition(**predictor_args)
|
44
|
-
|
46
|
+
|
45
47
|
self.logger.info("Surya predictors initialized.")
|
46
48
|
|
47
49
|
def _preprocess_image(self, image: Image.Image) -> Image.Image:
|
48
50
|
"""Surya uses PIL images directly, so just return the image."""
|
49
51
|
return image
|
50
52
|
|
51
|
-
def _process_single_image(
|
53
|
+
def _process_single_image(
|
54
|
+
self, image: Image.Image, detect_only: bool, options: Optional[SuryaOCROptions]
|
55
|
+
) -> Any:
|
52
56
|
"""Process a single image with Surya OCR."""
|
53
57
|
if not self._recognition_predictor or not self._detection_predictor:
|
54
58
|
raise RuntimeError("Surya predictors are not initialized.")
|
55
59
|
|
56
60
|
# Store languages instance variable during initialization to use here
|
57
|
-
langs =
|
58
|
-
|
61
|
+
langs = (
|
62
|
+
[[lang] for lang in self._langs]
|
63
|
+
if hasattr(self, "_langs")
|
64
|
+
else [[self.DEFAULT_LANGUAGES[0]]]
|
65
|
+
)
|
66
|
+
|
59
67
|
# Surya expects lists of images, so we need to wrap our single image
|
60
68
|
if detect_only:
|
61
69
|
results = self._detection_predictor(images=[image])
|
@@ -63,33 +71,41 @@ class SuryaOCREngine(OCREngine):
|
|
63
71
|
results = self._recognition_predictor(
|
64
72
|
images=[image],
|
65
73
|
langs=langs, # Use the languages set during initialization
|
66
|
-
det_predictor=self._detection_predictor
|
74
|
+
det_predictor=self._detection_predictor,
|
67
75
|
)
|
68
|
-
|
76
|
+
|
69
77
|
# Surya may return a list with one result per image or a single result object
|
70
78
|
# Return the result as-is and handle the extraction in _standardize_results
|
71
79
|
return results
|
72
80
|
|
73
|
-
def _standardize_results(
|
81
|
+
def _standardize_results(
|
82
|
+
self, raw_results: Any, min_confidence: float, detect_only: bool
|
83
|
+
) -> List[TextRegion]:
|
74
84
|
"""Convert Surya results to standardized TextRegion objects."""
|
75
85
|
standardized_regions = []
|
76
|
-
|
86
|
+
|
77
87
|
raw_result = raw_results
|
78
88
|
if isinstance(raw_results, list) and len(raw_results) > 0:
|
79
89
|
raw_result = raw_results[0]
|
80
|
-
|
81
|
-
results =
|
90
|
+
|
91
|
+
results = (
|
92
|
+
raw_result.text_lines
|
93
|
+
if hasattr(raw_result, "text_lines") and not detect_only
|
94
|
+
else raw_result.bboxes
|
95
|
+
)
|
82
96
|
|
83
97
|
for line in results:
|
84
98
|
# Always extract bbox first
|
85
99
|
try:
|
86
100
|
# Prioritize line.bbox, fallback to line.polygon
|
87
|
-
bbox_raw = line.bbox if hasattr(line,
|
101
|
+
bbox_raw = line.bbox if hasattr(line, "bbox") else getattr(line, "polygon", None)
|
88
102
|
if bbox_raw is None:
|
89
|
-
|
103
|
+
raise ValueError("Missing bbox/polygon data")
|
90
104
|
bbox = self._standardize_bbox(bbox_raw)
|
91
105
|
except ValueError as e:
|
92
|
-
raise ValueError(
|
106
|
+
raise ValueError(
|
107
|
+
f"Could not standardize bounding box from Surya result: {bbox_raw}"
|
108
|
+
) from e
|
93
109
|
|
94
110
|
if detect_only:
|
95
111
|
# For detect_only, text and confidence are None
|
@@ -100,7 +116,7 @@ class SuryaOCREngine(OCREngine):
|
|
100
116
|
confidence = line.confidence
|
101
117
|
if confidence >= min_confidence:
|
102
118
|
standardized_regions.append(TextRegion(bbox, text, confidence))
|
103
|
-
|
119
|
+
|
104
120
|
return standardized_regions
|
105
121
|
|
106
122
|
def is_available(self) -> bool:
|
natural_pdf/ocr/ocr_factory.py
CHANGED
@@ -13,14 +13,14 @@ class OCRFactory:
|
|
13
13
|
@staticmethod
|
14
14
|
def create_engine(engine_type: str, **kwargs) -> OCREngine:
|
15
15
|
"""Create and return an OCR engine instance.
|
16
|
-
|
16
|
+
|
17
17
|
Args:
|
18
18
|
engine_type: One of 'surya', 'easyocr', 'paddle'
|
19
19
|
**kwargs: Arguments to pass to the engine constructor
|
20
|
-
|
20
|
+
|
21
21
|
Returns:
|
22
22
|
An initialized OCR engine
|
23
|
-
|
23
|
+
|
24
24
|
Raises:
|
25
25
|
ImportError: If the required dependencies aren't installed
|
26
26
|
ValueError: If the engine_type is unknown
|
@@ -28,72 +28,83 @@ class OCRFactory:
|
|
28
28
|
if engine_type == "surya":
|
29
29
|
try:
|
30
30
|
from .engine_surya import SuryaOCREngine
|
31
|
+
|
31
32
|
return SuryaOCREngine(**kwargs)
|
32
33
|
except ImportError:
|
33
|
-
raise ImportError(
|
34
|
-
|
34
|
+
raise ImportError(
|
35
|
+
"Surya engine requires the 'surya' package. " "Install with: pip install surya"
|
36
|
+
)
|
35
37
|
elif engine_type == "easyocr":
|
36
38
|
try:
|
37
39
|
from .engine_easyocr import EasyOCREngine
|
40
|
+
|
38
41
|
return EasyOCREngine(**kwargs)
|
39
42
|
except ImportError:
|
40
|
-
raise ImportError(
|
41
|
-
|
43
|
+
raise ImportError(
|
44
|
+
"EasyOCR engine requires the 'easyocr' package. "
|
45
|
+
"Install with: pip install easyocr"
|
46
|
+
)
|
42
47
|
elif engine_type == "paddle":
|
43
48
|
try:
|
44
49
|
from .engine_paddle import PaddleOCREngine
|
50
|
+
|
45
51
|
return PaddleOCREngine(**kwargs)
|
46
52
|
except ImportError:
|
47
|
-
raise ImportError(
|
48
|
-
|
53
|
+
raise ImportError(
|
54
|
+
"PaddleOCR engine requires 'paddleocr' and 'paddlepaddle'. "
|
55
|
+
"Install with: pip install paddleocr paddlepaddle"
|
56
|
+
)
|
49
57
|
else:
|
50
58
|
raise ValueError(f"Unknown engine type: {engine_type}")
|
51
|
-
|
59
|
+
|
52
60
|
@staticmethod
|
53
61
|
def list_available_engines() -> Dict[str, bool]:
|
54
62
|
"""Returns a dictionary of engine names and their availability status."""
|
55
63
|
engines = {}
|
56
|
-
|
64
|
+
|
57
65
|
# Check Surya
|
58
66
|
try:
|
59
67
|
engines["surya"] = importlib.util.find_spec("surya") is not None
|
60
68
|
except ImportError:
|
61
69
|
engines["surya"] = False
|
62
|
-
|
70
|
+
|
63
71
|
# Check EasyOCR
|
64
72
|
try:
|
65
73
|
engines["easyocr"] = importlib.util.find_spec("easyocr") is not None
|
66
74
|
except ImportError:
|
67
75
|
engines["easyocr"] = False
|
68
|
-
|
76
|
+
|
69
77
|
# Check PaddleOCR
|
70
78
|
try:
|
71
|
-
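paddle = importlib.util.find_spec("paddle") is not None or importlib.util.find_spec("paddlepaddle") is not None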
|
79
|
+
paddle = (
|
80
|
+
importlib.util.find_spec("paddle") is not None
|
81
|
+
or importlib.util.find_spec("paddlepaddle") is not None
|
82
|
+
)
|
72
83
|
paddleocr = importlib.util.find_spec("paddleocr") is not None
|
73
84
|
engines["paddle"] = paddle and paddleocr
|
74
85
|
except ImportError:
|
75
86
|
engines["paddle"] = False
|
76
|
-
|
87
|
+
|
77
88
|
return engines
|
78
|
-
|
89
|
+
|
79
90
|
@staticmethod
|
80
91
|
def get_recommended_engine(**kwargs) -> OCREngine:
|
81
92
|
"""Returns the best available OCR engine based on what's installed.
|
82
|
-
|
93
|
+
|
83
94
|
First tries engines in order of preference: EasyOCR, Paddle, Surya.
|
84
95
|
If none are available, raises ImportError with installation instructions.
|
85
|
-
|
96
|
+
|
86
97
|
Args:
|
87
98
|
**kwargs: Arguments to pass to the engine constructor
|
88
|
-
|
99
|
+
|
89
100
|
Returns:
|
90
101
|
The best available OCR engine instance
|
91
|
-
|
102
|
+
|
92
103
|
Raises:
|
93
104
|
ImportError: If no engines are available
|
94
105
|
"""
|
95
106
|
available = OCRFactory.list_available_engines()
|
96
|
-
|
107
|
+
|
97
108
|
# Try engines in order of recommendation
|
98
109
|
if available.get("easyocr", False):
|
99
110
|
logger.info("Using EasyOCR engine (recommended)")
|
@@ -104,11 +115,11 @@ class OCRFactory:
|
|
104
115
|
elif available.get("surya", False):
|
105
116
|
logger.info("Using Surya OCR engine")
|
106
117
|
return OCRFactory.create_engine("surya", **kwargs)
|
107
|
-
|
118
|
+
|
108
119
|
# If we get here, no engines are available
|
109
120
|
raise ImportError(
|
110
121
|
"No OCR engines available. Please install at least one of: \n"
|
111
122
|
"- EasyOCR (recommended): pip install easyocr\n"
|
112
123
|
"- PaddleOCR: pip install paddleocr paddlepaddle\n"
|
113
124
|
"- Surya OCR: pip install surya"
|
114
|
-
)
|
125
|
+
)
|