natural-pdf 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/finetuning/index.md +176 -0
- docs/tutorials/01-loading-and-extraction.ipynb +34 -1550
- natural_pdf/__init__.py +1 -0
- natural_pdf/analyzers/layout/gemini.py +63 -47
- natural_pdf/collections/pdf_collection.py +5 -2
- natural_pdf/core/element_manager.py +6 -4
- natural_pdf/core/page.py +36 -27
- natural_pdf/core/pdf.py +25 -16
- natural_pdf/elements/base.py +1 -3
- natural_pdf/elements/collections.py +13 -14
- natural_pdf/elements/region.py +7 -6
- natural_pdf/exporters/__init__.py +4 -0
- natural_pdf/exporters/base.py +61 -0
- natural_pdf/exporters/paddleocr.py +345 -0
- natural_pdf/ocr/__init__.py +16 -8
- natural_pdf/ocr/engine.py +46 -30
- natural_pdf/ocr/engine_easyocr.py +81 -40
- natural_pdf/ocr/engine_paddle.py +39 -28
- natural_pdf/ocr/engine_surya.py +32 -16
- natural_pdf/ocr/ocr_factory.py +34 -23
- natural_pdf/ocr/ocr_manager.py +15 -11
- natural_pdf/ocr/ocr_options.py +5 -0
- natural_pdf/ocr/utils.py +46 -31
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +415 -0
- natural_pdf/utils/debug.py +4 -2
- natural_pdf/utils/identifiers.py +9 -5
- natural_pdf/utils/packaging.py +172 -105
- natural_pdf/utils/text_extraction.py +44 -64
- natural_pdf/utils/visualization.py +1 -1
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/METADATA +5 -3
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/RECORD +34 -30
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/top_level.txt +0 -0
natural_pdf/ocr/engine.py
CHANGED
@@ -13,11 +13,17 @@ logger = logging.getLogger(__name__)
|
|
13
13
|
|
14
14
|
class TextRegion:
|
15
15
|
"""Standard representation of an OCR text region."""
|
16
|
-
|
17
|
-
def __init__(
|
16
|
+
|
17
|
+
def __init__(
|
18
|
+
self,
|
19
|
+
bbox: Tuple[float, float, float, float],
|
20
|
+
text: str,
|
21
|
+
confidence: float,
|
22
|
+
source: str = "ocr",
|
23
|
+
):
|
18
24
|
"""
|
19
25
|
Initialize a text region.
|
20
|
-
|
26
|
+
|
21
27
|
Args:
|
22
28
|
bbox: Tuple of (x0, y0, x1, y1) coordinates
|
23
29
|
text: The recognized text
|
@@ -28,7 +34,7 @@ class TextRegion:
|
|
28
34
|
self.text = text
|
29
35
|
self.confidence = confidence
|
30
36
|
self.source = source
|
31
|
-
|
37
|
+
|
32
38
|
@classmethod
|
33
39
|
def from_polygon(cls, polygon: List[List[float]], text: str, confidence: float):
|
34
40
|
"""Create from polygon coordinates [[x1,y1], [x2,y2], ...]"""
|
@@ -36,24 +42,24 @@ class TextRegion:
|
|
36
42
|
y_coords = [float(point[1]) for point in polygon]
|
37
43
|
bbox = (min(x_coords), min(y_coords), max(x_coords), max(y_coords))
|
38
44
|
return cls(bbox, text, confidence)
|
39
|
-
|
45
|
+
|
40
46
|
def to_dict(self) -> Dict[str, Any]:
|
41
47
|
"""Convert to dictionary representation for compatibility."""
|
42
48
|
return {
|
43
49
|
"bbox": self.bbox,
|
44
50
|
"text": self.text,
|
45
51
|
"confidence": self.confidence,
|
46
|
-
"source": self.source
|
52
|
+
"source": self.source,
|
47
53
|
}
|
48
54
|
|
49
55
|
|
50
56
|
class OCREngine(ABC):
|
51
57
|
"""Abstract Base Class for OCR engines."""
|
52
|
-
|
58
|
+
|
53
59
|
# Default values as class constants
|
54
60
|
DEFAULT_MIN_CONFIDENCE = 0.2
|
55
|
-
DEFAULT_LANGUAGES = [
|
56
|
-
DEFAULT_DEVICE =
|
61
|
+
DEFAULT_LANGUAGES = ["en"]
|
62
|
+
DEFAULT_DEVICE = "cpu"
|
57
63
|
|
58
64
|
def __init__(self):
|
59
65
|
"""Initializes the base OCR engine."""
|
@@ -74,7 +80,7 @@ class OCREngine(ABC):
|
|
74
80
|
) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]:
|
75
81
|
"""
|
76
82
|
Process a single image or batch of images with OCR.
|
77
|
-
|
83
|
+
|
78
84
|
Args:
|
79
85
|
images: A single PIL Image or a list of PIL Images
|
80
86
|
languages: List of languages to use (default: ['en'])
|
@@ -82,7 +88,7 @@ class OCREngine(ABC):
|
|
82
88
|
device: Device to use for processing (default: 'cpu')
|
83
89
|
detect_only: Whether to only detect text regions without recognition
|
84
90
|
options: Engine-specific options
|
85
|
-
|
91
|
+
|
86
92
|
Returns:
|
87
93
|
For a single image: List of text region dictionaries
|
88
94
|
For a batch: List of lists of text region dictionaries
|
@@ -90,42 +96,48 @@ class OCREngine(ABC):
|
|
90
96
|
# Convert single image to batch format
|
91
97
|
single_image = not isinstance(images, list)
|
92
98
|
image_batch = [images] if single_image else images
|
93
|
-
|
99
|
+
|
94
100
|
# Use default values where parameters are not provided
|
95
101
|
effective_languages = languages or self.DEFAULT_LANGUAGES
|
96
|
-
effective_confidence =
|
102
|
+
effective_confidence = (
|
103
|
+
min_confidence if min_confidence is not None else self.DEFAULT_MIN_CONFIDENCE
|
104
|
+
)
|
97
105
|
effective_device = device or self.DEFAULT_DEVICE
|
98
|
-
|
106
|
+
|
99
107
|
# Ensure the model is initialized
|
100
108
|
self._ensure_initialized(effective_languages, effective_device, options)
|
101
|
-
|
109
|
+
|
102
110
|
# Process each image in the batch
|
103
111
|
results = []
|
104
112
|
for img in image_batch:
|
105
113
|
# Preprocess the image for the specific engine
|
106
114
|
processed_img = self._preprocess_image(img)
|
107
|
-
|
115
|
+
|
108
116
|
# Process the image with the engine-specific implementation
|
109
117
|
raw_results = self._process_single_image(processed_img, detect_only, options)
|
110
|
-
|
118
|
+
|
111
119
|
# Convert results to standardized format
|
112
120
|
text_regions = self._standardize_results(raw_results, effective_confidence, detect_only)
|
113
|
-
|
121
|
+
|
114
122
|
# Convert TextRegion objects to dictionaries for backward compatibility
|
115
123
|
region_dicts = [region.to_dict() for region in text_regions]
|
116
124
|
results.append(region_dicts)
|
117
|
-
|
125
|
+
|
118
126
|
# Return results in the appropriate format
|
119
127
|
return results[0] if single_image else results
|
120
128
|
|
121
|
-
def _ensure_initialized(
|
129
|
+
def _ensure_initialized(
|
130
|
+
self, languages: List[str], device: str, options: Optional[BaseOCROptions]
|
131
|
+
):
|
122
132
|
"""Ensure the model is initialized with the correct parameters."""
|
123
133
|
if not self._initialized:
|
124
134
|
self._initialize_model(languages, device, options)
|
125
135
|
self._initialized = True
|
126
|
-
|
136
|
+
|
127
137
|
@abstractmethod
|
128
|
-
def _initialize_model(
|
138
|
+
def _initialize_model(
|
139
|
+
self, languages: List[str], device: str, options: Optional[BaseOCROptions]
|
140
|
+
):
|
129
141
|
"""Initialize the OCR model with the given parameters."""
|
130
142
|
raise NotImplementedError("Subclasses must implement this method")
|
131
143
|
|
@@ -133,14 +145,18 @@ class OCREngine(ABC):
|
|
133
145
|
def _preprocess_image(self, image: Image.Image) -> Any:
|
134
146
|
"""Convert PIL Image to engine-specific format."""
|
135
147
|
raise NotImplementedError("Subclasses must implement this method")
|
136
|
-
|
148
|
+
|
137
149
|
@abstractmethod
|
138
|
-
def _process_single_image(
|
150
|
+
def _process_single_image(
|
151
|
+
self, image: Any, detect_only: bool, options: Optional[BaseOCROptions]
|
152
|
+
) -> Any:
|
139
153
|
"""Process a single image with the initialized model."""
|
140
154
|
raise NotImplementedError("Subclasses must implement this method")
|
141
|
-
|
155
|
+
|
142
156
|
@abstractmethod
|
143
|
-
def _standardize_results(
|
157
|
+
def _standardize_results(
|
158
|
+
self, raw_results: Any, min_confidence: float, detect_only: bool
|
159
|
+
) -> List[TextRegion]:
|
144
160
|
"""Convert engine-specific results to standardized TextRegion objects."""
|
145
161
|
raise NotImplementedError("Subclasses must implement this method")
|
146
162
|
|
@@ -181,23 +197,23 @@ class OCREngine(ABC):
|
|
181
197
|
return tuple(float(c) for c in bbox[:4])
|
182
198
|
except (ValueError, TypeError) as e:
|
183
199
|
raise ValueError(f"Invalid number format in bbox: {bbox}") from e
|
184
|
-
|
200
|
+
|
185
201
|
# Check if it's in polygon format [[x1,y1],[x2,y2],...]
|
186
202
|
elif (
|
187
203
|
isinstance(bbox, (list, tuple))
|
188
204
|
and len(bbox) > 0
|
189
205
|
and isinstance(bbox[0], (list, tuple))
|
190
|
-
and len(bbox[0]) == 2
|
206
|
+
and len(bbox[0]) == 2 # Ensure points are pairs
|
191
207
|
):
|
192
208
|
try:
|
193
209
|
x_coords = [float(point[0]) for point in bbox]
|
194
210
|
y_coords = [float(point[1]) for point in bbox]
|
195
|
-
if not x_coords or not y_coords:
|
211
|
+
if not x_coords or not y_coords: # Handle empty polygon case
|
196
212
|
raise ValueError("Empty polygon provided")
|
197
213
|
return (min(x_coords), min(y_coords), max(x_coords), max(y_coords))
|
198
214
|
except (ValueError, TypeError, IndexError) as e:
|
199
215
|
raise ValueError(f"Invalid polygon format or values: {bbox}") from e
|
200
|
-
|
216
|
+
|
201
217
|
# If it's neither format, raise an error
|
202
218
|
raise ValueError(f"Could not standardize bounding box from unexpected format: {bbox}")
|
203
219
|
|
@@ -18,28 +18,31 @@ class EasyOCREngine(OCREngine):
|
|
18
18
|
def __init__(self):
|
19
19
|
super().__init__()
|
20
20
|
# No longer need _easyocr attribute
|
21
|
-
# self._easyocr = None
|
21
|
+
# self._easyocr = None
|
22
22
|
|
23
23
|
def is_available(self) -> bool:
|
24
24
|
"""Check if EasyOCR is installed."""
|
25
25
|
return importlib.util.find_spec("easyocr") is not None
|
26
26
|
|
27
|
-
def _initialize_model(
|
27
|
+
def _initialize_model(
|
28
|
+
self, languages: List[str], device: str, options: Optional[BaseOCROptions]
|
29
|
+
):
|
28
30
|
"""Initialize the EasyOCR model."""
|
29
31
|
# Import directly here
|
30
32
|
try:
|
31
33
|
import easyocr
|
34
|
+
|
32
35
|
self.logger.info("EasyOCR module imported successfully.")
|
33
36
|
except ImportError as e:
|
34
37
|
self.logger.error(f"Failed to import EasyOCR: {e}")
|
35
38
|
raise
|
36
|
-
|
39
|
+
|
37
40
|
# Cast to EasyOCROptions if possible, otherwise use default
|
38
41
|
easy_options = options if isinstance(options, EasyOCROptions) else EasyOCROptions()
|
39
|
-
|
42
|
+
|
40
43
|
# Prepare constructor arguments
|
41
44
|
use_gpu = "cuda" in device.lower() or "mps" in device.lower()
|
42
|
-
|
45
|
+
|
43
46
|
constructor_args = {
|
44
47
|
"lang_list": languages,
|
45
48
|
"gpu": use_gpu,
|
@@ -55,12 +58,12 @@ class EasyOCREngine(OCREngine):
|
|
55
58
|
"quantize": easy_options.quantize,
|
56
59
|
"cudnn_benchmark": easy_options.cudnn_benchmark,
|
57
60
|
}
|
58
|
-
|
61
|
+
|
59
62
|
# Filter out None values, as EasyOCR expects non-None or default behaviour
|
60
63
|
constructor_args = {k: v for k, v in constructor_args.items() if v is not None}
|
61
|
-
|
64
|
+
|
62
65
|
self.logger.debug(f"EasyOCR Reader constructor args: {constructor_args}")
|
63
|
-
|
66
|
+
|
64
67
|
# Create the reader
|
65
68
|
try:
|
66
69
|
self._model = easyocr.Reader(**constructor_args)
|
@@ -73,46 +76,72 @@ class EasyOCREngine(OCREngine):
|
|
73
76
|
"""Convert PIL Image to numpy array for EasyOCR."""
|
74
77
|
return np.array(image)
|
75
78
|
|
76
|
-
def _process_single_image(
|
79
|
+
def _process_single_image(
|
80
|
+
self, image: np.ndarray, detect_only: bool, options: Optional[EasyOCROptions]
|
81
|
+
) -> Any:
|
77
82
|
"""Process a single image with EasyOCR."""
|
78
83
|
if self._model is None:
|
79
84
|
raise RuntimeError("EasyOCR model not initialized")
|
80
|
-
|
85
|
+
|
81
86
|
# Cast options to proper type if provided
|
82
87
|
easy_options = options if isinstance(options, EasyOCROptions) else EasyOCROptions()
|
83
|
-
|
88
|
+
|
84
89
|
# Prepare readtext arguments (only needed if not detect_only)
|
85
90
|
readtext_args = {}
|
86
91
|
if not detect_only:
|
87
92
|
for param in [
|
88
|
-
"detail",
|
89
|
-
"
|
90
|
-
"
|
91
|
-
"
|
93
|
+
"detail",
|
94
|
+
"paragraph",
|
95
|
+
"min_size",
|
96
|
+
"contrast_ths",
|
97
|
+
"adjust_contrast",
|
98
|
+
"filter_ths",
|
99
|
+
"text_threshold",
|
100
|
+
"low_text",
|
101
|
+
"link_threshold",
|
102
|
+
"canvas_size",
|
103
|
+
"mag_ratio",
|
104
|
+
"slope_ths",
|
105
|
+
"ycenter_ths",
|
106
|
+
"height_ths",
|
107
|
+
"width_ths",
|
108
|
+
"y_ths",
|
109
|
+
"x_ths",
|
110
|
+
"add_margin",
|
111
|
+
"output_format",
|
92
112
|
]:
|
93
113
|
if hasattr(easy_options, param):
|
94
114
|
val = getattr(easy_options, param)
|
95
115
|
if val is not None:
|
96
116
|
readtext_args[param] = val
|
97
|
-
|
117
|
+
|
98
118
|
# Process differently based on detect_only flag
|
99
119
|
if detect_only:
|
100
120
|
# Returns tuple (horizontal_list, free_list)
|
101
121
|
# horizontal_list is a list containing one item: the list of boxes
|
102
122
|
# Each box is [[x1, y1], [x2, y1], [x2, y2], [x1, y2]]
|
103
|
-
bboxes_tuple = self._model.detect(
|
104
|
-
|
105
|
-
|
123
|
+
bboxes_tuple = self._model.detect(
|
124
|
+
image, **readtext_args
|
125
|
+
) # Pass args here too? Check EasyOCR docs if needed.
|
126
|
+
if (
|
127
|
+
bboxes_tuple
|
128
|
+
and isinstance(bboxes_tuple, tuple)
|
129
|
+
and len(bboxes_tuple) > 0
|
130
|
+
and isinstance(bboxes_tuple[0], list)
|
131
|
+
):
|
132
|
+
return bboxes_tuple[0] # Return the list of polygons directly
|
106
133
|
else:
|
107
134
|
self.logger.warning(f"EasyOCR detect returned unexpected format: {bboxes_tuple}")
|
108
|
-
return []
|
135
|
+
return [] # Return empty list on unexpected format
|
109
136
|
else:
|
110
137
|
return self._model.readtext(image, **readtext_args)
|
111
138
|
|
112
|
-
def _standardize_results(
|
139
|
+
def _standardize_results(
|
140
|
+
self, raw_results: Any, min_confidence: float, detect_only: bool
|
141
|
+
) -> List[TextRegion]:
|
113
142
|
"""Convert EasyOCR results to standardized TextRegion objects."""
|
114
143
|
standardized_regions = []
|
115
|
-
|
144
|
+
|
116
145
|
if detect_only:
|
117
146
|
# In detect_only mode, raw_results is already a list of bounding boxes
|
118
147
|
# Each bbox is in [x_min, x_max, y_min, y_max] format
|
@@ -120,13 +149,17 @@ class EasyOCREngine(OCREngine):
|
|
120
149
|
for detection in raw_results:
|
121
150
|
try:
|
122
151
|
if isinstance(detection, (list, tuple)) and len(detection) == 4:
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
152
|
+
x_min, x_max, y_min, y_max = detection
|
153
|
+
# Convert to standardized (x0, y0, x1, y1) format
|
154
|
+
try:
|
155
|
+
bbox = (float(x_min), float(y_min), float(x_max), float(y_max))
|
156
|
+
standardized_regions.append(
|
157
|
+
TextRegion(bbox, text=None, confidence=None)
|
158
|
+
)
|
159
|
+
except (ValueError, TypeError) as e:
|
160
|
+
raise ValueError(
|
161
|
+
f"Invalid number format in EasyOCR detect bbox: {detection}"
|
162
|
+
) from e
|
130
163
|
else:
|
131
164
|
raise ValueError(f"Invalid detection format from EasyOCR: {detection}")
|
132
165
|
except ValueError as e:
|
@@ -134,42 +167,50 @@ class EasyOCREngine(OCREngine):
|
|
134
167
|
raise e
|
135
168
|
except Exception as e:
|
136
169
|
# Catch other potential processing errors
|
137
|
-
raise ValueError(
|
170
|
+
raise ValueError(
|
171
|
+
f"Error processing EasyOCR detection item: {detection}"
|
172
|
+
) from e
|
138
173
|
else:
|
139
|
-
raise ValueError(
|
140
|
-
|
174
|
+
raise ValueError(
|
175
|
+
f"Expected list of bounding boxes in detect_only mode, got: {raw_results}"
|
176
|
+
)
|
177
|
+
|
141
178
|
return standardized_regions
|
142
|
-
|
179
|
+
|
143
180
|
# Full OCR mode (readtext results)
|
144
181
|
for detection in raw_results:
|
145
182
|
try:
|
146
183
|
# Detail mode (list/tuple result)
|
147
184
|
if isinstance(detection, (list, tuple)) and len(detection) >= 3:
|
148
|
-
bbox_raw = detection[0]
|
185
|
+
bbox_raw = detection[0] # This is usually a polygon [[x1,y1],...]
|
149
186
|
text = str(detection[1])
|
150
187
|
confidence = float(detection[2])
|
151
|
-
|
188
|
+
|
152
189
|
if confidence >= min_confidence:
|
153
190
|
try:
|
154
191
|
# Use the standard helper for polygons
|
155
192
|
bbox = self._standardize_bbox(bbox_raw)
|
156
193
|
standardized_regions.append(TextRegion(bbox, text, confidence))
|
157
194
|
except ValueError as e:
|
158
|
-
raise ValueError(
|
159
|
-
|
195
|
+
raise ValueError(
|
196
|
+
f"Could not standardize bounding box from EasyOCR readtext: {bbox_raw}"
|
197
|
+
) from e
|
198
|
+
|
160
199
|
# Simple mode (string result)
|
161
200
|
elif isinstance(detection, str):
|
162
201
|
if 0.0 >= min_confidence: # Always include if min_confidence is 0
|
163
202
|
standardized_regions.append(TextRegion((0, 0, 0, 0), detection, 1.0))
|
164
203
|
else:
|
165
204
|
# Handle unexpected format in OCR mode
|
166
|
-
raise ValueError(
|
167
|
-
|
205
|
+
raise ValueError(
|
206
|
+
f"Invalid OCR detection format from EasyOCR readtext: {detection}"
|
207
|
+
)
|
208
|
+
|
168
209
|
except ValueError as e:
|
169
210
|
# Re-raise any value errors from standardization or format checks
|
170
211
|
raise e
|
171
212
|
except Exception as e:
|
172
213
|
# Catch other potential processing errors
|
173
214
|
raise ValueError(f"Error processing EasyOCR detection item: {detection}") from e
|
174
|
-
|
215
|
+
|
175
216
|
return standardized_regions
|
natural_pdf/ocr/engine_paddle.py
CHANGED
@@ -27,40 +27,43 @@ class PaddleOCREngine(OCREngine):
|
|
27
27
|
paddleocr_installed = importlib.util.find_spec("paddleocr") is not None
|
28
28
|
return paddle_installed and paddleocr_installed
|
29
29
|
|
30
|
-
def _initialize_model(
|
30
|
+
def _initialize_model(
|
31
|
+
self, languages: List[str], device: str, options: Optional[BaseOCROptions]
|
32
|
+
):
|
31
33
|
"""Initialize the PaddleOCR model."""
|
32
34
|
try:
|
33
|
-
import paddleocr
|
35
|
+
import paddleocr
|
36
|
+
|
34
37
|
self.logger.info("PaddleOCR module imported successfully.")
|
35
38
|
except ImportError as e:
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
+
self.logger.error(f"Failed to import PaddleOCR/PaddlePaddle: {e}")
|
40
|
+
raise
|
41
|
+
|
39
42
|
# Cast to PaddleOCROptions if possible
|
40
43
|
paddle_options = options if isinstance(options, PaddleOCROptions) else PaddleOCROptions()
|
41
|
-
|
44
|
+
|
42
45
|
# Determine parameters
|
43
46
|
primary_lang = languages[0] if languages else "en"
|
44
47
|
use_gpu = "cuda" in str(device).lower()
|
45
|
-
|
48
|
+
|
46
49
|
# Create constructor arguments
|
47
50
|
constructor_args = {
|
48
51
|
"lang": primary_lang,
|
49
52
|
"use_gpu": use_gpu,
|
50
53
|
"use_angle_cls": paddle_options.use_angle_cls,
|
51
|
-
"det": True,
|
52
|
-
"rec": True # We'll control recognition at process time
|
54
|
+
"det": True,
|
55
|
+
"rec": True, # We'll control recognition at process time
|
53
56
|
}
|
54
|
-
|
57
|
+
|
55
58
|
# Add optional parameters if available
|
56
59
|
for param in ["det_model_dir", "rec_model_dir", "cls_model_dir", "show_log", "use_onnx"]:
|
57
60
|
if hasattr(paddle_options, param):
|
58
61
|
val = getattr(paddle_options, param)
|
59
62
|
if val is not None:
|
60
63
|
constructor_args[param] = val
|
61
|
-
|
64
|
+
|
62
65
|
self.logger.debug(f"PaddleOCR constructor args: {constructor_args}")
|
63
|
-
|
66
|
+
|
64
67
|
# Create the model
|
65
68
|
try:
|
66
69
|
self._model = paddleocr.PaddleOCR(**constructor_args)
|
@@ -78,31 +81,35 @@ class PaddleOCREngine(OCREngine):
|
|
78
81
|
img_array_bgr = img_array_rgb[:, :, ::-1] # Convert RGB to BGR
|
79
82
|
return img_array_bgr
|
80
83
|
|
81
|
-
def _process_single_image(
|
84
|
+
def _process_single_image(
|
85
|
+
self, image: np.ndarray, detect_only: bool, options: Optional[PaddleOCROptions]
|
86
|
+
) -> Any:
|
82
87
|
"""Process a single image with PaddleOCR."""
|
83
88
|
if self._model is None:
|
84
89
|
raise RuntimeError("PaddleOCR model not initialized")
|
85
|
-
|
90
|
+
|
86
91
|
# Prepare OCR arguments
|
87
92
|
ocr_args = {}
|
88
93
|
if options and isinstance(options, PaddleOCROptions):
|
89
94
|
ocr_args["cls"] = options.cls if options.cls is not None else options.use_angle_cls
|
90
95
|
ocr_args["det"] = options.det
|
91
96
|
ocr_args["rec"] = not detect_only # Control recognition based on detect_only flag
|
92
|
-
|
97
|
+
|
93
98
|
# Run OCR
|
94
99
|
raw_results = self._model.ocr(image, **ocr_args)
|
95
100
|
return raw_results
|
96
101
|
|
97
|
-
def _standardize_results(
|
102
|
+
def _standardize_results(
|
103
|
+
self, raw_results: Any, min_confidence: float, detect_only: bool
|
104
|
+
) -> List[TextRegion]:
|
98
105
|
"""Convert PaddleOCR results to standardized TextRegion objects."""
|
99
106
|
standardized_regions = []
|
100
|
-
|
107
|
+
|
101
108
|
if not raw_results or not isinstance(raw_results, list) or len(raw_results) == 0:
|
102
109
|
return standardized_regions
|
103
|
-
|
110
|
+
|
104
111
|
page_results = raw_results[0] if raw_results[0] is not None else []
|
105
|
-
|
112
|
+
|
106
113
|
for detection in page_results:
|
107
114
|
# Initialize text and confidence
|
108
115
|
text = None
|
@@ -111,20 +118,22 @@ class PaddleOCREngine(OCREngine):
|
|
111
118
|
|
112
119
|
# Paddle always seems to return the tuple structure [bbox, (text, conf)]
|
113
120
|
# even if rec=False. We need to parse this structure regardless.
|
114
|
-
if len(detection) == 4:
|
115
|
-
|
121
|
+
if len(detection) == 4: # Handle potential alternative format?
|
122
|
+
detection = [detection, ("", 1.0)] # Treat as bbox + dummy text/conf
|
116
123
|
|
117
124
|
if not isinstance(detection, (list, tuple)) or len(detection) < 2:
|
118
125
|
raise ValueError(f"Invalid detection format from PaddleOCR: {detection}")
|
119
|
-
|
126
|
+
|
120
127
|
bbox_raw = detection[0]
|
121
128
|
text_confidence = detection[1]
|
122
|
-
|
129
|
+
|
123
130
|
if not isinstance(text_confidence, tuple) or len(text_confidence) < 2:
|
124
|
-
# Even if detect_only, we expect the (text, conf) structure,
|
131
|
+
# Even if detect_only, we expect the (text, conf) structure,
|
125
132
|
# it might just contain dummy values.
|
126
|
-
raise ValueError(
|
127
|
-
|
133
|
+
raise ValueError(
|
134
|
+
f"Invalid text/confidence structure from PaddleOCR: {text_confidence}"
|
135
|
+
)
|
136
|
+
|
128
137
|
# Extract text/conf only if not detect_only
|
129
138
|
if not detect_only:
|
130
139
|
text = str(text_confidence[0])
|
@@ -134,7 +143,9 @@ class PaddleOCREngine(OCREngine):
|
|
134
143
|
try:
|
135
144
|
bbox = self._standardize_bbox(bbox_raw)
|
136
145
|
except ValueError as e:
|
137
|
-
raise ValueError(
|
146
|
+
raise ValueError(
|
147
|
+
f"Could not standardize bounding box from PaddleOCR: {bbox_raw}"
|
148
|
+
) from e
|
138
149
|
|
139
150
|
# Append based on mode
|
140
151
|
if detect_only:
|
@@ -143,5 +154,5 @@ class PaddleOCREngine(OCREngine):
|
|
143
154
|
elif confidence >= min_confidence:
|
144
155
|
# Only append if confidence meets threshold in full OCR mode
|
145
156
|
standardized_regions.append(TextRegion(bbox, text, confidence))
|
146
|
-
|
157
|
+
|
147
158
|
return standardized_regions
|
natural_pdf/ocr/engine_surya.py
CHANGED
@@ -20,14 +20,16 @@ class SuryaOCREngine(OCREngine):
|
|
20
20
|
self._surya_recognition = None
|
21
21
|
self._surya_detection = None
|
22
22
|
|
23
|
-
def _initialize_model(
|
23
|
+
def _initialize_model(
|
24
|
+
self, languages: List[str], device: str, options: Optional[BaseOCROptions]
|
25
|
+
):
|
24
26
|
"""Initialize Surya predictors."""
|
25
27
|
if not self.is_available():
|
26
28
|
raise ImportError("Surya OCR library is not installed or available.")
|
27
29
|
|
28
30
|
# Store languages for use in _process_single_image
|
29
31
|
self._langs = languages
|
30
|
-
|
32
|
+
|
31
33
|
from surya.detection import DetectionPredictor
|
32
34
|
from surya.recognition import RecognitionPredictor
|
33
35
|
|
@@ -41,21 +43,27 @@ class SuryaOCREngine(OCREngine):
|
|
41
43
|
self._detection_predictor = self._surya_detection(**predictor_args)
|
42
44
|
self.logger.info("Instantiating Surya RecognitionPredictor...")
|
43
45
|
self._recognition_predictor = self._surya_recognition(**predictor_args)
|
44
|
-
|
46
|
+
|
45
47
|
self.logger.info("Surya predictors initialized.")
|
46
48
|
|
47
49
|
def _preprocess_image(self, image: Image.Image) -> Image.Image:
|
48
50
|
"""Surya uses PIL images directly, so just return the image."""
|
49
51
|
return image
|
50
52
|
|
51
|
-
def _process_single_image(
|
53
|
+
def _process_single_image(
|
54
|
+
self, image: Image.Image, detect_only: bool, options: Optional[SuryaOCROptions]
|
55
|
+
) -> Any:
|
52
56
|
"""Process a single image with Surya OCR."""
|
53
57
|
if not self._recognition_predictor or not self._detection_predictor:
|
54
58
|
raise RuntimeError("Surya predictors are not initialized.")
|
55
59
|
|
56
60
|
# Store languages instance variable during initialization to use here
|
57
|
-
langs =
|
58
|
-
|
61
|
+
langs = (
|
62
|
+
[[lang] for lang in self._langs]
|
63
|
+
if hasattr(self, "_langs")
|
64
|
+
else [[self.DEFAULT_LANGUAGES[0]]]
|
65
|
+
)
|
66
|
+
|
59
67
|
# Surya expects lists of images, so we need to wrap our single image
|
60
68
|
if detect_only:
|
61
69
|
results = self._detection_predictor(images=[image])
|
@@ -63,33 +71,41 @@ class SuryaOCREngine(OCREngine):
|
|
63
71
|
results = self._recognition_predictor(
|
64
72
|
images=[image],
|
65
73
|
langs=langs, # Use the languages set during initialization
|
66
|
-
det_predictor=self._detection_predictor
|
74
|
+
det_predictor=self._detection_predictor,
|
67
75
|
)
|
68
|
-
|
76
|
+
|
69
77
|
# Surya may return a list with one result per image or a single result object
|
70
78
|
# Return the result as-is and handle the extraction in _standardize_results
|
71
79
|
return results
|
72
80
|
|
73
|
-
def _standardize_results(
|
81
|
+
def _standardize_results(
|
82
|
+
self, raw_results: Any, min_confidence: float, detect_only: bool
|
83
|
+
) -> List[TextRegion]:
|
74
84
|
"""Convert Surya results to standardized TextRegion objects."""
|
75
85
|
standardized_regions = []
|
76
|
-
|
86
|
+
|
77
87
|
raw_result = raw_results
|
78
88
|
if isinstance(raw_results, list) and len(raw_results) > 0:
|
79
89
|
raw_result = raw_results[0]
|
80
|
-
|
81
|
-
results =
|
90
|
+
|
91
|
+
results = (
|
92
|
+
raw_result.text_lines
|
93
|
+
if hasattr(raw_result, "text_lines") and not detect_only
|
94
|
+
else raw_result.bboxes
|
95
|
+
)
|
82
96
|
|
83
97
|
for line in results:
|
84
98
|
# Always extract bbox first
|
85
99
|
try:
|
86
100
|
# Prioritize line.bbox, fallback to line.polygon
|
87
|
-
bbox_raw = line.bbox if hasattr(line,
|
101
|
+
bbox_raw = line.bbox if hasattr(line, "bbox") else getattr(line, "polygon", None)
|
88
102
|
if bbox_raw is None:
|
89
|
-
|
103
|
+
raise ValueError("Missing bbox/polygon data")
|
90
104
|
bbox = self._standardize_bbox(bbox_raw)
|
91
105
|
except ValueError as e:
|
92
|
-
raise ValueError(
|
106
|
+
raise ValueError(
|
107
|
+
f"Could not standardize bounding box from Surya result: {bbox_raw}"
|
108
|
+
) from e
|
93
109
|
|
94
110
|
if detect_only:
|
95
111
|
# For detect_only, text and confidence are None
|
@@ -100,7 +116,7 @@ class SuryaOCREngine(OCREngine):
|
|
100
116
|
confidence = line.confidence
|
101
117
|
if confidence >= min_confidence:
|
102
118
|
standardized_regions.append(TextRegion(bbox, text, confidence))
|
103
|
-
|
119
|
+
|
104
120
|
return standardized_regions
|
105
121
|
|
106
122
|
def is_available(self) -> bool:
|