natural-pdf 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +55 -0
- natural_pdf/analyzers/__init__.py +6 -0
- natural_pdf/analyzers/layout/__init__.py +1 -0
- natural_pdf/analyzers/layout/base.py +151 -0
- natural_pdf/analyzers/layout/docling.py +247 -0
- natural_pdf/analyzers/layout/layout_analyzer.py +166 -0
- natural_pdf/analyzers/layout/layout_manager.py +200 -0
- natural_pdf/analyzers/layout/layout_options.py +78 -0
- natural_pdf/analyzers/layout/paddle.py +240 -0
- natural_pdf/analyzers/layout/surya.py +151 -0
- natural_pdf/analyzers/layout/tatr.py +251 -0
- natural_pdf/analyzers/layout/yolo.py +165 -0
- natural_pdf/analyzers/text_options.py +60 -0
- natural_pdf/analyzers/text_structure.py +270 -0
- natural_pdf/analyzers/utils.py +57 -0
- natural_pdf/core/__init__.py +3 -0
- natural_pdf/core/element_manager.py +457 -0
- natural_pdf/core/highlighting_service.py +698 -0
- natural_pdf/core/page.py +1444 -0
- natural_pdf/core/pdf.py +653 -0
- natural_pdf/elements/__init__.py +3 -0
- natural_pdf/elements/base.py +761 -0
- natural_pdf/elements/collections.py +1345 -0
- natural_pdf/elements/line.py +140 -0
- natural_pdf/elements/rect.py +122 -0
- natural_pdf/elements/region.py +1793 -0
- natural_pdf/elements/text.py +304 -0
- natural_pdf/ocr/__init__.py +56 -0
- natural_pdf/ocr/engine.py +104 -0
- natural_pdf/ocr/engine_easyocr.py +179 -0
- natural_pdf/ocr/engine_paddle.py +204 -0
- natural_pdf/ocr/engine_surya.py +171 -0
- natural_pdf/ocr/ocr_manager.py +191 -0
- natural_pdf/ocr/ocr_options.py +114 -0
- natural_pdf/qa/__init__.py +3 -0
- natural_pdf/qa/document_qa.py +396 -0
- natural_pdf/selectors/__init__.py +4 -0
- natural_pdf/selectors/parser.py +354 -0
- natural_pdf/templates/__init__.py +1 -0
- natural_pdf/templates/ocr_debug.html +517 -0
- natural_pdf/utils/__init__.py +3 -0
- natural_pdf/utils/highlighting.py +12 -0
- natural_pdf/utils/reading_order.py +227 -0
- natural_pdf/utils/visualization.py +223 -0
- natural_pdf/widgets/__init__.py +4 -0
- natural_pdf/widgets/frontend/viewer.js +88 -0
- natural_pdf/widgets/viewer.py +765 -0
- natural_pdf-0.1.0.dist-info/METADATA +295 -0
- natural_pdf-0.1.0.dist-info/RECORD +52 -0
- natural_pdf-0.1.0.dist-info/WHEEL +5 -0
- natural_pdf-0.1.0.dist-info/licenses/LICENSE +21 -0
- natural_pdf-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,304 @@
|
|
1
|
+
"""
|
2
|
+
Text element classes for natural-pdf.
|
3
|
+
"""
|
4
|
+
from typing import Dict, Any, Optional, TYPE_CHECKING
|
5
|
+
|
6
|
+
from natural_pdf.elements.base import Element
|
7
|
+
|
8
|
+
if TYPE_CHECKING:
|
9
|
+
from natural_pdf.core.page import Page
|
10
|
+
|
11
|
+
|
12
|
+
class TextElement(Element):
    """
    Represents a text element in a PDF.

    This class is a wrapper around pdfplumber's character objects,
    providing additional functionality for text extraction and analysis.
    Every property below reads from the wrapped dictionary ``self._obj``
    (presumably stored by ``Element.__init__`` -- confirm in elements/base.py).
    """

    def __init__(self, obj: Dict[str, Any], page: 'Page'):
        """
        Initialize a text element.

        Args:
            obj: The underlying pdfplumber object. For OCR text elements,
                 should include 'text', 'bbox', 'source', and 'confidence'.
                 The dictionary is modified in place: an 'object_type' key
                 is added when it is missing.
            page: The parent Page object
        """
        # Add object_type if not present
        if 'object_type' not in obj:
            obj['object_type'] = 'text'

        super().__init__(obj, page)

    @property
    def text(self) -> str:
        """Get the text content ('' when the object has no 'text' key)."""
        return self._obj.get('text', '')

    @property
    def source(self) -> str:
        """Get the source of this text element (defaults to 'pdf'; OCR results use 'ocr')."""
        return self._obj.get('source', 'pdf')

    @property
    def confidence(self) -> float:
        """Get the confidence score for OCR text elements (1.0 when absent)."""
        return self._obj.get('confidence', 1.0)

    @property
    def fontname(self) -> str:
        """
        Get the font name.

        'real_fontname' wins when present; otherwise 'fontname' is used,
        falling back to 'font', and finally to an empty string.
        """
        # First check if we have a real fontname from PDF resources
        if 'real_fontname' in self._obj:
            return self._obj['real_fontname']
        # Otherwise use standard fontname
        return self._obj.get('fontname', '') or self._obj.get('font', '')

    @property
    def font_family(self) -> str:
        """
        Get a cleaner font family name by stripping PDF-specific prefixes.

        PDF font names often include prefixes like 'ABCDEF+' followed by the font name
        or unique identifiers. This method attempts to extract a more readable font name.

        Returns:
            The first entry of a small list of well-known families that occurs
            (case-insensitively) in the font name, otherwise the font name
            with any 'PREFIX+' removed.
        """
        font = self.fontname

        # Remove common PDF font prefixes (e.g., 'ABCDEF+')
        if '+' in font:
            font = font.split('+', 1)[1]

        # Try to extract common font family names
        common_fonts = [
            'Arial', 'Helvetica', 'Times', 'Courier', 'Calibri',
            'Cambria', 'Georgia', 'Verdana', 'Tahoma', 'Trebuchet'
        ]

        lowered = font.lower()
        for common in common_fonts:
            if common.lower() in lowered:
                return common

        return font

    @property
    def font_variant(self) -> str:
        """
        Get the font variant identifier (prefix before the '+' in PDF font names).

        PDF embeds font subsets with unique identifiers like 'AAAAAB+FontName'.
        Different variants of the same base font will have different prefixes.
        This can be used to differentiate text that looks different despite
        having the same font name and size.

        Returns:
            The font variant prefix, or empty string if no variant is present
        """
        font = self.fontname

        # Extract the prefix before '+' if it exists
        if '+' in font:
            return font.split('+', 1)[0]

        return ""

    @property
    def size(self) -> float:
        """Get the font size (0 when the object has no 'size' key)."""
        return self._obj.get('size', 0)

    @property
    def color(self) -> tuple:
        """
        Get the text color as an RGB tuple.

        The value of 'non_stroking_color' is interpreted by its shape:

        * a bare number                  -> grayscale (value repeated three times)
        * a sequence with 1 number       -> grayscale (value repeated three times)
        * a sequence with 3 components   -> RGB, returned as a tuple
        * a sequence with 4 components   -> CMYK, converted to approximate RGB

        Both tuples and lists are accepted. Anything else (including ``None``)
        yields black ``(0, 0, 0)``.
        """
        # PDFs often use non-RGB values, so we handle different formats
        # In pdfplumber, colors can be in various formats depending on the PDF
        color = self._obj.get('non_stroking_color', (0, 0, 0))

        # If it's a single value, treat as grayscale
        if isinstance(color, (int, float)):
            return (color, color, color)

        if isinstance(color, (list, tuple)):
            # One-component sequences such as (0.5,) are grayscale too;
            # without this branch they would be reported as black.
            if len(color) == 1 and isinstance(color[0], (int, float)):
                gray = color[0]
                return (gray, gray, gray)

            # Three components: already RGB
            if len(color) == 3:
                return tuple(color)

            # Four components: treat as CMYK and convert to approximate RGB
            if len(color) == 4:
                c, m, y, k = color
                r = 1 - min(1, c + k)
                g = 1 - min(1, m + k)
                b = 1 - min(1, y + k)
                return (r, g, b)

        # Default to black
        return (0, 0, 0)

    def extract_text(self, keep_blank_chars=True, **kwargs) -> str:
        """
        Extract text from this element.

        Args:
            keep_blank_chars: Whether to keep blank characters (default: True)
            **kwargs: Additional extraction parameters

        Returns:
            Text content
        """
        # For text elements, keep_blank_chars doesn't affect anything as we're
        # simply returning the text property. Included for API consistency.
        return self.text

    def contains(self, substring: str, case_sensitive: bool = True) -> bool:
        """
        Check if this text element contains a substring.

        Args:
            substring: The substring to check for
            case_sensitive: Whether the check is case-sensitive

        Returns:
            True if the text contains the substring
        """
        if case_sensitive:
            return substring in self.text
        return substring.lower() in self.text.lower()

    def matches(self, pattern: str) -> bool:
        """
        Check if this text element matches a regular expression pattern.

        The pattern is applied with ``re.search``, so it may match anywhere
        in the text rather than only at the start.

        Args:
            pattern: Regular expression pattern

        Returns:
            True if the text matches the pattern
        """
        import re
        return bool(re.search(pattern, self.text))

    @property
    def bold(self) -> bool:
        """
        Check if the text is bold based on multiple indicators in the PDF.

        The checks are tried in this order and the first hit wins:
        1. Font name containing 'bold' or 'black', or ending in '-B'
        2. 'flags' value with bit value 4 set
        3. 'stemv'/'StemV' greater than 120 (thickness of vertical stems)
        4. 'weight'/'FontWeight' of 700 or more
        5. 'render_mode' equal to 2 (fill and stroke)
        6. 'stroke_width'/'lineWidth' greater than 0
        """
        # Check font name (original method)
        fontname = self.fontname.lower()
        if 'bold' in fontname or 'black' in fontname or self.fontname.endswith('-B'):
            return True

        # Check font descriptor flags if available (bit 2 = bold)
        # NOTE(review): confirm which flag layout 'flags' follows; the meaning
        # of bit value 4 depends on where this value comes from.
        flags = self._obj.get('flags')
        if flags is not None and (flags & 4) != 0:  # Check if bit 2 is set
            return True

        # Check StemV (vertical stem width) if available
        # Higher StemV values indicate bolder fonts
        stemv = self._obj.get('stemv') or self._obj.get('StemV')
        if stemv is not None and isinstance(stemv, (int, float)) and stemv > 120:
            return True

        # Check font weight if available (700+ is typically bold)
        weight = self._obj.get('weight') or self._obj.get('FontWeight')
        if weight is not None and isinstance(weight, (int, float)) and weight >= 700:
            return True

        # Check text rendering mode (mode 2 = fill and stroke, can make text appear bold)
        render_mode = self._obj.get('render_mode')
        if render_mode is not None and render_mode == 2:
            return True

        # Additional check: if we have text with the same font but different paths/strokes
        # Path widths or stroke widths can indicate boldness
        stroke_width = self._obj.get('stroke_width') or self._obj.get('lineWidth')
        if stroke_width is not None and isinstance(stroke_width, (int, float)) and stroke_width > 0:
            return True

        return False

    @property
    def italic(self) -> bool:
        """
        Check if the text is italic based on multiple indicators in the PDF.

        The checks are tried in this order and the first hit wins:
        1. Font name containing 'italic' or 'oblique', or ending in '-I'
        2. 'flags' value with bit value 64 set
        3. Non-zero 'italic_angle'/'ItalicAngle'
        """
        # Check font name (original method)
        fontname = self.fontname.lower()
        if 'italic' in fontname or 'oblique' in fontname or self.fontname.endswith('-I'):
            return True

        # Check font descriptor flags if available (bit 6 = italic)
        flags = self._obj.get('flags')
        if flags is not None and (flags & 64) != 0:  # Check if bit 6 is set
            return True

        # Check italic angle if available
        # Non-zero italic angle indicates italic font
        italic_angle = self._obj.get('italic_angle') or self._obj.get('ItalicAngle')
        if italic_angle is not None and isinstance(italic_angle, (int, float)) and italic_angle != 0:
            return True

        return False

    def __repr__(self) -> str:
        """String representation of the text element (text is truncated to 10 characters)."""
        preview = self.text[:10] + '...' if len(self.text) > 10 else self.text
        font_style = []
        if self.bold:
            font_style.append("bold")
        if self.italic:
            font_style.append("italic")
        style_str = f", style={font_style}" if font_style else ""

        # Use font_family for display but include raw fontname and variant
        font_display = self.font_family
        variant = self.font_variant
        variant_str = f", variant='{variant}'" if variant else ""

        # When the family was simplified from a subset name, also show the
        # un-prefixed original so the two can be told apart.
        if font_display != self.fontname and '+' in self.fontname:
            base_font = self.fontname.split('+', 1)[1]
            font_display = f"{font_display} ({base_font})"

        return f"<TextElement text='{preview}' font='{font_display}'{variant_str} size={self.size}{style_str} bbox={self.bbox}>"

    def font_info(self) -> dict:
        """
        Get detailed font information for this text element.

        Returns a dictionary with all available font-related properties,
        useful for debugging font detection issues. Raw values found on the
        underlying object are included under keys prefixed with 'raw_'.
        """
        info = {
            'text': self.text,
            'fontname': self.fontname,
            'font_family': self.font_family,
            'font_variant': self.font_variant,
            'size': self.size,
            'bold': self.bold,
            'italic': self.italic,
            'color': self.color
        }

        # Include raw font properties from the PDF
        font_props = [
            'flags', 'stemv', 'StemV', 'weight', 'FontWeight',
            'render_mode', 'stroke_width', 'lineWidth'
        ]

        for prop in font_props:
            if prop in self._obj:
                info[f"raw_{prop}"] = self._obj[prop]

        return info
|
@@ -0,0 +1,56 @@
|
|
1
|
+
"""
|
2
|
+
OCR engines for natural-pdf.
|
3
|
+
|
4
|
+
This module provides different OCR engines that can be used with natural-pdf.
|
5
|
+
"""
|
6
|
+
import logging
|
7
|
+
|
8
|
+
# Set up module logger
logger = logging.getLogger("natural_pdf.ocr")
from .ocr_manager import OCRManager
from .engine import OCREngine
from .ocr_options import OCROptions
# EasyOCREngine is exported in __all__ and used by get_engine(), so it has to
# be imported here; engine_easyocr only loads the easyocr package lazily.
from .engine_easyocr import EasyOCREngine
from .engine_paddle import PaddleOCREngine
from .engine_surya import SuryaOCREngine

__all__ = ['OCRManager', 'OCREngine', 'OCROptions', 'EasyOCREngine', 'PaddleOCREngine', 'SuryaOCREngine']

# Engine class instantiated by get_engine() when no engine name is given
DEFAULT_ENGINE = SuryaOCREngine
|
20
|
+
|
21
|
+
def get_engine(engine_name=None, **kwargs):
    """
    Get OCR engine by name.

    Args:
        engine_name: Name of the engine to use: 'easyocr', 'paddleocr' or
                    'surya' (matched case-insensitively).
                    If None or 'default', DEFAULT_ENGINE is used.
        **kwargs: Additional arguments to pass to the engine constructor

    Returns:
        OCREngine instance

    Raises:
        ImportError: If the PaddleOCR engine is requested but cannot be imported.
        ValueError: If engine_name does not name a known engine.
    """
    logger.debug(f"Initializing OCR engine: {engine_name or 'default'}")

    if engine_name is None or engine_name == 'default':
        engine = DEFAULT_ENGINE(**kwargs)
        logger.info(f"Using default OCR engine: {engine.__class__.__name__}")
        return engine

    requested = engine_name.lower()

    if requested == 'easyocr':
        # Imported here so this branch works regardless of what the module
        # header imports (same approach as the PaddleOCR branch below).
        from .engine_easyocr import EasyOCREngine
        logger.info("Initializing EasyOCR engine")
        return EasyOCREngine(**kwargs)

    if requested == 'paddleocr':
        try:
            from .engine_paddle import PaddleOCREngine
            logger.info("Initializing PaddleOCR engine")
            return PaddleOCREngine(**kwargs)
        except ImportError as err:
            logger.error("PaddleOCR is not installed")
            raise ImportError(
                "PaddleOCR is not installed. Please install it with: pip install paddlepaddle paddleocr"
            ) from err

    if requested == 'surya':
        # The default engine can also be requested explicitly by name.
        logger.info("Initializing Surya OCR engine")
        return SuryaOCREngine(**kwargs)

    logger.error(f"Unknown OCR engine: {engine_name}")
    raise ValueError(f"Unknown OCR engine: {engine_name}")
|
@@ -0,0 +1,104 @@
|
|
1
|
+
# engine.py
|
2
|
+
import logging
|
3
|
+
from abc import ABC, abstractmethod
|
4
|
+
from typing import Dict, List, Any, Optional, Tuple, Union
|
5
|
+
from PIL import Image
|
6
|
+
|
7
|
+
# Assuming ocr_options defines BaseOCROptions
|
8
|
+
from .ocr_options import BaseOCROptions
|
9
|
+
|
10
|
+
logger = logging.getLogger(__name__)
|
11
|
+
|
12
|
+
class OCREngine(ABC):
    """
    Abstract Base Class for OCR engines.

    Concrete engines implement ``process_image`` and ``is_available``. The
    base class supplies a per-instance logger, a reader/model cache
    (``self._reader_cache``) and helpers for cache keys and bounding boxes.
    """

    def __init__(self):
        """Initializes the base OCR engine."""
        self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")
        self.logger.info(f"Initializing {self.__class__.__name__}")
        self._reader_cache = {}  # Cache for initialized models/readers

    @abstractmethod
    def process_image(
        self,
        images: Union[Image.Image, List[Image.Image]],  # Accept single or list
        options: BaseOCROptions
    ) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]:  # Return single or list of lists
        """
        Processes a single image or a batch of images using the specific engine and options.

        Args:
            images: A single PIL Image or a list of PIL Images.
            options: An instance of a dataclass inheriting from BaseOCROptions
                     containing configuration for this run.

        Returns:
            If input is a single image: List of result dictionaries.
            If input is a list of images: List of lists of result dictionaries,
                                          corresponding to each input image.
            An empty list indicates failure for that image.
        """
        raise NotImplementedError("Subclasses must implement this method")

    @abstractmethod
    def is_available(self) -> bool:
        """
        Check if the engine's dependencies are installed and usable.

        Returns:
            True if the engine is available, False otherwise.
        """
        raise NotImplementedError("Subclasses must implement this method")

    def _get_cache_key(self, options: BaseOCROptions) -> str:
        """
        Generates a cache key based on relevant options.
        Subclasses should override if more specific key generation is needed.

        Args:
            options: The options dataclass instance.

        Returns:
            A string cache key of the form '<ClassName>_<languages>_<device>'.
        """
        # Basic key includes languages and device; languages are sorted so
        # that their order in the options does not produce different keys.
        lang_key = "-".join(sorted(options.languages))
        device_key = str(options.device).lower()
        return f"{self.__class__.__name__}_{lang_key}_{device_key}"

    def _standardize_bbox(self, bbox: Any) -> Optional[Tuple[float, float, float, float]]:
        """
        Helper to standardize bounding boxes to (x0, y0, x1, y1) format.

        Args:
            bbox: The bounding box in the engine's native format.
                  Expected formats:
                  - List/Tuple of 4 numbers: (x0, y0, x1, y1)
                  - List of points: [[x1,y1],[x2,y2],[x3,y3],[x4,y4]] (polygon)

        Returns:
            Tuple[float, float, float, float], or None if the format is not
            recognized or conversion fails (failures are logged as warnings).
        """
        try:
            if isinstance(bbox, (list, tuple)) and len(bbox) == 4 and all(isinstance(n, (int, float)) for n in bbox):
                # Already in (x0, y0, x1, y1) format (or similar)
                return tuple(float(c) for c in bbox[:4])
            elif isinstance(bbox, (list, tuple)) and len(bbox) > 0 and isinstance(bbox[0], (list, tuple)):
                # Polygon format [[x1,y1],[x2,y2],...]: use its axis-aligned extent
                x_coords = [float(point[0]) for point in bbox]
                y_coords = [float(point[1]) for point in bbox]
                x0 = min(x_coords)
                y0 = min(y_coords)
                x1 = max(x_coords)
                y1 = max(y_coords)
                return (x0, y0, x1, y1)
        except Exception as e:
            self.logger.warning(f"Could not standardize bounding box: {bbox}. Error: {e}")
        # Unrecognized format or failed conversion
        return None

    def __del__(self):
        """Cleanup resources when the engine is deleted."""
        # __del__ also runs for instances whose __init__ never executed or did
        # not finish (e.g. a constructor call rejected because of unexpected
        # arguments), so neither attribute can be assumed to exist.
        engine_logger = getattr(self, "logger", None)
        if engine_logger is not None:
            engine_logger.info(f"Cleaning up {self.__class__.__name__} resources.")
        # Clear reader cache to free up memory/GPU resources
        reader_cache = getattr(self, "_reader_cache", None)
        if reader_cache is not None:
            reader_cache.clear()
|
104
|
+
|
@@ -0,0 +1,179 @@
|
|
1
|
+
# engine_easyocr.py
|
2
|
+
import logging
|
3
|
+
import importlib.util
|
4
|
+
from typing import Dict, List, Any, Optional, Tuple, Union
|
5
|
+
import numpy as np
|
6
|
+
from PIL import Image
|
7
|
+
import inspect # Used for dynamic parameter passing
|
8
|
+
|
9
|
+
from .engine import OCREngine
|
10
|
+
from .ocr_options import EasyOCROptions, BaseOCROptions
|
11
|
+
|
12
|
+
logger = logging.getLogger(__name__)
|
13
|
+
|
14
|
+
class EasyOCREngine(OCREngine):
    """OCR engine backed by the ``easyocr`` package, which is imported on first use."""

    def __init__(self):
        super().__init__()
        # Holds the easyocr module once it has been imported
        self._easyocr = None

    def _lazy_import_easyocr(self):
        """Return the easyocr module, importing it the first time it is needed.

        Raises:
            ImportError: If easyocr is not installed or fails to import.
        """
        if self._easyocr is not None:
            return self._easyocr

        if not self.is_available():
            raise ImportError("EasyOCR is not installed or available.")

        try:
            import easyocr
            self._easyocr = easyocr
            logger.info("EasyOCR module imported successfully.")
        except ImportError as e:
            logger.error(f"Failed to import EasyOCR: {e}")
            raise
        return self._easyocr

    def is_available(self) -> bool:
        """Report whether an importable ``easyocr`` module can be found."""
        spec = importlib.util.find_spec("easyocr")
        return spec is not None

    def _get_cache_key(self, options: EasyOCROptions) -> str:
        """Extend the base cache key with the recognition/detection networks and quantize flag."""
        prefix = super()._get_cache_key(options)
        recog = options.recog_network
        detect = options.detect_network
        quantize = str(options.quantize)
        return f"{prefix}_{recog}_{detect}_{quantize}"

    def _get_reader(self, options: EasyOCROptions):
        """Return the EasyOCR reader for these options, creating and caching it if needed.

        Constructor arguments are gathered by walking the parameters of
        ``easyocr.Reader.__init__``: a value is taken from an attribute of
        ``options`` with the same name, otherwise from ``options.extra_args``.
        """
        key = self._get_cache_key(options)
        try:
            cached_reader = self._reader_cache[key]
        except KeyError:
            pass
        else:
            logger.debug(f"Using cached EasyOCR reader for key: {key}")
            return cached_reader

        logger.info(f"Creating new EasyOCR reader for key: {key}")
        easyocr = self._lazy_import_easyocr()

        signature = inspect.signature(easyocr.Reader.__init__)
        reader_kwargs = {}
        reader_kwargs['lang_list'] = options.languages
        # GPU is requested for any device string mentioning cuda or mps
        reader_kwargs['gpu'] = (
            'cuda' in str(options.device).lower()
            or 'mps' in str(options.device).lower()
        )

        handled_explicitly = ['self', 'lang_list', 'gpu']
        for param_name in signature.parameters:
            if param_name in handled_explicitly:
                continue
            if hasattr(options, param_name):
                reader_kwargs[param_name] = getattr(options, param_name)
            elif param_name in options.extra_args:
                reader_kwargs[param_name] = options.extra_args[param_name]

        logger.debug(f"EasyOCR Reader constructor args: {reader_kwargs}")
        try:
            new_reader = easyocr.Reader(**reader_kwargs)
            self._reader_cache[key] = new_reader
            logger.info("EasyOCR reader created successfully.")
            return new_reader
        except Exception as e:
            logger.error(f"Failed to create EasyOCR reader: {e}", exc_info=True)
            raise

    def _prepare_readtext_args(self, options: EasyOCROptions, reader) -> Dict[str, Any]:
        """Collect keyword arguments for ``reader.readtext`` from the options.

        Every parameter of ``readtext`` except 'image' is filled from an
        attribute of ``options`` with the same name, or else from
        ``options.extra_args``; parameters found in neither are left out.
        """
        call_kwargs = {}
        for param_name in inspect.signature(reader.readtext).parameters:
            if param_name == 'image':
                continue
            if hasattr(options, param_name):
                call_kwargs[param_name] = getattr(options, param_name)
            elif param_name in options.extra_args:
                call_kwargs[param_name] = options.extra_args[param_name]
        logger.debug(f"EasyOCR readtext args: {call_kwargs}")
        return call_kwargs

    def _standardize_results(self, raw_results: List[Any], options: EasyOCROptions) -> List[Dict[str, Any]]:
        """Convert raw ``readtext`` output into dictionaries with 'bbox', 'text', 'confidence', 'source'.

        With ``options.detail == 1`` each detection is read as
        (bbox, text, confidence) and dropped when below
        ``options.min_confidence`` or when its bbox cannot be standardized.
        With ``options.detail == 0`` plain strings are kept with no bbox and
        a confidence of 1.0.
        """
        collected = []
        threshold = options.min_confidence

        for item in raw_results:
            try:
                is_detailed = (
                    options.detail == 1
                    and isinstance(item, (list, tuple))
                    and len(item) >= 3
                )
                if is_detailed:
                    raw_box = item[0]
                    text = str(item[1])
                    score = float(item[2])

                    # Written as a negated '>=' so the comparison is exactly
                    # the inclusion test itself
                    if not (score >= threshold):
                        continue

                    box = self._standardize_bbox(raw_box)
                    if box:
                        collected.append({
                            'bbox': box, 'text': text, 'confidence': score, 'source': 'ocr'
                        })
                    else:
                        logger.warning(f"Skipping result due to invalid bbox: {raw_box}")

                elif options.detail == 0 and isinstance(item, str):
                    collected.append({
                        'bbox': None, 'text': item, 'confidence': 1.0, 'source': 'ocr'
                    })
            except (IndexError, ValueError, TypeError) as e:
                logger.warning(f"Skipping invalid detection format: {item}. Error: {e}")
                continue
        return collected

    def process_image(
        self,
        images: Union[Image.Image, List[Image.Image]],
        options: BaseOCROptions
    ) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]:
        """Run EasyOCR on one image or on each image of a list.

        Args:
            images: A single PIL Image or a list of PIL Images.
            options: OCR options. If this is not an EasyOCROptions instance,
                     a new EasyOCROptions is built from its languages,
                     min_confidence, device and extra_args.

        Returns:
            For a single image, a list of result dictionaries (empty on
            failure). For a list, one such list per input, in order; entries
            that are not PIL Images or that fail yield an empty list.

        Raises:
            TypeError: If ``images`` is neither a PIL Image nor a list.
        """
        if not isinstance(options, EasyOCROptions):
            logger.warning("Received BaseOCROptions, expected EasyOCROptions. Using defaults.")
            # Carry the shared base settings over into engine-specific options
            options = EasyOCROptions(
                languages=options.languages,
                min_confidence=options.min_confidence,
                device=options.device,
                extra_args=options.extra_args
            )

        reader = self._get_reader(options)
        call_kwargs = self._prepare_readtext_args(options, reader)

        if isinstance(images, list):
            # EasyOCR is driven one image at a time; a failure only affects
            # the entry of the image that caused it.
            batch_output = []
            total = len(images)
            logger.info(f"Processing batch of {total} images with EasyOCR (iteratively)...")
            for index, picture in enumerate(images):
                if not isinstance(picture, Image.Image):
                    logger.warning(f"Item at index {index} in batch is not a PIL Image. Skipping.")
                    batch_output.append([])
                    continue
                pixels = np.array(picture)
                try:
                    logger.debug(f"Processing image {index+1}/{len(images)} in batch.")
                    detections = reader.readtext(pixels, **call_kwargs)
                    batch_output.append(self._standardize_results(detections, options))
                except Exception as e:
                    logger.error(f"Error processing image {index+1} in EasyOCR batch: {e}", exc_info=True)
                    batch_output.append([])
            logger.info("Finished processing batch with EasyOCR.")
            return batch_output

        if isinstance(images, Image.Image):
            logger.info("Processing single image with EasyOCR...")
            pixels = np.array(images)
            try:
                detections = reader.readtext(pixels, **call_kwargs)
                results = self._standardize_results(detections, options)
                logger.info(f"Finished processing single image. Found {len(results)} results.")
                return results
            except Exception as e:
                logger.error(f"Error processing single image with EasyOCR: {e}", exc_info=True)
                return []

        raise TypeError("Input 'images' must be a PIL Image or a list of PIL Images.")
|
179
|
+
|