natural-pdf 25.3.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- examples/__init__.py +3 -0
- examples/another_exclusion_example.py +20 -0
- examples/basic_usage.py +190 -0
- examples/boundary_exclusion_test.py +137 -0
- examples/boundary_inclusion_fix_test.py +157 -0
- examples/chainable_layout_example.py +70 -0
- examples/color_basic_test.py +49 -0
- examples/color_name_example.py +71 -0
- examples/color_test.py +62 -0
- examples/debug_ocr.py +91 -0
- examples/direct_ocr_test.py +148 -0
- examples/direct_paddle_test.py +99 -0
- examples/direct_qa_example.py +165 -0
- examples/document_layout_analysis.py +123 -0
- examples/document_qa_example.py +185 -0
- examples/exclusion_count_debug.py +128 -0
- examples/exclusion_debug.py +107 -0
- examples/exclusion_example.py +150 -0
- examples/exclusion_optimization_example.py +190 -0
- examples/extract_text_test.py +128 -0
- examples/font_aware_example.py +101 -0
- examples/font_variant_example.py +124 -0
- examples/footer_overlap_test.py +124 -0
- examples/highlight_all_example.py +82 -0
- examples/highlight_attributes_test.py +114 -0
- examples/highlight_confidence_display.py +122 -0
- examples/highlight_demo.py +110 -0
- examples/highlight_float_test.py +71 -0
- examples/highlight_test.py +147 -0
- examples/highlighting_example.py +123 -0
- examples/image_width_example.py +84 -0
- examples/improved_api_example.py +128 -0
- examples/layout_confidence_display_test.py +65 -0
- examples/layout_confidence_test.py +82 -0
- examples/layout_coordinate_debug.py +258 -0
- examples/layout_highlight_test.py +77 -0
- examples/logging_example.py +70 -0
- examples/ocr_comprehensive.py +193 -0
- examples/ocr_debug_example.py +87 -0
- examples/ocr_default_test.py +97 -0
- examples/ocr_engine_comparison.py +235 -0
- examples/ocr_example.py +89 -0
- examples/ocr_simplified_params.py +79 -0
- examples/ocr_visualization.py +102 -0
- examples/ocr_visualization_test.py +121 -0
- examples/paddle_layout_example.py +315 -0
- examples/paddle_layout_simple.py +74 -0
- examples/paddleocr_example.py +224 -0
- examples/page_collection_example.py +103 -0
- examples/polygon_highlight_example.py +83 -0
- examples/position_methods_example.py +134 -0
- examples/region_boundary_test.py +73 -0
- examples/region_exclusion_test.py +149 -0
- examples/region_expand_example.py +109 -0
- examples/region_image_example.py +116 -0
- examples/region_ocr_test.py +119 -0
- examples/region_sections_example.py +115 -0
- examples/school_books.py +49 -0
- examples/school_books_all.py +52 -0
- examples/scouring.py +36 -0
- examples/section_extraction_example.py +232 -0
- examples/simple_document_qa.py +97 -0
- examples/spatial_navigation_example.py +108 -0
- examples/table_extraction_example.py +135 -0
- examples/table_structure_detection.py +155 -0
- examples/tatr_cells_test.py +56 -0
- examples/tatr_ocr_table_test.py +94 -0
- examples/text_search_example.py +122 -0
- examples/text_style_example.py +110 -0
- examples/tiny-text.py +61 -0
- examples/until_boundaries_example.py +156 -0
- examples/until_example.py +112 -0
- examples/very_basics.py +15 -0
- natural_pdf/__init__.py +55 -0
- natural_pdf/analyzers/__init__.py +9 -0
- natural_pdf/analyzers/document_layout.py +736 -0
- natural_pdf/analyzers/text_structure.py +153 -0
- natural_pdf/core/__init__.py +3 -0
- natural_pdf/core/page.py +2376 -0
- natural_pdf/core/pdf.py +572 -0
- natural_pdf/elements/__init__.py +3 -0
- natural_pdf/elements/base.py +553 -0
- natural_pdf/elements/collections.py +770 -0
- natural_pdf/elements/line.py +124 -0
- natural_pdf/elements/rect.py +122 -0
- natural_pdf/elements/region.py +1366 -0
- natural_pdf/elements/text.py +304 -0
- natural_pdf/ocr/__init__.py +62 -0
- natural_pdf/ocr/easyocr_engine.py +254 -0
- natural_pdf/ocr/engine.py +158 -0
- natural_pdf/ocr/paddleocr_engine.py +263 -0
- natural_pdf/qa/__init__.py +3 -0
- natural_pdf/qa/document_qa.py +405 -0
- natural_pdf/selectors/__init__.py +4 -0
- natural_pdf/selectors/parser.py +360 -0
- natural_pdf/templates/__init__.py +1 -0
- natural_pdf/templates/ocr_debug.html +517 -0
- natural_pdf/utils/__init__.py +4 -0
- natural_pdf/utils/highlighting.py +605 -0
- natural_pdf/utils/ocr.py +515 -0
- natural_pdf/utils/reading_order.py +227 -0
- natural_pdf/utils/visualization.py +151 -0
- natural_pdf-25.3.16.dist-info/LICENSE +21 -0
- natural_pdf-25.3.16.dist-info/METADATA +268 -0
- natural_pdf-25.3.16.dist-info/RECORD +109 -0
- natural_pdf-25.3.16.dist-info/WHEEL +5 -0
- natural_pdf-25.3.16.dist-info/top_level.txt +3 -0
- tests/__init__.py +3 -0
- tests/test_pdf.py +39 -0
@@ -0,0 +1,304 @@
|
|
1
|
+
"""
|
2
|
+
Text element classes for natural-pdf.
|
3
|
+
"""
|
4
|
+
from typing import Dict, Any, Optional, TYPE_CHECKING
|
5
|
+
|
6
|
+
from natural_pdf.elements.base import Element
|
7
|
+
|
8
|
+
if TYPE_CHECKING:
|
9
|
+
from natural_pdf.core.page import Page
|
10
|
+
|
11
|
+
|
12
|
+
class TextElement(Element):
    """
    Represents a text element in a PDF.

    This class is a wrapper around pdfplumber's character objects,
    providing additional functionality for text extraction and analysis.
    Every property reads from the wrapped ``self._obj`` dictionary and falls
    back to a neutral default when the key is missing.
    """

    # Family names recognised by ``font_family``: matched as case-insensitive
    # substrings, first hit wins.
    _COMMON_FONTS = (
        'Arial', 'Helvetica', 'Times', 'Courier', 'Calibri',
        'Cambria', 'Georgia', 'Verdana', 'Tahoma', 'Trebuchet',
    )

    # PDF font descriptor /Flags masks. The PDF specification numbers bits
    # from 1, so Italic (bit 7) is 1 << 6 and ForceBold (bit 19) is 1 << 18.
    # Mask 4 (bit 3) means "Symbolic" and says nothing about weight.
    _FLAG_ITALIC = 1 << 6
    _FLAG_FORCE_BOLD = 1 << 18

    def __init__(self, obj: Dict[str, Any], page: 'Page'):
        """
        Initialize a text element.

        Args:
            obj: The underlying pdfplumber object. For OCR text elements,
                 should include 'text', 'bbox', 'source', and 'confidence'.
                 An 'object_type' of 'text' is added to this dictionary in
                 place when the key is absent.
            page: The parent Page object
        """
        obj.setdefault('object_type', 'text')
        super().__init__(obj, page)

    def _first_number(self, *keys: str):
        """Return the first non-zero int/float stored under ``keys``, or None."""
        for key in keys:
            value = self._obj.get(key)
            if isinstance(value, (int, float)) and value:
                return value
        return None

    @property
    def text(self) -> str:
        """Get the text content ('' when the object has no 'text' key)."""
        return self._obj.get('text', '')

    @property
    def source(self) -> str:
        """Get the source of this text element (defaults to 'pdf')."""
        return self._obj.get('source', 'pdf')

    @property
    def confidence(self) -> float:
        """Get the confidence score (defaults to 1.0 when none is stored)."""
        return self._obj.get('confidence', 1.0)

    @property
    def fontname(self) -> str:
        """
        Get the font name.

        'real_fontname' wins when present; otherwise 'fontname' is used,
        then 'font', then the empty string.
        """
        if 'real_fontname' in self._obj:
            return self._obj['real_fontname']
        return self._obj.get('fontname', '') or self._obj.get('font', '')

    @property
    def font_family(self) -> str:
        """
        Get a cleaner font family name by stripping PDF-specific prefixes.

        PDF font names often include prefixes like 'ABCDEF+' followed by the
        font name. Everything up to the first '+' is dropped; if the remainder
        contains one of the well-known family names (case-insensitively) that
        family name is returned, otherwise the remainder itself.
        """
        font = self.fontname
        if '+' in font:
            font = font.split('+', 1)[1]

        lowered = font.lower()
        for family in self._COMMON_FONTS:
            if family.lower() in lowered:
                return family
        return font

    @property
    def font_variant(self) -> str:
        """
        Get the font variant identifier (prefix before the '+' in PDF font names).

        PDF embeds font subsets with unique identifiers like 'AAAAAB+FontName'.
        Different variants of the same base font will have different prefixes.

        Returns:
            The font variant prefix, or empty string if no variant is present
        """
        font = self.fontname
        return font.split('+', 1)[0] if '+' in font else ""

    @property
    def size(self) -> float:
        """Get the font size (0 when the object has no 'size' key)."""
        return self._obj.get('size', 0)

    @property
    def color(self) -> tuple:
        """
        Get the text color as an (r, g, b) tuple.

        Reads 'non_stroking_color' and normalises the shapes it can take:

        * a bare number, or a one-component list/tuple: grayscale
        * three components: returned as RGB unchanged
        * four numeric components: treated as CMYK and converted to an
          approximate RGB value
        * anything else (including a missing value): black
        """
        raw = self._obj.get('non_stroking_color', (0, 0, 0))

        if isinstance(raw, (int, float)):
            return (raw, raw, raw)

        # Accept lists as well as tuples so a list-valued colour is not
        # silently turned into black.
        if isinstance(raw, (list, tuple)):
            parts = tuple(raw)
            if len(parts) == 3:
                return parts
            if all(isinstance(part, (int, float)) for part in parts):
                if len(parts) == 1:
                    gray = parts[0]
                    return (gray, gray, gray)
                if len(parts) == 4:
                    c, m, y, k = parts
                    # Cheap approximation: add black to each ink, clamp to 1.
                    return (
                        1 - min(1, c + k),
                        1 - min(1, m + k),
                        1 - min(1, y + k),
                    )

        return (0, 0, 0)

    def extract_text(self, keep_blank_chars=True, **kwargs) -> str:
        """
        Extract text from this element.

        Args:
            keep_blank_chars: Accepted for API consistency; it has no effect
                because the stored text is returned unchanged.
            **kwargs: Additional extraction parameters (ignored here)

        Returns:
            Text content
        """
        return self.text

    def contains(self, substring: str, case_sensitive: bool = True) -> bool:
        """
        Check if this text element contains a substring.

        Args:
            substring: The substring to check for
            case_sensitive: Whether the check is case-sensitive

        Returns:
            True if the text contains the substring
        """
        if case_sensitive:
            return substring in self.text
        return substring.lower() in self.text.lower()

    def matches(self, pattern: str) -> bool:
        """
        Check if this text element matches a regular expression pattern.

        Args:
            pattern: Regular expression pattern

        Returns:
            True if the pattern is found anywhere in the text (``re.search``)
        """
        import re
        return bool(re.search(pattern, self.text))

    @property
    def bold(self) -> bool:
        """
        Check if the text is bold based on multiple indicators in the PDF.

        The first indicator that fires wins:
        1. Font name containing 'bold' or 'black', or ending in '-B'
        2. Font descriptor flags with the ForceBold bit set
        3. StemV value (thickness of vertical stems) above 120
        4. Font weight of 700 or more
        5. Text rendering mode 2 (fill and stroke)
        6. A positive stroke width
        """
        fontname = self.fontname
        lowered = fontname.lower()
        if 'bold' in lowered or 'black' in lowered or fontname.endswith('-B'):
            return True

        # Only integers can be bit-tested; anything else is ignored rather
        # than raising TypeError.
        flags = self._obj.get('flags')
        if isinstance(flags, int) and flags & self._FLAG_FORCE_BOLD:
            return True

        stemv = self._first_number('stemv', 'StemV')
        if stemv is not None and stemv > 120:
            return True

        weight = self._first_number('weight', 'FontWeight')
        if weight is not None and weight >= 700:
            return True

        if self._obj.get('render_mode') == 2:
            return True

        stroke_width = self._first_number('stroke_width', 'lineWidth')
        return stroke_width is not None and stroke_width > 0

    @property
    def italic(self) -> bool:
        """
        Check if the text is italic based on multiple indicators in the PDF.

        The first indicator that fires wins:
        1. Font name containing 'italic' or 'oblique', or ending in '-I'
        2. Font descriptor flags with the Italic bit set
        3. A non-zero italic angle
        """
        fontname = self.fontname
        lowered = fontname.lower()
        if 'italic' in lowered or 'oblique' in lowered or fontname.endswith('-I'):
            return True

        flags = self._obj.get('flags')
        if isinstance(flags, int) and flags & self._FLAG_ITALIC:
            return True

        # _first_number only returns non-zero values, so any hit is a slant.
        return self._first_number('italic_angle', 'ItalicAngle') is not None

    def __repr__(self) -> str:
        """String representation of the text element."""
        text = self.text
        preview = text[:10] + '...' if len(text) > 10 else text

        font_style = []
        if self.bold:
            font_style.append("bold")
        if self.italic:
            font_style.append("italic")
        style_str = f", style={font_style}" if font_style else ""

        # Show the family name, plus the raw base font when the family was
        # derived from a subset-prefixed name.
        fontname = self.fontname
        font_display = self.font_family
        variant = self.font_variant
        variant_str = f", variant='{variant}'" if variant else ""

        if font_display != fontname and '+' in fontname:
            base_font = fontname.split('+', 1)[1]
            font_display = f"{font_display} ({base_font})"

        return f"<TextElement text='{preview}' font='{font_display}'{variant_str} size={self.size}{style_str} bbox={self.bbox}>"

    def font_info(self) -> dict:
        """
        Get detailed font information for this text element.

        Returns a dictionary with the derived font properties plus, under
        ``raw_<key>``, every raw font-related key present on the underlying
        object. Useful for debugging font detection issues.
        """
        info = {
            'text': self.text,
            'fontname': self.fontname,
            'font_family': self.font_family,
            'font_variant': self.font_variant,
            'size': self.size,
            'bold': self.bold,
            'italic': self.italic,
            'color': self.color,
        }

        raw_keys = (
            'flags', 'stemv', 'StemV', 'weight', 'FontWeight',
            'render_mode', 'stroke_width', 'lineWidth',
        )
        for key in raw_keys:
            if key in self._obj:
                info[f"raw_{key}"] = self._obj[key]

        return info
|
@@ -0,0 +1,62 @@
|
|
1
|
+
"""
|
2
|
+
OCR engines for natural-pdf.
|
3
|
+
|
4
|
+
This module provides different OCR engines that can be used with natural-pdf.
|
5
|
+
"""
|
6
|
+
import logging
|
7
|
+
|
8
|
+
# Set up module logger
|
9
|
+
logger = logging.getLogger("natural_pdf.ocr")
|
10
|
+
from .engine import OCREngine
|
11
|
+
from .easyocr_engine import EasyOCREngine
|
12
|
+
|
13
|
+
# PaddleOCR is optional. When its engine module imports cleanly it is exported
# and used as the default engine; otherwise only EasyOCR is exported and it
# becomes the default. One import attempt decides both.
try:
    from .paddleocr_engine import PaddleOCREngine
except ImportError:
    __all__ = ['OCREngine', 'EasyOCREngine']
    DEFAULT_ENGINE = EasyOCREngine
else:
    __all__ = ['OCREngine', 'EasyOCREngine', 'PaddleOCREngine']
    DEFAULT_ENGINE = PaddleOCREngine
|
26
|
+
|
27
|
+
def get_engine(engine_name=None, **kwargs):
    """
    Get OCR engine by name.

    Args:
        engine_name: Name of the engine to use ('easyocr', 'paddleocr' or
                     'default'), compared case-insensitively. If None, the
                     default engine is used (``DEFAULT_ENGINE``).
        **kwargs: Additional arguments to pass to the engine constructor

    Returns:
        OCREngine instance

    Raises:
        ImportError: If 'paddleocr' is requested but cannot be imported.
        ValueError: If ``engine_name`` is not a known engine name.
    """
    logger.debug(f"Initializing OCR engine: {engine_name or 'default'}")

    # Normalise once so every comparison is case-insensitive. Non-string
    # values are left alone and fall through to the ValueError below instead
    # of failing on ``.lower()``.
    name = engine_name.lower() if isinstance(engine_name, str) else engine_name

    if name is None or name == 'default':
        engine = DEFAULT_ENGINE(**kwargs)
        logger.info(f"Using default OCR engine: {engine.__class__.__name__}")
        return engine

    if name == 'easyocr':
        logger.info("Initializing EasyOCR engine")
        return EasyOCREngine(**kwargs)

    if name == 'paddleocr':
        try:
            # Imported here because the module-level import is allowed to fail.
            from .paddleocr_engine import PaddleOCREngine
            logger.info("Initializing PaddleOCR engine")
            return PaddleOCREngine(**kwargs)
        except ImportError as exc:
            logger.error("PaddleOCR is not installed")
            # Chain the original error so the missing module stays visible.
            raise ImportError(
                "PaddleOCR is not installed. Please install it with: pip install paddlepaddle paddleocr"
            ) from exc

    logger.error(f"Unknown OCR engine: {engine_name}")
    raise ValueError(f"Unknown OCR engine: {engine_name}")
|
@@ -0,0 +1,254 @@
|
|
1
|
+
"""
|
2
|
+
EasyOCR engine implementation.
|
3
|
+
"""
|
4
|
+
import importlib.util
import inspect
import logging
from typing import Dict, List, Any, Optional

import numpy as np
from PIL import Image

from .engine import OCREngine
|
10
|
+
|
11
|
+
|
12
|
+
class EasyOCREngine(OCREngine):
    """
    EasyOCR implementation.

    Readers are created lazily by ``get_reader`` and cached on the instance.
    ``process_image`` converts EasyOCR detections into dictionaries with
    'bbox', 'text', 'confidence' and 'source' keys.
    """

    # Child of the "natural_pdf.ocr" package logger.
    _logger = logging.getLogger("natural_pdf.ocr.easyocr")

    def __init__(self, **kwargs):
        """
        Initialize EasyOCR engine with optional settings.

        Args:
            **kwargs: Engine-specific settings. They are forwarded to the base
                class and kept as the starting point for reader construction.
        """
        super().__init__(**kwargs)
        self._readers = {}  # Cache of EasyOCR readers, keyed by get_reader
        self._init_settings = kwargs

    @staticmethod
    def _accepted_kwargs(func, kwargs: Dict[str, Any]) -> Dict[str, Any]:
        """
        Return the entries of ``kwargs`` that ``func`` accepts by keyword.

        Everything is passed through when ``func`` takes ``**kwargs`` or its
        signature cannot be inspected.
        """
        try:
            params = list(inspect.signature(func).parameters.values())
        except (TypeError, ValueError):
            return dict(kwargs)

        if any(p.kind is inspect.Parameter.VAR_KEYWORD for p in params):
            return dict(kwargs)

        keyword_kinds = (
            inspect.Parameter.POSITIONAL_OR_KEYWORD,
            inspect.Parameter.KEYWORD_ONLY,
        )
        allowed = {p.name for p in params if p.kind in keyword_kinds}
        return {key: value for key, value in kwargs.items() if key in allowed}

    @staticmethod
    def _polygon_to_rect(polygon) -> tuple:
        """Collapse a sequence of (x, y) points into an (x0, y0, x1, y1) box."""
        xs = [point[0] for point in polygon]
        ys = [point[1] for point in polygon]
        return (min(xs), min(ys), max(xs), max(ys))

    def is_available(self) -> bool:
        """
        Check if EasyOCR is installed.

        Returns:
            True if ``import easyocr`` succeeds, False if it raises ImportError
        """
        try:
            import easyocr  # noqa: F401 - imported only to probe availability
        except ImportError:
            return False
        return True

    def get_reader(self, config: Dict[str, Any]):
        """
        Get or initialize an EasyOCR reader based on configuration.

        Reader arguments are assembled from the constructor settings, the
        'languages' entry (default ``["en"]``), a 'device' entry mapped to
        EasyOCR's ``gpu`` flag (True for devices starting with "cuda"), and
        'model_settings'. Arguments the ``easyocr.Reader`` constructor does
        not accept are dropped, because the same settings are also used for
        ``readtext`` in ``process_image``.

        Args:
            config: OCR configuration

        Returns:
            EasyOCR reader instance (cached per language list and GPU flag)

        Raises:
            ImportError: If EasyOCR is not installed.
        """
        languages = config.get("languages", ["en"])
        if isinstance(languages, str):
            # A bare "en" would otherwise be joined character by character.
            languages = [languages]

        reader_kwargs = self._init_settings.copy()
        reader_kwargs["lang_list"] = languages
        if "device" in config:
            reader_kwargs["gpu"] = str(config["device"]).startswith("cuda")
        reader_kwargs.update(config.get("model_settings") or {})

        # Key on the GPU flag too, so a CPU reader is never handed out for a
        # CUDA request (or the other way round).
        cache_key = f"easyocr_{'-'.join(languages)}"
        if "gpu" in reader_kwargs:
            cache_key += f"_gpu={bool(reader_kwargs['gpu'])}"

        cached = self._readers.get(cache_key)
        if cached is not None:
            self._logger.debug("Using cached EasyOCR reader %s", cache_key)
            return cached

        if not importlib.util.find_spec("easyocr"):
            raise ImportError(
                "EasyOCR is not installed. Please install it with: pip install easyocr"
            )
        import easyocr

        reader_kwargs = self._accepted_kwargs(easyocr.Reader, reader_kwargs)
        self._logger.debug("Creating easyocr.Reader with %s", reader_kwargs)
        try:
            reader = easyocr.Reader(**reader_kwargs)
        except Exception:
            self._logger.exception("Error creating EasyOCR reader")
            raise

        self._readers[cache_key] = reader
        return reader

    def process_image(self, image: Image.Image, config: Optional[Dict[str, Any]] = None) -> List[Dict[str, Any]]:
        """
        Process an image with EasyOCR.

        Args:
            image: PIL Image to process. Any other object is handed to
                EasyOCR unchanged.
            config: OCR configuration; normalised with ``normalize_config``.
                Nothing is run unless the normalised config has a truthy
                'enabled' entry.

        Returns:
            List of standardized OCR results, each a dict with 'bbox'
            (x0, y0, x1, y1), 'text', 'confidence' and 'source' ('ocr').
            Detections below 'min_confidence' (default 0.5) are skipped.
            An empty list is returned when OCR is disabled or EasyOCR raises.
        """
        self._logger.debug(
            "Processing image of type %s, size %sx%s",
            type(image).__name__,
            getattr(image, "width", "unknown"),
            getattr(image, "height", "unknown"),
        )

        config = self.normalize_config({} if config is None else config)
        if not config.get("enabled"):
            self._logger.debug("OCR is disabled in config, returning empty list")
            return []

        reader = self.get_reader(config)
        img_array = np.array(image) if isinstance(image, Image.Image) else image

        # Flat model_settings take precedence; the nested groups are kept for
        # backward compatibility and only fill in keys that are still unset.
        readtext_kwargs = dict(config.get("model_settings") or {})
        for group in ("detection_params", "recognition_params"):
            for key, value in (config.get(group) or {}).items():
                readtext_kwargs.setdefault(key, value)
        # Drop reader-only settings that readtext() would reject.
        readtext_kwargs = self._accepted_kwargs(reader.readtext, readtext_kwargs)

        self._logger.debug("Running EasyOCR with parameters: %s", readtext_kwargs)
        try:
            result = reader.readtext(img_array, **readtext_kwargs)
        except Exception:
            # Best effort: an OCR failure yields no results instead of
            # propagating to the caller.
            self._logger.exception("EasyOCR error")
            return []
        self._logger.debug("EasyOCR returned %d results", len(result))

        min_confidence = config.get("min_confidence", 0.5)
        standardized_results = []

        for detection in result:
            if isinstance(detection, str):
                # Text-only output (detail=0): no geometry or score available.
                standardized_results.append({
                    'bbox': (0, 0, 1, 1),  # Dummy bbox
                    'text': detection,
                    'confidence': 1.0,  # Default confidence
                    'source': 'ocr'
                })
                continue

            # Detailed output is (polygon, text, confidence), as a list or a
            # tuple. Two-item entries carry no score and are kept at 1.0.
            if not isinstance(detection, (list, tuple)) or len(detection) < 2:
                continue

            polygon, text = detection[0], detection[1]
            confidence = detection[2] if len(detection) >= 3 else 1.0
            if confidence < min_confidence:
                continue

            standardized_results.append({
                'bbox': self._polygon_to_rect(polygon),
                'text': text,
                'confidence': confidence,
                'source': 'ocr'
            })

        return standardized_results

    def __del__(self):
        """Cleanup resources when the engine is deleted."""
        # __init__ may have failed before the cache existed.
        readers = getattr(self, "_readers", None)
        if readers is not None:
            readers.clear()
|