kiri-ocr 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kiri_ocr/__init__.py +14 -0
- kiri_ocr/cli.py +244 -0
- kiri_ocr/core.py +306 -0
- kiri_ocr/detector.py +374 -0
- kiri_ocr/generator.py +570 -0
- kiri_ocr/model.py +159 -0
- kiri_ocr/renderer.py +193 -0
- kiri_ocr/training.py +508 -0
- kiri_ocr-0.1.0.data/scripts/kiri-ocr +6 -0
- kiri_ocr-0.1.0.dist-info/METADATA +218 -0
- kiri_ocr-0.1.0.dist-info/RECORD +16 -0
- kiri_ocr-0.1.0.dist-info/WHEEL +5 -0
- kiri_ocr-0.1.0.dist-info/licenses/LICENSE +201 -0
- kiri_ocr-0.1.0.dist-info/top_level.txt +2 -0
- models/__init__.py +1 -0
- models/model.kiri +0 -0
kiri_ocr/detector.py
ADDED
|
@@ -0,0 +1,374 @@
|
|
|
1
|
+
import cv2
|
|
2
|
+
import numpy as np
|
|
3
|
+
from scipy.ndimage import gaussian_filter1d
|
|
4
|
+
|
|
5
|
+
class TextDetector:
    """Language-agnostic text region detector.

    Mirrors Tesseract's page layout analysis: Otsu binarization (with
    polarity auto-correction), connected-component analysis, median-height
    noise filtering, baseline clustering into lines, and merging of
    overlapping detections. Padding around detected boxes is either a fixed
    pixel amount or derived automatically from the detected text size.
    """
|
|
7
|
+
|
|
8
|
+
def __init__(self, padding=None):
|
|
9
|
+
"""
|
|
10
|
+
Args:
|
|
11
|
+
padding: Pixels to add around detected boxes (default: None for auto)
|
|
12
|
+
If None, padding will be automatically calculated based on text size
|
|
13
|
+
"""
|
|
14
|
+
self.padding = padding
|
|
15
|
+
self._auto_padding = None
|
|
16
|
+
|
|
17
|
+
def detect_lines(self, image_path):
|
|
18
|
+
"""
|
|
19
|
+
Detect text lines using Tesseract's actual Page Layout Analysis:
|
|
20
|
+
1. Otsu binarization
|
|
21
|
+
2. Connected components with stats
|
|
22
|
+
3. Estimate median text height from components
|
|
23
|
+
4. Filter noise based on text height
|
|
24
|
+
5. Find baselines and group into text lines
|
|
25
|
+
6. Merge overlapping regions
|
|
26
|
+
|
|
27
|
+
Returns:
|
|
28
|
+
List of bounding boxes [(x, y, w, h), ...]
|
|
29
|
+
"""
|
|
30
|
+
# Load and preprocess
|
|
31
|
+
img = cv2.imread(str(image_path))
|
|
32
|
+
if img is None:
|
|
33
|
+
raise ValueError(f"Could not load image: {image_path}")
|
|
34
|
+
|
|
35
|
+
img_h, img_w = img.shape[:2]
|
|
36
|
+
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
|
|
37
|
+
|
|
38
|
+
# Step 1: Otsu binarization (foreground detection)
|
|
39
|
+
# auto-detects if it needs to invert
|
|
40
|
+
_, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
|
|
41
|
+
|
|
42
|
+
# Check if we got mostly white (wrong polarity)
|
|
43
|
+
if np.mean(binary) > 127:
|
|
44
|
+
_, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
|
|
45
|
+
binary = 255 - binary
|
|
46
|
+
|
|
47
|
+
# Step 2: Connected components analysis
|
|
48
|
+
num_labels, labels, stats, centroids = cv2.connectedComponentsWithStats(
|
|
49
|
+
binary, connectivity=8
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
# Step 3: Estimate typical text height
|
|
53
|
+
# Collect component heights
|
|
54
|
+
heights = []
|
|
55
|
+
widths = []
|
|
56
|
+
valid_components = []
|
|
57
|
+
|
|
58
|
+
for i in range(1, num_labels): # Skip background
|
|
59
|
+
x, y, w, h, area = stats[i]
|
|
60
|
+
|
|
61
|
+
# Basic sanity filters (remove 1-pixel noise)
|
|
62
|
+
if w >= 2 and h >= 3 and area >= 6:
|
|
63
|
+
heights.append(h)
|
|
64
|
+
widths.append(w)
|
|
65
|
+
valid_components.append({
|
|
66
|
+
'id': i,
|
|
67
|
+
'bbox': (x, y, w, h),
|
|
68
|
+
'area': area,
|
|
69
|
+
'centroid': centroids[i]
|
|
70
|
+
})
|
|
71
|
+
|
|
72
|
+
if not valid_components:
|
|
73
|
+
return []
|
|
74
|
+
|
|
75
|
+
# Tesseract's approach: use median height as typical character height
|
|
76
|
+
median_height = np.median(heights)
|
|
77
|
+
median_width = np.median(widths)
|
|
78
|
+
|
|
79
|
+
# Auto-calculate padding based on median text height
|
|
80
|
+
# typically uses ~10-20% of text height as padding
|
|
81
|
+
if self.padding is None:
|
|
82
|
+
self._auto_padding = max(2, int(median_height * 0.15))
|
|
83
|
+
else:
|
|
84
|
+
self._auto_padding = self.padding
|
|
85
|
+
|
|
86
|
+
# Step 4: Filter components
|
|
87
|
+
# Keep components that are reasonable relative to median text size
|
|
88
|
+
# uses: 0.25 * median < height < 3.0 * median
|
|
89
|
+
min_h = max(3, median_height * 0.25)
|
|
90
|
+
max_h = median_height * 3.0
|
|
91
|
+
min_w = max(2, median_width * 0.1)
|
|
92
|
+
max_w = img_w * 0.95 # Not entire width
|
|
93
|
+
|
|
94
|
+
text_components = []
|
|
95
|
+
for comp in valid_components:
|
|
96
|
+
x, y, w, h = comp['bbox']
|
|
97
|
+
|
|
98
|
+
# Tesseract's blob filtering
|
|
99
|
+
if min_h <= h <= max_h and min_w <= w <= max_w:
|
|
100
|
+
# Aspect ratio check (not too extreme)
|
|
101
|
+
aspect = w / h if h > 0 else 0
|
|
102
|
+
if 0.05 < aspect < 20: # Reasonable character proportions
|
|
103
|
+
# Additional filter: remove very small isolated components
|
|
104
|
+
# (likely diacritics that should be grouped with main text)
|
|
105
|
+
# Skip components that are less than 30% of median height
|
|
106
|
+
if h >= median_height * 0.3:
|
|
107
|
+
text_components.append(comp)
|
|
108
|
+
|
|
109
|
+
if not text_components:
|
|
110
|
+
return []
|
|
111
|
+
|
|
112
|
+
# Step 5: Line finding using baseline clustering
|
|
113
|
+
# Group components by vertical position (baseline detection)
|
|
114
|
+
text_components.sort(key=lambda c: c['centroid'][1])
|
|
115
|
+
|
|
116
|
+
lines = []
|
|
117
|
+
current_line = [text_components[0]]
|
|
118
|
+
|
|
119
|
+
for comp in text_components[1:]:
|
|
120
|
+
# Get current line's vertical extent
|
|
121
|
+
line_y_values = [c['centroid'][1] for c in current_line]
|
|
122
|
+
line_y_center = np.mean(line_y_values)
|
|
123
|
+
line_heights = [c['bbox'][3] for c in current_line]
|
|
124
|
+
avg_line_height = np.mean(line_heights)
|
|
125
|
+
|
|
126
|
+
comp_y = comp['centroid'][1]
|
|
127
|
+
|
|
128
|
+
# For Khmer text with diacritics, use slightly larger tolerance
|
|
129
|
+
# Use 0.45 to group base characters with nearby components
|
|
130
|
+
tolerance = avg_line_height * 0.45
|
|
131
|
+
|
|
132
|
+
if abs(comp_y - line_y_center) <= tolerance:
|
|
133
|
+
current_line.append(comp)
|
|
134
|
+
else:
|
|
135
|
+
lines.append(current_line)
|
|
136
|
+
current_line = [comp]
|
|
137
|
+
|
|
138
|
+
if current_line:
|
|
139
|
+
lines.append(current_line)
|
|
140
|
+
|
|
141
|
+
# Step 6: Create bounding boxes for each line
|
|
142
|
+
line_boxes = []
|
|
143
|
+
for line in lines:
|
|
144
|
+
if not line:
|
|
145
|
+
continue
|
|
146
|
+
|
|
147
|
+
# Get line bounding box
|
|
148
|
+
x_min = min(c['bbox'][0] for c in line)
|
|
149
|
+
y_min = min(c['bbox'][1] for c in line)
|
|
150
|
+
x_max = max(c['bbox'][0] + c['bbox'][2] for c in line)
|
|
151
|
+
y_max = max(c['bbox'][1] + c['bbox'][3] for c in line)
|
|
152
|
+
|
|
153
|
+
# Add padding (auto-calculated based on text size)
|
|
154
|
+
x_pad = max(0, x_min - self._auto_padding)
|
|
155
|
+
y_pad = max(0, y_min - self._auto_padding)
|
|
156
|
+
w_pad = min(img_w - x_pad, (x_max - x_min) + 2 * self._auto_padding)
|
|
157
|
+
h_pad = min(img_h - y_pad, (y_max - y_min) + 2 * self._auto_padding)
|
|
158
|
+
|
|
159
|
+
line_boxes.append((x_pad, y_pad, w_pad, h_pad))
|
|
160
|
+
|
|
161
|
+
# Merge overlapping boxes (fix duplicate detections)
|
|
162
|
+
line_boxes = self._merge_overlapping_boxes(line_boxes, median_height)
|
|
163
|
+
|
|
164
|
+
# Sort top to bottom
|
|
165
|
+
line_boxes = sorted(line_boxes, key=lambda b: b[1])
|
|
166
|
+
|
|
167
|
+
return line_boxes
|
|
168
|
+
|
|
169
|
+
def detect_words(self, image_path):
|
|
170
|
+
"""
|
|
171
|
+
Detect words using Tesseract's approach:
|
|
172
|
+
1. Find all text components
|
|
173
|
+
2. Estimate typical character width
|
|
174
|
+
3. Group into lines
|
|
175
|
+
4. Within lines, cluster by horizontal spacing
|
|
176
|
+
|
|
177
|
+
Returns:
|
|
178
|
+
List of word bounding boxes
|
|
179
|
+
"""
|
|
180
|
+
img = cv2.imread(str(image_path))
|
|
181
|
+
if img is None:
|
|
182
|
+
raise ValueError(f"Could not load image: {image_path}")
|
|
183
|
+
|
|
184
|
+
img_h, img_w = img.shape[:2]
|
|
185
|
+
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
|
|
186
|
+
|
|
187
|
+
# Binarization
|
|
188
|
+
_, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
|
|
189
|
+
if np.mean(binary) > 127:
|
|
190
|
+
_, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
|
|
191
|
+
binary = 255 - binary
|
|
192
|
+
|
|
193
|
+
# Connected components
|
|
194
|
+
num_labels, labels, stats, centroids = cv2.connectedComponentsWithStats(
|
|
195
|
+
binary, connectivity=8
|
|
196
|
+
)
|
|
197
|
+
|
|
198
|
+
# Collect and filter components
|
|
199
|
+
heights = []
|
|
200
|
+
widths = []
|
|
201
|
+
valid_components = []
|
|
202
|
+
|
|
203
|
+
for i in range(1, num_labels):
|
|
204
|
+
x, y, w, h, area = stats[i]
|
|
205
|
+
if w >= 2 and h >= 3 and area >= 6:
|
|
206
|
+
heights.append(h)
|
|
207
|
+
widths.append(w)
|
|
208
|
+
valid_components.append({
|
|
209
|
+
'bbox': (x, y, w, h),
|
|
210
|
+
'centroid': centroids[i]
|
|
211
|
+
})
|
|
212
|
+
|
|
213
|
+
if not valid_components:
|
|
214
|
+
return []
|
|
215
|
+
|
|
216
|
+
# Estimate text size
|
|
217
|
+
median_height = np.median(heights)
|
|
218
|
+
median_width = np.median(widths)
|
|
219
|
+
|
|
220
|
+
# Auto-calculate padding based on median text height
|
|
221
|
+
if self.padding is None:
|
|
222
|
+
self._auto_padding = max(2, int(median_height * 0.15))
|
|
223
|
+
else:
|
|
224
|
+
self._auto_padding = self.padding
|
|
225
|
+
|
|
226
|
+
# Filter components
|
|
227
|
+
min_h = max(3, median_height * 0.25)
|
|
228
|
+
max_h = median_height * 3.0
|
|
229
|
+
|
|
230
|
+
text_components = []
|
|
231
|
+
for comp in valid_components:
|
|
232
|
+
x, y, w, h = comp['bbox']
|
|
233
|
+
if min_h <= h <= max_h and w < img_w * 0.95:
|
|
234
|
+
aspect = w / h if h > 0 else 0
|
|
235
|
+
if 0.05 < aspect < 20:
|
|
236
|
+
# Filter out very small components (diacritics)
|
|
237
|
+
if h >= median_height * 0.3:
|
|
238
|
+
text_components.append(comp)
|
|
239
|
+
|
|
240
|
+
if not text_components:
|
|
241
|
+
return []
|
|
242
|
+
|
|
243
|
+
# Group into lines
|
|
244
|
+
text_components.sort(key=lambda c: c['centroid'][1])
|
|
245
|
+
lines = []
|
|
246
|
+
current_line = [text_components[0]]
|
|
247
|
+
|
|
248
|
+
for comp in text_components[1:]:
|
|
249
|
+
line_y_center = np.mean([c['centroid'][1] for c in current_line])
|
|
250
|
+
avg_height = np.mean([c['bbox'][3] for c in current_line])
|
|
251
|
+
|
|
252
|
+
# Use 0.45 tolerance for Khmer diacritics
|
|
253
|
+
if abs(comp['centroid'][1] - line_y_center) <= avg_height * 0.45:
|
|
254
|
+
current_line.append(comp)
|
|
255
|
+
else:
|
|
256
|
+
lines.append(current_line)
|
|
257
|
+
current_line = [comp]
|
|
258
|
+
|
|
259
|
+
if current_line:
|
|
260
|
+
lines.append(current_line)
|
|
261
|
+
|
|
262
|
+
# Segment lines into words
|
|
263
|
+
word_boxes = []
|
|
264
|
+
for line in lines:
|
|
265
|
+
# Sort by X position
|
|
266
|
+
line.sort(key=lambda c: c['bbox'][0])
|
|
267
|
+
|
|
268
|
+
# word spacing: typically 0.3 to 1.0 * median character width
|
|
269
|
+
# Space between words is larger than between characters
|
|
270
|
+
char_widths = [c['bbox'][2] for c in line]
|
|
271
|
+
median_char_width = np.median(char_widths) if char_widths else 10
|
|
272
|
+
|
|
273
|
+
# Word gap threshold
|
|
274
|
+
word_gap = median_char_width * 0.6
|
|
275
|
+
|
|
276
|
+
current_word = [line[0]]
|
|
277
|
+
|
|
278
|
+
for i in range(1, len(line)):
|
|
279
|
+
prev = line[i-1]
|
|
280
|
+
curr = line[i]
|
|
281
|
+
|
|
282
|
+
# Gap between characters
|
|
283
|
+
gap = curr['bbox'][0] - (prev['bbox'][0] + prev['bbox'][2])
|
|
284
|
+
|
|
285
|
+
if gap <= word_gap:
|
|
286
|
+
# Same word
|
|
287
|
+
current_word.append(curr)
|
|
288
|
+
else:
|
|
289
|
+
# New word
|
|
290
|
+
if current_word:
|
|
291
|
+
word_box = self._merge_component_boxes(current_word)
|
|
292
|
+
word_box = self._add_padding(word_box, img_w, img_h)
|
|
293
|
+
word_boxes.append(word_box)
|
|
294
|
+
current_word = [curr]
|
|
295
|
+
|
|
296
|
+
# Last word
|
|
297
|
+
if current_word:
|
|
298
|
+
word_box = self._merge_component_boxes(current_word)
|
|
299
|
+
word_box = self._add_padding(word_box, img_w, img_h)
|
|
300
|
+
word_boxes.append(word_box)
|
|
301
|
+
|
|
302
|
+
return word_boxes
|
|
303
|
+
|
|
304
|
+
def _merge_component_boxes(self, components):
|
|
305
|
+
"""Merge component bounding boxes"""
|
|
306
|
+
if not components:
|
|
307
|
+
return None
|
|
308
|
+
|
|
309
|
+
x_min = min(c['bbox'][0] for c in components)
|
|
310
|
+
y_min = min(c['bbox'][1] for c in components)
|
|
311
|
+
x_max = max(c['bbox'][0] + c['bbox'][2] for c in components)
|
|
312
|
+
y_max = max(c['bbox'][1] + c['bbox'][3] for c in components)
|
|
313
|
+
|
|
314
|
+
return (x_min, y_min, x_max - x_min, y_max - y_min)
|
|
315
|
+
|
|
316
|
+
def _add_padding(self, box, img_w, img_h):
|
|
317
|
+
"""Add padding to a box with boundary checks"""
|
|
318
|
+
if box is None:
|
|
319
|
+
return None
|
|
320
|
+
x, y, w, h = box
|
|
321
|
+
padding = self._auto_padding if self._auto_padding is not None else 10
|
|
322
|
+
x_pad = max(0, x - padding)
|
|
323
|
+
y_pad = max(0, y - padding)
|
|
324
|
+
w_pad = min(img_w - x_pad, w + 2 * padding)
|
|
325
|
+
h_pad = min(img_h - y_pad, h + 2 * padding)
|
|
326
|
+
return (x_pad, y_pad, w_pad, h_pad)
|
|
327
|
+
|
|
328
|
+
def _merge_overlapping_boxes(self, boxes, median_height):
|
|
329
|
+
"""Merge boxes that overlap vertically (same text line detected multiple times)"""
|
|
330
|
+
if not boxes:
|
|
331
|
+
return []
|
|
332
|
+
|
|
333
|
+
# Sort by Y coordinate
|
|
334
|
+
boxes = sorted(boxes, key=lambda b: b[1])
|
|
335
|
+
merged = []
|
|
336
|
+
current_box = boxes[0]
|
|
337
|
+
|
|
338
|
+
for next_box in boxes[1:]:
|
|
339
|
+
# Check vertical overlap
|
|
340
|
+
curr_y1, curr_y2 = current_box[1], current_box[1] + current_box[3]
|
|
341
|
+
next_y1, next_y2 = next_box[1], next_box[1] + next_box[3]
|
|
342
|
+
|
|
343
|
+
# Calculate overlap
|
|
344
|
+
overlap_start = max(curr_y1, next_y1)
|
|
345
|
+
overlap_end = min(curr_y2, next_y2)
|
|
346
|
+
overlap = max(0, overlap_end - overlap_start)
|
|
347
|
+
|
|
348
|
+
# Calculate heights
|
|
349
|
+
curr_height = current_box[3]
|
|
350
|
+
next_height = next_box[3]
|
|
351
|
+
|
|
352
|
+
# More strict merging: only merge if significant overlap
|
|
353
|
+
# Merge if overlap is more than 40% of the smaller box height
|
|
354
|
+
min_height = min(curr_height, next_height)
|
|
355
|
+
if overlap > min_height * 0.4:
|
|
356
|
+
# Merge: take union of both boxes
|
|
357
|
+
x_min = min(current_box[0], next_box[0])
|
|
358
|
+
y_min = min(current_box[1], next_box[1])
|
|
359
|
+
x_max = max(current_box[0] + current_box[2], next_box[0] + next_box[2])
|
|
360
|
+
y_max = max(current_box[1] + current_box[3], next_box[1] + next_box[3])
|
|
361
|
+
current_box = (x_min, y_min, x_max - x_min, y_max - y_min)
|
|
362
|
+
else:
|
|
363
|
+
# No significant overlap, save current and move to next
|
|
364
|
+
merged.append(current_box)
|
|
365
|
+
current_box = next_box
|
|
366
|
+
|
|
367
|
+
# Add the last box
|
|
368
|
+
merged.append(current_box)
|
|
369
|
+
return merged
|
|
370
|
+
|
|
371
|
+
def is_multiline(self, image_path, threshold=2):
|
|
372
|
+
"""Check if image contains multiple lines"""
|
|
373
|
+
boxes = self.detect_lines(image_path)
|
|
374
|
+
return len(boxes) >= threshold
|