kiri-ocr 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kiri_ocr/detector.py ADDED
@@ -0,0 +1,374 @@
1
+ import cv2
2
+ import numpy as np
3
+ from scipy.ndimage import gaussian_filter1d
4
+
5
class TextDetector:
    """Detect text regions via connected-component analysis.

    Mirrors Tesseract's page-layout approach (Otsu binarization, component
    statistics, baseline clustering) and is language-agnostic.
    """

    def __init__(self, padding=None):
        """
        Args:
            padding: Pixels added around each detected box. When ``None``
                (default) the padding is derived automatically from the
                median text height (~15% of it, at least 2 px).
        """
        self.padding = padding
        # Resolved per-image padding; set on every detect_* call.
        self._auto_padding = None

    # ------------------------------------------------------------------
    # Shared pipeline stages
    # ------------------------------------------------------------------

    def _binarize(self, image_path):
        """Load *image_path* and return ``(foreground_mask, img_w, img_h)``.

        Raises:
            ValueError: if the image cannot be read.
        """
        img = cv2.imread(str(image_path))
        if img is None:
            raise ValueError(f"Could not load image: {image_path}")
        img_h, img_w = img.shape[:2]
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        # Otsu threshold; INV assumes dark text on a light background.
        _, binary = cv2.threshold(
            gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
        )
        # A mostly-white mask means the polarity guess was wrong (light text
        # on a dark background).  BUG FIX: the original re-thresholded with
        # THRESH_BINARY and then inverted, which reproduces THRESH_BINARY_INV
        # exactly and therefore never changed anything; simply invert the
        # existing mask instead.
        if np.mean(binary) > 127:
            binary = 255 - binary
        return binary, img_w, img_h

    def _components(self, binary):
        """Run connected-components analysis and drop 1-pixel speckle noise.

        Returns:
            ``(components, heights, widths)`` where each component is a dict
            with keys ``'id'``, ``'bbox'`` (x, y, w, h), ``'area'`` and
            ``'centroid'``.
        """
        num_labels, _labels, stats, centroids = cv2.connectedComponentsWithStats(
            binary, connectivity=8
        )
        components, heights, widths = [], [], []
        for i in range(1, num_labels):  # label 0 is the background
            x, y, w, h, area = stats[i]
            # Basic sanity filter: remove 1-pixel noise.
            if w >= 2 and h >= 3 and area >= 6:
                heights.append(h)
                widths.append(w)
                components.append({
                    'id': i,
                    'bbox': (x, y, w, h),
                    'area': area,
                    'centroid': centroids[i],
                })
        return components, heights, widths

    def _resolve_padding(self, median_height):
        """Set ``self._auto_padding``: explicit value, or ~15% of text height."""
        if self.padding is None:
            self._auto_padding = max(2, int(median_height * 0.15))
        else:
            self._auto_padding = self.padding

    def _filter_text(self, components, median_height, median_width, img_w):
        """Keep components whose size and shape are plausible for text.

        Height must lie in 0.25x..3.0x of the median, width must not span
        (almost) the whole image, the aspect ratio must be reasonable, and
        tiny isolated blobs (< 30% of median height, likely stray
        diacritics) are dropped.
        """
        min_h = max(3, median_height * 0.25)
        max_h = median_height * 3.0
        min_w = max(2, median_width * 0.1)
        max_w = img_w * 0.95  # never the entire image width
        kept = []
        for comp in components:
            _x, _y, w, h = comp['bbox']
            if not (min_h <= h <= max_h and min_w <= w <= max_w):
                continue
            aspect = w / h if h > 0 else 0
            if 0.05 < aspect < 20 and h >= median_height * 0.3:
                kept.append(comp)
        return kept

    def _group_into_lines(self, text_components):
        """Cluster components into text lines by vertical centroid position.

        Uses a tolerance of 45% of the running average line height so that
        Khmer-style diacritics stay attached to their base line.
        """
        text_components = sorted(text_components, key=lambda c: c['centroid'][1])
        lines = []
        current = [text_components[0]]
        for comp in text_components[1:]:
            center = np.mean([c['centroid'][1] for c in current])
            avg_h = np.mean([c['bbox'][3] for c in current])
            if abs(comp['centroid'][1] - center) <= avg_h * 0.45:
                current.append(comp)
            else:
                lines.append(current)
                current = [comp]
        if current:
            lines.append(current)
        return lines

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def detect_lines(self, image_path):
        """Detect text lines.

        Pipeline: Otsu binarization -> connected components -> median text
        size estimate -> size/aspect filtering -> baseline clustering ->
        merge of overlapping line boxes.

        Returns:
            List of bounding boxes ``[(x, y, w, h), ...]`` sorted top to
            bottom.
        """
        binary, img_w, img_h = self._binarize(image_path)
        components, heights, widths = self._components(binary)
        if not components:
            return []

        median_height = np.median(heights)
        median_width = np.median(widths)
        self._resolve_padding(median_height)

        text_components = self._filter_text(
            components, median_height, median_width, img_w
        )
        if not text_components:
            return []

        line_boxes = []
        for line in self._group_into_lines(text_components):
            if not line:
                continue
            box = self._merge_component_boxes(line)
            line_boxes.append(self._add_padding(box, img_w, img_h))

        # Merge duplicate detections of the same physical line.
        line_boxes = self._merge_overlapping_boxes(line_boxes, median_height)
        return sorted(line_boxes, key=lambda b: b[1])

    def detect_words(self, image_path):
        """Detect words.

        Runs the same component pipeline as :meth:`detect_lines`, then
        splits each line wherever the horizontal gap between neighbouring
        components exceeds 60% of the line's median component width.

        Returns:
            List of word bounding boxes ``[(x, y, w, h), ...]``.
        """
        binary, img_w, img_h = self._binarize(image_path)
        components, heights, widths = self._components(binary)
        if not components:
            return []

        median_height = np.median(heights)
        median_width = np.median(widths)
        self._resolve_padding(median_height)

        # CONSISTENCY FIX: the original word path silently skipped the
        # min-width filter used by the line path; both now share one filter.
        text_components = self._filter_text(
            components, median_height, median_width, img_w
        )
        if not text_components:
            return []

        word_boxes = []
        for line in self._group_into_lines(text_components):
            line.sort(key=lambda c: c['bbox'][0])

            # Inter-word gaps are larger than inter-character gaps; use 60%
            # of the line's median component width as the split threshold.
            char_widths = [c['bbox'][2] for c in line]
            median_char_width = np.median(char_widths) if char_widths else 10
            word_gap = median_char_width * 0.6

            current_word = [line[0]]
            for prev, curr in zip(line, line[1:]):
                gap = curr['bbox'][0] - (prev['bbox'][0] + prev['bbox'][2])
                if gap <= word_gap:
                    current_word.append(curr)
                else:
                    box = self._merge_component_boxes(current_word)
                    word_boxes.append(self._add_padding(box, img_w, img_h))
                    current_word = [curr]

            if current_word:
                box = self._merge_component_boxes(current_word)
                word_boxes.append(self._add_padding(box, img_w, img_h))

        return word_boxes

    def is_multiline(self, image_path, threshold=2):
        """Return True when at least *threshold* text lines are detected."""
        return len(self.detect_lines(image_path)) >= threshold

    # ------------------------------------------------------------------
    # Geometry helpers
    # ------------------------------------------------------------------

    def _merge_component_boxes(self, components):
        """Union of the components' bounding boxes, or None if empty."""
        if not components:
            return None
        x_min = min(c['bbox'][0] for c in components)
        y_min = min(c['bbox'][1] for c in components)
        x_max = max(c['bbox'][0] + c['bbox'][2] for c in components)
        y_max = max(c['bbox'][1] + c['bbox'][3] for c in components)
        return (x_min, y_min, x_max - x_min, y_max - y_min)

    def _add_padding(self, box, img_w, img_h):
        """Grow *box* by the resolved padding, clamped to the image bounds."""
        if box is None:
            return None
        x, y, w, h = box
        # Fallback of 10 px covers direct calls made before any detect_*.
        padding = self._auto_padding if self._auto_padding is not None else 10
        x_pad = max(0, x - padding)
        y_pad = max(0, y - padding)
        w_pad = min(img_w - x_pad, w + 2 * padding)
        h_pad = min(img_h - y_pad, h + 2 * padding)
        return (x_pad, y_pad, w_pad, h_pad)

    def _merge_overlapping_boxes(self, boxes, median_height):
        """Merge boxes whose vertical overlap exceeds 40% of the smaller
        box's height (duplicate detections of the same line).

        ``median_height`` is kept for interface compatibility but unused.
        """
        if not boxes:
            return []
        boxes = sorted(boxes, key=lambda b: b[1])
        merged = []
        current = boxes[0]
        for nxt in boxes[1:]:
            overlap = max(
                0,
                min(current[1] + current[3], nxt[1] + nxt[3])
                - max(current[1], nxt[1]),
            )
            if overlap > min(current[3], nxt[3]) * 0.4:
                # Significant overlap: replace both with their union.
                x_min = min(current[0], nxt[0])
                y_min = min(current[1], nxt[1])
                x_max = max(current[0] + current[2], nxt[0] + nxt[2])
                y_max = max(current[1] + current[3], nxt[1] + nxt[3])
                current = (x_min, y_min, x_max - x_min, y_max - y_min)
            else:
                merged.append(current)
                current = nxt
        merged.append(current)
        return merged