openvisionkit 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openvisionkit/__init__.py +1 -0
- openvisionkit/_version.py +24 -0
- openvisionkit/capture/draw_object.py +296 -0
- openvisionkit/capture/image_template.py +61 -0
- openvisionkit/capture/screen_capture.py +13 -0
- openvisionkit/capture/video_recorder.py +128 -0
- openvisionkit/capture/video_template.py +336 -0
- openvisionkit/lib/classifier.py +186 -0
- openvisionkit/lib/face_detector.py +587 -0
- openvisionkit/lib/face_mesh_detector.py +913 -0
- openvisionkit/lib/form_detector.py +465 -0
- openvisionkit/lib/form_roi_annotator.py +679 -0
- openvisionkit/lib/form_roi_detector.py +1078 -0
- openvisionkit/lib/fps_counter.py +38 -0
- openvisionkit/lib/hair_segmentation.py +298 -0
- openvisionkit/lib/hand_detector.py +1230 -0
- openvisionkit/lib/image_detector.py +1095 -0
- openvisionkit/lib/object_detector.py +401 -0
- openvisionkit/lib/pose_detector.py +919 -0
- openvisionkit/lib/selfie_segmentation.py +528 -0
- openvisionkit/lib/text_detector.py +1229 -0
- openvisionkit/utility/live_plot.py +141 -0
- openvisionkit/utility/vision_utilis.py +871 -0
- openvisionkit-0.4.0.dist-info/METADATA +1018 -0
- openvisionkit-0.4.0.dist-info/RECORD +26 -0
- openvisionkit-0.4.0.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,1095 @@
|
|
|
1
|
+
import json
|
|
2
|
+
|
|
3
|
+
import cv2
|
|
4
|
+
import numpy as np
|
|
5
|
+
|
|
6
|
+
try:
|
|
7
|
+
import pytesseract
|
|
8
|
+
except ImportError:
|
|
9
|
+
pytesseract = None # type: ignore
|
|
10
|
+
try:
|
|
11
|
+
from skimage.metrics import structural_similarity as ssim
|
|
12
|
+
except ImportError:
|
|
13
|
+
ssim = None # type: ignore
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class ImageDetector:
|
|
17
|
+
def __init__(self, image, pre_process=False):
|
|
18
|
+
self.image = image
|
|
19
|
+
if pre_process:
|
|
20
|
+
self.image = self._preprocess(image)
|
|
21
|
+
|
|
22
|
+
def _preprocess(self, image: np.ndarray) -> np.ndarray:
    """
    Preprocess the input image for better OCR results.

    Pipeline: grayscale conversion, histogram equalisation, 3x3 Gaussian
    blur, then Gaussian adaptive thresholding (block size 11, constant 2).

    Args:
        image (np.ndarray): The input image. A 2-D or single-channel array
            is used as grayscale directly, a 4-channel array is converted
            as BGRA, anything else is converted as BGR.

    Returns:
        np.ndarray: The single-channel thresholded image.
    """
    if image.ndim == 2:
        gray = image
    elif image.shape[2] == 1:
        # Drop the trailing channel axis; OpenCV expects a contiguous 2-D array.
        gray = np.ascontiguousarray(image[:, :, 0])
    elif image.shape[2] == 4:
        gray = cv2.cvtColor(image, cv2.COLOR_BGRA2GRAY)
    else:
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Contrast enhancement
    gray = cv2.equalizeHist(gray)

    # Noise reduction
    blurred = cv2.GaussianBlur(gray, (3, 3), 0)

    # Adaptive threshold
    return cv2.adaptiveThreshold(
        blurred, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
    )
|
|
46
|
+
|
|
47
|
+
def fallback_ssim(self, image1, image2, form_name, draw_frame=False):
    """
    Compare two images with SSIM when feature-based alignment is unavailable.

    ``image2`` is resized to the width/height of ``image1``, both are
    converted to grayscale, and the structural similarity is computed.

    Args:
        image1: Reference image (BGR, or already 2-D grayscale).
        image2: Image to compare against the reference.
        form_name: Label used in the title of the optional debug window.
        draw_frame: If True, show the SSIM difference map in a blocking
            OpenCV window.

    Returns:
        dict with the keys ``matches`` (always 0), ``homography`` (None),
        ``matched_image`` (None), ``aligned_image`` (the resized
        ``image2``) and ``ssim_score``.

    Raises:
        ImportError: If scikit-image could not be imported.
    """
    # `ssim` is None when the optional scikit-image import failed at module load.
    if ssim is None:
        raise ImportError(
            "scikit-image is required for the SSIM fallback "
            "(install it with: pip install scikit-image)"
        )

    def to_gray(img):
        # Already single-channel images must not go through BGR2GRAY.
        return img if img.ndim == 2 else cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    height, width = image1.shape[:2]
    image2_resized = cv2.resize(image2, (width, height))

    score, diff = ssim(to_gray(image1), to_gray(image2_resized), full=True)
    diff = (diff * 255).astype("uint8")

    if draw_frame:
        cv2.imshow(f"{form_name} - SSIM Diff (score={score:.3f})", diff)
        cv2.waitKey(0)
        cv2.destroyAllWindows()

    return {
        "matches": 0,
        "homography": None,
        # Present (as None) so the fallback result has the same keys as a
        # successful feature-based comparison.
        "matched_image": None,
        "aligned_image": image2_resized,
        "ssim_score": score,
    }
|
|
68
|
+
|
|
69
|
+
def compare_matches_knn_matcher(
    self,
    image2,
    form_name,
    no_of_feature=500,
    matched_amount=50,
    percentage_of_matches=20,
    draw_matches=False,
    draw_aligned=False,
):
    """
    Align ``image2`` to ``self.image`` using KNN matching and a ratio test.

    Keypoints and descriptors are obtained from ``detect_keypoints`` for
    both images, matched with a Hamming-distance brute-force matcher
    (k=2), filtered with a 0.75 ratio test, and the best matches are used
    to estimate a RANSAC homography that warps ``image2`` into the frame
    of ``self.image``. Whenever this pipeline cannot proceed, the result
    of ``fallback_ssim`` is returned instead.

    Args:
        image2: Image to align against ``self.image``.
        form_name: Label used in window titles.
        no_of_feature: Passed to ``detect_keypoints`` as ``features``.
        matched_amount: Maximum number of matches drawn in the match image.
        percentage_of_matches: Percentage of the best matches kept for the
            homography (at least 4 are always kept).
        draw_matches: If True, show the match visualisation (blocking).
        draw_aligned: If True, show the warped image (blocking).

    Returns:
        dict with ``matches`` (number of matches used), ``homography``,
        ``matched_image`` and ``aligned_image``; or the ``fallback_ssim``
        result when descriptors, matches or the homography are missing.
    """
    # Detect keypoints
    other_detector = ImageDetector(image2)

    keypoints1, descriptors1, _ = self.detect_keypoints(
        features=no_of_feature, draw_keypoints=False
    )
    keypoints2, descriptors2, _ = other_detector.detect_keypoints(
        features=no_of_feature, draw_keypoints=False
    )

    if descriptors1 is None or descriptors2 is None:
        print("Feature detection failed → using SSIM fallback")
        return self.fallback_ssim(self.image, image2, form_name)

    # Use KNN matcher instead of crossCheck
    bf = cv2.BFMatcher(cv2.NORM_HAMMING)
    matches = bf.knnMatch(descriptors1, descriptors2, k=2)

    # Ratio test. knnMatch can return fewer than two neighbours for a
    # query descriptor (e.g. when image2 has a single descriptor), so
    # incomplete pairs are skipped instead of being unpacked blindly.
    good_matches = [
        pair[0]
        for pair in matches
        if len(pair) == 2 and pair[0].distance < 0.75 * pair[1].distance
    ]

    # A homography needs at least 4 point correspondences.
    if len(good_matches) < 4:
        print("Not enough matches → using fallback")
        return self.fallback_ssim(self.image, image2, form_name)

    # Sort matches, best (smallest distance) first
    good_matches = sorted(good_matches, key=lambda match: match.distance)

    # Take top percentage
    keep_n = int(len(good_matches) * (percentage_of_matches / 100))
    good_matches = good_matches[: max(keep_n, 4)]  # ensure at least 4

    # Draw matches
    matchedImage = cv2.drawMatches(
        self.image,
        keypoints1,
        image2,
        keypoints2,
        good_matches[:matched_amount],
        None,
        flags=cv2.DrawMatchesFlags_NOT_DRAW_SINGLE_POINTS,
    )

    # Compute homography mapping image2 coordinates onto self.image
    sourcePoints = np.float32(
        [keypoints1[m.queryIdx].pt for m in good_matches]
    ).reshape(-1, 1, 2)
    destinationPoints = np.float32(
        [keypoints2[m.trainIdx].pt for m in good_matches]
    ).reshape(-1, 1, 2)

    M, _ = cv2.findHomography(destinationPoints, sourcePoints, cv2.RANSAC, 5.0)

    if M is None:
        print("Homography could not be computed so it will be using fallback")
        return self.fallback_ssim(self.image, image2, form_name)

    h, w = self.image.shape[:2]
    imageTransformed = cv2.warpPerspective(image2, M, (w, h))

    # Previews are only built when they are shown; the size is clamped so
    # very small images never request a zero-sized resize.
    preview_size = (max(1, w // 3), max(1, h // 3))

    if draw_matches:
        matchedImage_small = cv2.resize(matchedImage, preview_size)
        cv2.imshow(f"{form_name} - Matches (Inliers)", matchedImage_small)
        cv2.waitKey(0)
        cv2.destroyAllWindows()

    if draw_aligned:
        imageTransformed_small = cv2.resize(imageTransformed, preview_size)
        cv2.imshow(f"{form_name} - Aligned", imageTransformed_small)
        cv2.waitKey(0)
        cv2.destroyAllWindows()

    return {
        "matches": len(good_matches),
        "homography": M,
        "matched_image": matchedImage,
        "aligned_image": imageTransformed,
    }
|
|
167
|
+
|
|
168
|
+
def compare_matches_bf_matcher(
    self,
    image2,
    form_name,
    no_of_feature=500,
    matched_amount=50,
    percentage_of_matches=20,
    draw_matches=False,
    draw_aligned=False,
):
    """
    Align ``image2`` to ``self.image`` using plain brute-force matching.

    Same pipeline as ``compare_matches_knn_matcher`` but each descriptor
    of ``self.image`` is paired with its single best match (``bf.match``)
    and no ratio test is applied. The best matches are used to estimate a
    RANSAC homography that warps ``image2`` into the frame of
    ``self.image``. Whenever this pipeline cannot proceed, the result of
    ``fallback_ssim`` is returned instead.

    Args:
        image2: Image to align against ``self.image``.
        form_name: Label used in window titles.
        no_of_feature: Passed to ``detect_keypoints`` as ``features``.
        matched_amount: Maximum number of matches drawn in the match image.
        percentage_of_matches: Percentage of the best matches kept for the
            homography (at least 4 are kept when available).
        draw_matches: If True, show the match visualisation (blocking).
        draw_aligned: If True, show the warped image (blocking).

    Returns:
        dict with ``matches`` (number of matches used), ``homography``,
        ``matched_image`` and ``aligned_image``; or the ``fallback_ssim``
        result when descriptors, matches or the homography are missing.
    """
    # Detect keypoints
    other_detector = ImageDetector(image2)

    keypoints1, descriptors1, _ = self.detect_keypoints(
        features=no_of_feature, draw_keypoints=False
    )
    keypoints2, descriptors2, _ = other_detector.detect_keypoints(
        features=no_of_feature, draw_keypoints=False
    )

    if descriptors1 is None or descriptors2 is None:
        print("Feature detection failed → using SSIM fallback")
        return self.fallback_ssim(self.image, image2, form_name)

    # Brute-force matcher: one best match per query descriptor
    bf = cv2.BFMatcher(cv2.NORM_HAMMING)
    matches = bf.match(descriptors1, descriptors2)

    # A homography needs at least 4 point correspondences; without this
    # guard findHomography raises on too few points.
    if len(matches) < 4:
        print("Not enough matches → using fallback")
        return self.fallback_ssim(self.image, image2, form_name)

    # Sort matches, best (smallest distance) first
    good_matches = sorted(matches, key=lambda match: match.distance)

    # Take top percentage
    keep_n = int(len(good_matches) * (percentage_of_matches / 100))
    good_matches = good_matches[: max(keep_n, 4)]  # ensure at least 4

    # Draw matches
    matchedImage = cv2.drawMatches(
        self.image,
        keypoints1,
        image2,
        keypoints2,
        good_matches[:matched_amount],
        None,
        flags=cv2.DrawMatchesFlags_NOT_DRAW_SINGLE_POINTS,
    )

    # Compute homography mapping image2 coordinates onto self.image
    sourcePoints = np.float32(
        [keypoints1[m.queryIdx].pt for m in good_matches]
    ).reshape(-1, 1, 2)
    destinationPoints = np.float32(
        [keypoints2[m.trainIdx].pt for m in good_matches]
    ).reshape(-1, 1, 2)

    M, _ = cv2.findHomography(destinationPoints, sourcePoints, cv2.RANSAC, 5.0)

    if M is None:
        print("Homography could not be computed so it will be using fallback")
        return self.fallback_ssim(self.image, image2, form_name)

    h, w = self.image.shape[:2]
    imageTransformed = cv2.warpPerspective(image2, M, (w, h))

    # Previews are only built when they are shown; the size is clamped so
    # very small images never request a zero-sized resize.
    preview_size = (max(1, w // 3), max(1, h // 3))

    # Optionally show the matched keypoints between the form and the template.
    if draw_matches:
        matchedImage_small = cv2.resize(matchedImage, preview_size)
        cv2.imshow(f"{form_name} - Matches (Inliers)", matchedImage_small)
        cv2.waitKey(0)
        cv2.destroyAllWindows()

    # Optionally show image2 warped into the frame of self.image.
    if draw_aligned:
        imageTransformed_small = cv2.resize(imageTransformed, preview_size)
        cv2.imshow(f"{form_name} - Aligned", imageTransformed_small)
        cv2.waitKey(0)
        cv2.destroyAllWindows()

    return {
        "matches": len(good_matches),
        "homography": M,
        "matched_image": matchedImage,
        "aligned_image": imageTransformed,
    }
|
|
258
|
+
|
|
259
|
+
def compute_tolerance_percentile(self, pixels):
    """
    Derive per-channel tolerances from the spread of a pixel sample.

    Args:
        pixels: 2-D array with one row per pixel; columns 0, 1 and 2 are
            read as the hue, saturation and value channels.

    Returns:
        Tuple ``(h_tol, s_tol, v_tol)``. Each entry is the difference
        between the 90th and the 10th percentile of that channel,
        truncated to ``int``.
    """

    def spread(column):
        # Inter-percentile range (10th to 90th) of one channel.
        values = pixels[:, column]
        return int(np.percentile(values, 90) - np.percentile(values, 10))

    return spread(0), spread(1), spread(2)
|
|
278
|
+
|
|
279
|
+
def detect_highlighted_text(
    self,
    hsv_colors=None,  # seed HSV values
    h_tol=8,
    s_tol=80,
    v_tol=80,
    show_mask=False,
    show_combined_mask=False,
    show_image_with_mask=False,
):
    """
    Detect highlighted regions by building HSV masks around seed colors.

    ``self.image`` is blurred (5x5 Gaussian), converted from BGR to HSV,
    and one mask per seed color is built with ``cv2.inRange``. Saturation
    and value bounds are clamped to [0, 255]. Hue is cyclic in OpenCV's
    8-bit HSV (0..179), so a hue range that crosses 0 or 179 wraps around
    to the other end instead of being cut off.

    Args:
        hsv_colors: List of seed ``(h, s, v)`` tuples to detect. Defaults
            to ``[(27, 167, 251)]``.
        h_tol, s_tol, v_tol: Tolerances applied around each seed's hue,
            saturation and value.
        show_mask: Show the individual mask of every seed color.
        show_combined_mask: Show the combined mask of all seed colors.
        show_image_with_mask: Show the original image restricted to the
            combined mask.
            If any of the three flags is set, the call blocks on
            ``cv2.waitKey(0)`` until a key is pressed.

    Returns:
        image_with_mask: ``self.image`` with everything outside the
            combined mask blacked out.
        combined_mask: Binary mask combining all seed colors.
        masks: List with one binary mask per seed color.

    Usage:
        detector = ImageDetector(cv2.imread("doc.jpg"))

        # Common highlighter HSV seeds (you can refine using click sampling)
        highlight_colors = [
            (30, 200, 250),   # yellow
            (60, 200, 250),   # green
            (150, 200, 250),  # pink
            (15, 200, 250),   # orange
        ]

        image_with_mask, mask, masks = detector.detect_highlighted_text(
            highlight_colors
        )
    """

    def hue_intervals(center, tol):
        # Inclusive (low, high) hue intervals covering center +/- tol.
        low, high = center - tol, center + tol
        if not 0 <= center <= 179:
            # Seed outside OpenCV's hue range: keep the plain clamped range.
            return [(max(0, low), min(179, high))]
        if tol >= 90:
            return [(0, 179)]
        if low < 0:
            return [(0, high), (low + 180, 179)]
        if high > 179:
            return [(low, 179), (0, high - 180)]
        return [(low, high)]

    if hsv_colors is None:
        hsv_colors = [(27, 167, 251)]

    img_blur = cv2.GaussianBlur(self.image, (5, 5), 0)
    hsv = cv2.cvtColor(img_blur, cv2.COLOR_BGR2HSV)
    combined_mask = np.zeros(hsv.shape[:2], dtype=np.uint8)
    masks = []

    for i, (h, s, v) in enumerate(hsv_colors):
        s_low, s_high = max(0, s - s_tol), min(255, s + s_tol)
        v_low, v_high = max(0, v - v_tol), min(255, v + v_tol)

        # One mask per seed; a wrapped hue range contributes two intervals.
        mask = np.zeros(hsv.shape[:2], dtype=np.uint8)
        for h_low, h_high in hue_intervals(h, h_tol):
            lower = np.array([h_low, s_low, v_low])
            upper = np.array([h_high, s_high, v_high])
            mask = cv2.bitwise_or(mask, cv2.inRange(hsv, lower, upper))
        masks.append(mask)

        # Combine all masks
        combined_mask = cv2.bitwise_or(combined_mask, mask)

        if show_mask:
            cv2.imshow(f"Mask {i}", mask)

    img_with_mask = cv2.bitwise_and(self.image, self.image, mask=combined_mask)
    if show_combined_mask:
        cv2.imshow("Combined Mask", combined_mask)
    if show_image_with_mask:
        cv2.imshow("Image with Mask", img_with_mask)

    if show_mask or show_combined_mask or show_image_with_mask:
        cv2.waitKey(0)
        cv2.destroyAllWindows()

    return img_with_mask, combined_mask, masks
|
|
364
|
+
|
|
365
|
+
def get_dominant_hsv_colors(self, k=10):
    """
    Estimate candidate highlight colors by clustering the image in HSV space.

    ``self.image`` is blurred and converted from BGR to HSV, pixels with
    saturation <= 25 or value <= 60 are discarded, and the remaining
    pixels are clustered with ``cv2.kmeans``. Cluster centers are dropped
    when the cluster has fewer than 30 pixels, is dark (v < 60) or is
    near-white (v > 200 and s < 40). Centers whose hue is within 4 of an
    already accepted center are dropped as duplicates.

    K-means is started from random centers, so results can vary between
    calls.

    Args:
        k: Requested number of clusters; reduced to the number of usable
            pixels when there are fewer of them.

    Returns:
        List of ``(h, s, v)`` integer tuples: the detected colors followed
        by a fixed list of predefined highlighter colors. An empty list is
        returned when no pixel passes the saturation/value filter.
    """
    img_blur = cv2.GaussianBlur(self.image, (5, 5), 0)
    hsv = cv2.cvtColor(img_blur, cv2.COLOR_BGR2HSV)

    # Remove white/dark pixels before clustering
    saturation = hsv[:, :, 1]
    value = hsv[:, :, 2]
    keep = (saturation > 25) & (value > 60)  # lower threshold → keeps light blue

    pixels = hsv[keep]

    if len(pixels) == 0:
        return []

    pixels = pixels.reshape(-1, 3).astype(np.float32)

    k = min(k, len(pixels))

    _, labels, centers = cv2.kmeans(
        pixels,
        k,
        None,
        (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 100, 0.2),
        10,
        cv2.KMEANS_RANDOM_CENTERS,
    )

    # minlength keeps `counts` aligned with `centers` even if the
    # highest-numbered clusters received no pixels.
    counts = np.bincount(labels.flatten(), minlength=len(centers))

    highlight_colors = []

    for i, (c_h, c_s, c_v) in enumerate(centers):
        c_h, c_s, c_v = int(c_h), int(c_s), int(c_v)

        # remove tiny clusters
        if counts[i] < 30:
            continue

        # remove dark
        if c_v < 60:
            continue

        # remove near-white
        if c_v > 200 and c_s < 40:
            continue

        highlight_colors.append((c_h, c_s, c_v))

    # Soft deduplication: keep the first color seen for each hue neighbourhood
    filtered = []
    for color in highlight_colors:
        if not any(abs(color[0] - kept[0]) < 4 for kept in filtered):
            filtered.append(color)

    predefined = [
        (95, 120, 255),  # light blue
        (30, 200, 250),  # yellow
        (60, 200, 250),  # green
        (100, 200, 250),  # blue
        (150, 200, 250),  # pink
        (20, 200, 250),  # orange
    ]
    return filtered + predefined
|
|
428
|
+
|
|
429
|
+
# def get_dominant_hsv_colors(self, k=4):
|
|
430
|
+
# """
|
|
431
|
+
# auto-detect highlight colors in the image by clustering pixel colors in HSV space using K-means.
|
|
432
|
+
# Get dominant HSV colors from the image using K-means clustering.
|
|
433
|
+
|
|
434
|
+
# Args:
|
|
435
|
+
# k: Number of dominant colors to detect (default is 4)
|
|
436
|
+
|
|
437
|
+
# Returns:
|
|
438
|
+
# List of dominant HSV color tuples (h, s, v) detected in the image.
|
|
439
|
+
# Usage:
|
|
440
|
+
|
|
441
|
+
# """
|
|
442
|
+
# mg_blur = cv2.GaussianBlur(self.image, (5,5), 0)
|
|
443
|
+
# hsv = cv2.cvtColor(mg_blur, cv2.COLOR_BGR2HSV)
|
|
444
|
+
# pixels = hsv.reshape(-1, 3).astype(np.float32)
|
|
445
|
+
|
|
446
|
+
# _, labels, centers = cv2.kmeans(
|
|
447
|
+
# pixels, k, None,
|
|
448
|
+
# (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 100, 0.2),
|
|
449
|
+
# 10,
|
|
450
|
+
# cv2.KMEANS_RANDOM_CENTERS
|
|
451
|
+
# )
|
|
452
|
+
# # Filter likely highlight colors
|
|
453
|
+
# highlight_colors = []
|
|
454
|
+
# for (h, s, v) in centers:
|
|
455
|
+
# if s > 80 and v > 150: # high saturation + brightness
|
|
456
|
+
# highlight_colors.append((int(h), int(s), int(v)))
|
|
457
|
+
|
|
458
|
+
# return highlight_colors
|
|
459
|
+
|
|
460
|
+
def detect_single_highlighted_text(self, image, hsv_colors=None, upper_hsv=None):
    """Detect highlighted areas inside a single HSV range.

    Unlike ``detect_highlighted_text`` no tolerance is applied: the seed
    is used directly as the lower bound of the range.

    Args:
        image: Input image in BGR format (as read by OpenCV).
        hsv_colors: One ``[h, s, v]`` triple used as the LOWER bound of
            the range (default ``[27, 167, 251]``).
        upper_hsv: ``(h, s, v)`` triple used as the UPPER bound of the
            range (default ``(140, 255, 255)``).

    Returns:
        image_with_mask: ``image`` with everything outside the mask
            blacked out.
        mask: Binary mask of the pixels inside the HSV range.
    """
    if hsv_colors is None:
        hsv_colors = [27, 167, 251]
    if upper_hsv is None:
        upper_hsv = (140, 255, 255)

    hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    lower = np.array([hsv_colors[0], hsv_colors[1], hsv_colors[2]])
    upper = np.array([upper_hsv[0], upper_hsv[1], upper_hsv[2]])

    mask = cv2.inRange(hsv, lower, upper)
    img_with_mask = cv2.bitwise_and(image, image, mask=mask)
    return img_with_mask, mask
|
|
478
|
+
|
|
479
|
+
def refine_mask(self, mask, merge=True):
    """
    Clean up a binary mask with morphological operations.

    The mask is opened once (3x3 kernel) to drop small specks, closed
    twice (3x3 kernel) to fill small gaps and, when ``merge`` is set,
    dilated once so that neighbouring regions grow into each other.

    Args:
        mask: The binary mask to refine.
        merge: Whether to apply the final dilation (default True).
            Use False to keep each highlight as a separate region and
            True to group nearby text into one region.

    Returns:
        The refined binary mask.
    """
    small_kernel = np.ones((3, 3), np.uint8)

    # Opening removes noise, closing then fills gaps.
    opened = cv2.morphologyEx(mask, cv2.MORPH_OPEN, small_kernel, iterations=1)
    closed = cv2.morphologyEx(opened, cv2.MORPH_CLOSE, small_kernel, iterations=2)

    if not merge:
        return closed

    # This can merge several highlights into ONE contour.
    # NOTE(review): np.ones((15, 5)) is 15 rows by 5 columns, i.e. taller
    # than wide — confirm this orientation is intended for merging words
    # on the same line.
    return cv2.dilate(closed, np.ones((15, 5), np.uint8), iterations=1)
|
|
508
|
+
|
|
509
|
+
def get_cannty_edges(self, low_threshold=50, high_threshold=150):
    """
    Return a Canny edge map of ``self.image``.

    A copy of the image is converted from BGR to grayscale, blurred with
    a 5x5 Gaussian (sigma 1) and passed through ``cv2.Canny``. The edge
    map is then dilated and eroded once with a 3x3 kernel.

    Args:
        low_threshold: Lower hysteresis threshold given to ``cv2.Canny``.
        high_threshold: Upper hysteresis threshold given to ``cv2.Canny``.

    Returns:
        The cleaned-up edge image.
    """
    grayscale = cv2.cvtColor(self.image.copy(), cv2.COLOR_BGR2GRAY)

    # Edge detection
    smoothed = cv2.GaussianBlur(grayscale, (5, 5), 1)
    edge_map = cv2.Canny(smoothed, low_threshold, high_threshold)

    # Morphological cleanup: dilate, then erode with the same kernel
    square = np.ones((3, 3), np.uint8)
    thickened = cv2.dilate(edge_map, square, iterations=1)
    return cv2.erode(thickened, square, iterations=1)
|
|
522
|
+
|
|
523
|
+
def find_contours(
    self,
    mask,
    min_area=200,
    get_canny_edges=False,
    canny_threshold: list[int] | None = None,
    debug=False,
    dilate_merge=True,
    filter_shapes: (
        list | None
    ) = None,  # to detect for triangle or rectangle/square shapes
    sort_contours_smallest_to_largest=False,
    sort_countours_largest_to_smallest=False,
    sort_bbox_smaller_to_largest=False,
    sort_bbox_top_to_bottom=False,
    sort_bbox_left_to_right=False,
    sort_bbox_area_largest_to_smallest=False,
    sort_bbox_grid_wise=False,
    bbox_grid_tolerence=10,
    retrieval_type=cv2.RETR_EXTERNAL,
    approximation_method=cv2.CHAIN_APPROX_SIMPLE,
    draw_contours=False,
    contour_box_color=(0, 255, 0),
    contour_box_thickness=2,
    contour_text_color=(255, 255, 0),
    contour_text_thickness=2,
):
    """
    Find contours in a binary mask and filter them by size and shape.

    A contour is accepted when its area is at least ``min_area``, its
    bounding box is at least 20 px wide and 10 px high, and (if
    ``filter_shapes`` is non-empty) its simplified polygon has an allowed
    number of vertices. The polygon is obtained with ``cv2.approxPolyDP``
    (Douglas-Peucker) using 2% of the contour perimeter as tolerance.

    Args:
        mask: Binary image in which contours are searched. Ignored when
            ``get_canny_edges`` is True.
        min_area: Minimum contour area (default 200).
        get_canny_edges: If True, contours are searched in the Canny edge
            map of ``self.image`` instead of in the refined ``mask``.
        canny_threshold: ``[low, high]`` thresholds for the Canny path
            (default ``[100, 100]``).
        debug: If True, print diagnostics about found/accepted contours.
        dilate_merge: Passed to ``refine_mask`` as ``merge`` (default True).
        filter_shapes: Allowed vertex counts, e.g. ``[3, 4]`` for
            triangles and quadrilaterals. ``None`` or an empty list
            disables shape filtering (default None).
        sort_contours_smallest_to_largest: Sort by contour area, ascending.
        sort_countours_largest_to_smallest: Sort by contour area, descending.
        sort_bbox_smaller_to_largest: Sort by bounding-box area (w * h),
            ascending.
        sort_bbox_top_to_bottom: Sort by bounding-box y, then x.
        sort_bbox_left_to_right: Sort by bounding-box x.
        sort_bbox_area_largest_to_smallest: Sort by bounding-box area
            (w * h), descending.
        sort_bbox_grid_wise: Sort row by row: boxes are grouped into rows
            by ``y // bbox_grid_tolerence`` and ordered by x inside a row.
        bbox_grid_tolerence: Row height in pixels used by the grid sort
            (default 10). Values below 1 are treated as 1.
        retrieval_type: Contour retrieval mode (default cv2.RETR_EXTERNAL).
        approximation_method: Contour approximation method
            (default cv2.CHAIN_APPROX_SIMPLE).
        draw_contours: If True, draw every accepted contour (outline,
            bounding box, center dot and vertex count) on the returned
            image copy.
        contour_box_color, contour_box_thickness: Color and thickness of
            the drawn box and outline.
        contour_text_color, contour_text_thickness: Color and thickness of
            the vertex-count label.

        The sort flags are applied one after another in the order
        bbox-area-descending, contour-area-ascending,
        contour-area-descending, bbox-area-ascending, top-to-bottom,
        left-to-right, grid-wise, so the last enabled flag determines
        the primary order.

    Returns:
        Tuple ``(contours_results, countour_img)``. ``contours_results``
        is a list of dicts with the keys ``contour``, ``area``,
        ``bounding_box`` (x, y, w, h), ``approx_vertices`` and ``center``.
        ``countour_img`` is a copy of ``self.image``, annotated only when
        ``draw_contours`` is True.
    """
    if canny_threshold is None:
        canny_threshold = [100, 100]
    if filter_shapes is None:
        filter_shapes = []

    if get_canny_edges:
        # The Canny path works on self.image; `mask` is not used here.
        cleaned = self.get_cannty_edges(
            low_threshold=canny_threshold[0], high_threshold=canny_threshold[1]
        )
    else:
        # 1. Clean noise (very important)
        cleaned = self.refine_mask(mask, merge=dilate_merge)

    # 2. Find contours
    contours, _ = cv2.findContours(cleaned, retrieval_type, approximation_method)

    if debug:
        print(f"Found {len(contours)} contours before filtering")

    contours_results = []
    countour_img = self.image.copy()

    # 3. Filter contours
    for cnt in contours:
        area = cv2.contourArea(cnt)
        if area < min_area:
            continue

        # Perimeter of the closed contour, then Douglas-Peucker
        # simplification to a polygon with few vertices.
        peri = cv2.arcLength(cnt, True)
        approx = cv2.approxPolyDP(cnt, 0.02 * peri, True)
        x, y, w, h = cv2.boundingRect(cnt)

        if debug:
            print("Approx vertices:", len(approx))
            print(f"Contour area: {area}, Bounding box: (x={x}, y={y}, w={w}, h={h})")

        if w < 20 or h < 10:
            continue

        # Check if the vertex count matches the filter_shapes criteria (if provided)
        if len(filter_shapes) != 0 and len(approx) not in filter_shapes:
            continue

        center_x, center_y = x + (w // 2), y + (h // 2)

        contours_results.append(
            {
                "contour": cnt,
                "area": area,
                "bounding_box": (x, y, w, h),
                "approx_vertices": approx,
                "center": (center_x, center_y),
            }
        )

        if debug:
            print(
                f"Accepted contour with area {area} and bounding box (x={x}, y={y}, w={w}, h={h})"
            )

        # Annotate only contours that passed every filter.
        if draw_contours:
            cv2.putText(
                countour_img,
                str(len(approx)),
                (x, y - 10),
                cv2.FONT_HERSHEY_SIMPLEX,
                1,
                contour_text_color,
                contour_text_thickness,
            )
            cv2.rectangle(
                countour_img,
                (x, y),
                (x + w, y + h),
                contour_box_color,
                contour_box_thickness,
            )
            cv2.circle(
                countour_img, (center_x, center_y), 5, contour_box_color, cv2.FILLED
            )
            cv2.drawContours(
                countour_img, [cnt], -1, contour_box_color, contour_box_thickness
            )

    if debug:
        print(f"Total contours (raw): {len(contours)}")
        print(f"Filtered contours: {len(contours_results)}")
        for i, c in enumerate(contours_results):
            x, y, w, h = c["bounding_box"]
            print(
                f"[{i}] Area={c['area']:.2f}, Box=({x},{y},{w},{h}), Center={c['center']}"
            )

    def bbox_area(c):
        # Area of the bounding rectangle (w * h), not of the contour itself.
        return c["bounding_box"][2] * c["bounding_box"][3]

    def contour_area(c):
        return c["area"]

    if sort_bbox_area_largest_to_smallest:
        contours_results = sorted(contours_results, key=bbox_area, reverse=True)

    if sort_contours_smallest_to_largest:
        contours_results = sorted(contours_results, key=contour_area)

    if sort_countours_largest_to_smallest:
        contours_results = sorted(contours_results, key=contour_area, reverse=True)

    if sort_bbox_smaller_to_largest:
        contours_results = sorted(contours_results, key=bbox_area)

    if sort_bbox_top_to_bottom:
        contours_results = sorted(
            contours_results,
            key=lambda c: (c["bounding_box"][1], c["bounding_box"][0]),  # y, then x
        )

    if sort_bbox_left_to_right:
        contours_results = sorted(
            contours_results,
            key=lambda c: c["bounding_box"][0],  # x
        )

    if sort_bbox_grid_wise:
        # Guard against a zero/negative tolerance (division by zero).
        row_height = bbox_grid_tolerence if bbox_grid_tolerence >= 1 else 1
        contours_results = sorted(
            contours_results,
            key=lambda c: (
                c["bounding_box"][1] // row_height,  # row index
                c["bounding_box"][0],  # then sort in row
            ),
        )

    return contours_results, countour_img
|
|
740
|
+
|
|
741
|
+
def detect_contours(self, min_area=500):
    """Return the external contours of the edge image, largest area first.

    The edge image comes from ``self.get_cannty_edges()``.

    Args:
        min_area: Only contours whose ``cv2.contourArea`` is strictly
            greater than this value are kept.

    Returns:
        list: Kept contours sorted by descending area.
    """
    edge_map = self.get_cannty_edges()
    found, _hierarchy = cv2.findContours(
        edge_map, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )
    kept = []
    for contour in found:
        if cv2.contourArea(contour) > min_area:
            kept.append(contour)
    kept.sort(key=cv2.contourArea, reverse=True)
    return kept
|
|
749
|
+
|
|
750
|
+
def export_measurements(data, path="measurements.json"):
|
|
751
|
+
with open(path, "w") as f:
|
|
752
|
+
json.dump(data, f, indent=2)
|
|
753
|
+
|
|
754
|
+
def draw_grid(self, pixels_per_cm, color=(200, 200, 200)):
    """Draw a 1-pixel grid directly onto ``self.image``.

    Vertical and horizontal lines are spaced ``int(pixels_per_cm)`` pixels
    apart, starting at the top-left corner.

    Args:
        pixels_per_cm: Grid spacing in pixels; truncated to an integer.
        color: Line colour passed to ``cv2.line``.

    Returns:
        ``self.image``, modified in place.

    Raises:
        ValueError: If the truncated spacing is 0 (raised by ``range``).
    """
    height, width = self.image.shape[:2]
    spacing = int(pixels_per_cm)
    vertical = [((x, 0), (x, height)) for x in range(0, width, spacing)]
    horizontal = [((0, y), (width, y)) for y in range(0, height, spacing)]
    for start, end in vertical + horizontal:
        cv2.line(self.image, start, end, color, 1)
    return self.image
|
|
764
|
+
|
|
765
|
+
def detect_reference(self, contours):
    """Pick the largest contour whose shape matches a known reference.

    A contour is considered only if its area is at least 3000 and its
    minimum-area rectangle has non-zero sides. The long/short side ratio
    then selects the label:

    * strictly between 1.3 and 1.5 -> ``"A4"`` with size ``21.0``
    * strictly between 1.5 and 1.7 -> ``"CARD"`` with size ``8.56``

    Args:
        contours: Iterable of OpenCV contours.

    Returns:
        tuple: ``(contour, real_size, label)`` for the matching contour with
        the largest area (the earliest one on ties), or
        ``(None, None, None)`` when nothing matches.
    """
    best = None  # (area, contour, real_size, label)
    for contour in contours:
        area = cv2.contourArea(contour)
        if area < 3000:
            continue

        side_a, side_b = cv2.minAreaRect(contour)[1]
        if side_a == 0 or side_b == 0:
            continue

        ratio = max(side_a, side_b) / min(side_a, side_b)
        if 1.3 < ratio < 1.5:
            label, real_size = "A4", 21.0
        elif 1.5 < ratio < 1.7:
            label, real_size = "CARD", 8.56
        else:
            continue

        # Strict comparison keeps the first contour among equal areas.
        if best is None or area > best[0]:
            best = (area, contour, real_size, label)

    if best is None:
        return None, None, None
    return best[1], best[2], best[3]
|
|
796
|
+
|
|
797
|
+
# ─────────────────────────── NEW METHODS ───────────────────────────
|
|
798
|
+
|
|
799
|
+
def get_image_info(self) -> dict:
    """Describe ``self.image``.

    Returns:
        dict: ``height`` and ``width`` from the first two axes, ``channels``
        (size of the third axis, or 1 when the array is not 3-D), ``dtype``
        as a string, and ``size_bytes`` (the array's ``nbytes``).
    """
    img = self.image
    rows, cols = img.shape[:2]
    depth = 1 if img.ndim != 3 else img.shape[2]
    info = dict(height=rows, width=cols, channels=depth)
    info["dtype"] = str(img.dtype)
    info["size_bytes"] = img.nbytes
    return info
|
|
814
|
+
|
|
815
|
+
def apply_clahe(self, clip_limit=2.0, tile_grid_size=(8, 8)) -> np.ndarray:
    """Equalize contrast locally with CLAHE.

    ``self.image`` is converted from BGR to grayscale and then passed
    through a Contrast-Limited Adaptive Histogram Equalization operator.
    ``self.image`` itself is not modified.

    Args:
        clip_limit: Contrast-limiting threshold given to ``cv2.createCLAHE``.
        tile_grid_size: Tile grid size given to ``cv2.createCLAHE``.

    Returns:
        Grayscale array with enhanced contrast.
    """
    luminance = cv2.cvtColor(self.image, cv2.COLOR_BGR2GRAY)
    equalizer = cv2.createCLAHE(
        clipLimit=clip_limit,
        tileGridSize=tile_grid_size,
    )
    return equalizer.apply(luminance)
|
|
828
|
+
|
|
829
|
+
def detect_blur(self, threshold=100.0) -> bool:
    """Tell whether the image is blurry.

    The image is converted from BGR to grayscale and the variance of its
    Laplacian is compared with ``threshold``.

    Args:
        threshold: The image counts as blurry when the variance is strictly
            below this value.

    Returns:
        bool: ``True`` when the image is blurry.
    """
    gray = cv2.cvtColor(self.image, cv2.COLOR_BGR2GRAY)
    variance = cv2.Laplacian(gray, cv2.CV_64F).var()
    # The comparison yields numpy.bool_; convert so the result honours the
    # annotated return type (identity checks, JSON serialisation).
    return bool(variance < threshold)
|
|
841
|
+
|
|
842
|
+
def get_blur_score(self) -> float:
    """Compute a sharpness score for ``self.image``.

    The score is the variance of the Laplacian of the grayscale image;
    larger values mean a sharper image.

    Returns:
        float: Laplacian variance.
    """
    laplacian = cv2.Laplacian(
        cv2.cvtColor(self.image, cv2.COLOR_BGR2GRAY),
        cv2.CV_64F,
    )
    return float(laplacian.var())
|
|
851
|
+
|
|
852
|
+
def get_brightness(self) -> float:
    """Measure the average brightness of ``self.image``.

    The image is converted from BGR to grayscale and the mean of all
    grayscale pixels is returned.

    Returns:
        float: Mean grayscale value.
    """
    luminance = cv2.cvtColor(self.image, cv2.COLOR_BGR2GRAY)
    average = np.mean(luminance)
    return float(average)
|
|
861
|
+
|
|
862
|
+
def crop(self, x: int, y: int, w: int, h: int) -> np.ndarray:
    """Copy a rectangle out of ``self.image``.

    The requested rectangle is clamped to the image bounds, so the result
    may be smaller than ``w`` x ``h`` (or empty).

    Args:
        x, y: Top-left corner of the rectangle.
        w, h: Width and height of the rectangle.

    Returns:
        A new array holding the cropped pixels.
    """
    img_h, img_w = self.image.shape[:2]
    rows = slice(max(0, y), min(img_h, y + h))
    cols = slice(max(0, x), min(img_w, x + w))
    return self.image[rows, cols].copy()
|
|
875
|
+
|
|
876
|
+
def flip(self, direction="horizontal") -> np.ndarray:
    """Mirror ``self.image`` and store the result back on the instance.

    Args:
        direction: ``'horizontal'``, ``'vertical'`` or ``'both'``. Any other
            value is treated as ``'horizontal'``.

    Returns:
        The flipped image, which is also the new ``self.image``.
    """
    codes = {"horizontal": 1, "vertical": 0, "both": -1}
    code = codes[direction] if direction in codes else 1
    self.image = cv2.flip(self.image, code)
    return self.image
|
|
887
|
+
|
|
888
|
+
def adjust_brightness_contrast(self, alpha=1.0, beta=0) -> np.ndarray:
    """Rescale pixel values with ``cv2.convertScaleAbs`` and keep the result.

    ``self.image`` is replaced by the adjusted image.

    Args:
        alpha: Multiplier applied to every pixel (contrast).
        beta: Offset added to every pixel (brightness).

    Returns:
        The adjusted image, which is also the new ``self.image``.
    """
    source = self.image
    self.image = cv2.convertScaleAbs(source, alpha=alpha, beta=beta)
    return self.image
|
|
899
|
+
|
|
900
|
+
def denoise(self, strength=10) -> np.ndarray:
    """Denoise ``self.image`` with ``cv2.fastNlMeansDenoisingColored``.

    ``self.image`` is replaced by the denoised image.

    Args:
        strength: Filter strength, used for both the luminance and the
            colour components.

    Returns:
        The denoised image, which is also the new ``self.image``.
    """
    template_window = 7
    search_window = 21
    source = self.image
    self.image = cv2.fastNlMeansDenoisingColored(
        source, None, strength, strength, template_window, search_window
    )
    return self.image
|
|
912
|
+
|
|
913
|
+
def extract_text_regions(self, boxes):
    """Run OCR on each bounding box of ``self.image``.

    Every box is clamped to the image bounds, converted to grayscale,
    binarised with Otsu's threshold and passed to Tesseract with
    ``--psm 6``.

    Args:
        boxes: Iterable of ``(x, y, width, height)`` tuples in pixel
            coordinates.

    Returns:
        list[dict]: One entry per input box, in order, shaped like
        ``{"bbox": (x1, y1, x2, y2), "text": "..."}``. ``bbox`` echoes the
        requested (unclamped) corners; ``text`` is the stripped OCR output,
        or ``""`` when the box lies entirely outside the image.

    Raises:
        ImportError: If a region must be read but ``pytesseract`` is not
            installed.
    """
    img_h, img_w = self.image.shape[:2]
    results = []

    for x, y, w, h in boxes:
        bbox = (x, y, x + w, y + h)

        # Clamp like crop()/batch_crop(): unclamped negative coordinates
        # would wrap around in NumPy slicing and OCR the wrong pixels.
        x1, y1 = max(0, x), max(0, y)
        x2, y2 = min(img_w, x + w), min(img_h, y + h)
        if x2 <= x1 or y2 <= y1:
            # Nothing to read; keep results aligned with the input boxes.
            results.append({"bbox": bbox, "text": ""})
            continue

        if pytesseract is None:
            raise ImportError(
                "pytesseract is required for extract_text_regions(); "
                "install it with `pip install pytesseract`"
            )

        roi = self.image[y1:y2, x1:x2]
        gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
        _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

        text = pytesseract.image_to_string(thresh, config="--psm 6")
        results.append({"bbox": bbox, "text": text.strip()})

    return results
|
|
941
|
+
|
|
942
|
+
# ─────────────────────────── UTILITY METHODS ───────────────────────────
|
|
943
|
+
|
|
944
|
+
def resize_to_fit(self, max_width: int, max_height: int) -> np.ndarray:
    """Resize the image to fit inside ``max_width`` x ``max_height``.

    The aspect ratio is preserved and ``self.image`` is not modified.

    Args:
        max_width: Maximum output width in pixels; must be positive.
        max_height: Maximum output height in pixels; must be positive.

    Returns:
        Resized array produced by ``cv2.resize`` with ``INTER_AREA``.

    Raises:
        ValueError: If ``max_width`` or ``max_height`` is not positive.
    """
    if max_width <= 0 or max_height <= 0:
        raise ValueError(
            f"max_width and max_height must be positive, got "
            f"{max_width}x{max_height}"
        )
    h, w = self.image.shape[:2]
    scale = min(max_width / w, max_height / h)
    # Truncation can give 0 for very elongated images (e.g. 1000x1 into
    # 100x100), which cv2.resize rejects; keep at least one pixel per axis.
    new_w = max(1, int(w * scale))
    new_h = max(1, int(h * scale))
    return cv2.resize(self.image, (new_w, new_h), interpolation=cv2.INTER_AREA)
|
|
957
|
+
|
|
958
|
+
def pad_to_square(self, fill: int = 0) -> np.ndarray:
    """Return a square version of ``self.image`` padded with a constant.

    Only the shorter axis is padded. The padding is split between both
    sides, and the extra pixel goes to the bottom/right when the difference
    is odd. ``self.image`` is not modified.

    Args:
        fill: Constant value used for the border.

    Returns:
        Square array produced by ``cv2.copyMakeBorder``.
    """
    rows, cols = self.image.shape[:2]
    side = max(rows, cols)
    extra_rows, extra_cols = side - rows, side - cols
    top = extra_rows // 2
    bottom = extra_rows - top
    left = extra_cols // 2
    right = extra_cols - left
    return cv2.copyMakeBorder(
        self.image, top, bottom, left, right, cv2.BORDER_CONSTANT, value=fill
    )
|
|
982
|
+
|
|
983
|
+
def normalize(self, mean=(0, 0, 0), std=(1, 1, 1)) -> np.ndarray:
    """Scale pixels by 1/255, then standardise with ``mean`` and ``std``.

    ``self.image`` is not modified.

    Args:
        mean: Values subtracted after the 1/255 scaling (broadcast against
            the image).
        std: Values the centred image is divided by.

    Returns:
        float32 array ``(image / 255 - mean) / std``.
    """
    scaled = self.image.astype(np.float32) / 255.0
    centered = scaled - np.array(mean)
    standardized = centered / np.array(std)
    return standardized.astype(np.float32)
|
|
994
|
+
|
|
995
|
+
def create_thumbnail(self, size=(128, 128)) -> np.ndarray:
    """Produce a fixed-size thumbnail of ``self.image``.

    The image is stretched to exactly ``size``; the aspect ratio is not
    preserved and ``self.image`` is not modified.

    Args:
        size: ``(width, height)`` of the thumbnail.

    Returns:
        Array produced by ``cv2.resize`` with ``INTER_AREA``.
    """
    thumbnail = cv2.resize(self.image, size, interpolation=cv2.INTER_AREA)
    return thumbnail
|
|
1004
|
+
|
|
1005
|
+
def batch_crop(self, boxes) -> list:
    """Copy several rectangles out of ``self.image``.

    Args:
        boxes: Iterable of ``(x, y, w, h)`` tuples in pixel coordinates.
            Each rectangle is clamped to the image bounds.

    Returns:
        list: One independent array per box, in input order.
    """
    img_h, img_w = self.image.shape[:2]
    return [
        self.image[
            max(0, y) : min(img_h, y + bh),
            max(0, x) : min(img_w, x + bw),
        ].copy()
        for x, y, bw, bh in boxes
    ]
|
|
1021
|
+
|
|
1022
|
+
def get_dominant_colors(self, k: int = 5) -> list:
    """Cluster the pixels of ``self.image`` and return the cluster centres.

    Pixels are flattened to rows of three values and clustered with
    ``cv2.kmeans`` (10 attempts, random initial centres, stopping after 100
    iterations or an epsilon of 0.2).

    Args:
        k: Number of clusters.

    Returns:
        list: ``k`` tuples of three integers (centre values truncated by
        ``int``).
    """
    samples = self.image.reshape(-1, 3).astype(np.float32)
    stop_rule = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 100, 0.2)
    attempts = 10
    centers = cv2.kmeans(
        samples, k, None, stop_rule, attempts, cv2.KMEANS_RANDOM_CENTERS
    )[2]
    return [tuple(map(int, center)) for center in centers]
|
|
1036
|
+
|
|
1037
|
+
def overlay_image(
    self, overlay: np.ndarray, x: int, y: int, alpha: float = 1.0
) -> np.ndarray:
    """Blend ``overlay`` onto a copy of ``self.image`` at position (x, y).

    The overlay is clipped on all four sides, so it may start at negative
    coordinates or extend past the right/bottom edge. ``self.image`` is not
    modified.

    Args:
        overlay: Image to blend in; the visible part is combined with the
            underlying pixels via ``cv2.addWeighted``.
        x: Column of the overlay's left edge (may be negative).
        y: Row of the overlay's top edge (may be negative).
        alpha: Weight of the overlay; the background gets ``1 - alpha``.

    Returns:
        New array with the overlay blended in, or an unchanged copy when
        the overlay lies entirely outside the image.
    """
    out = self.image.copy()
    ov_h, ov_w = overlay.shape[:2]

    # Destination rectangle clamped to the image. Without clamping the
    # top/left edge, negative offsets would wrap around in NumPy slicing.
    x1, y1 = max(x, 0), max(y, 0)
    x2 = min(x + ov_w, out.shape[1])
    y2 = min(y + ov_h, out.shape[0])
    if x2 <= x1 or y2 <= y1:
        return out

    # Matching part of the overlay, shifted by the clipped amount.
    patch = overlay[y1 - y : y2 - y, x1 - x : x2 - x]
    out[y1:y2, x1:x2] = cv2.addWeighted(
        out[y1:y2, x1:x2], 1 - alpha, patch, alpha, 0
    )
    return out
|
|
1061
|
+
|
|
1062
|
+
def compare_histograms(self, other_image: np.ndarray) -> float:
    """Score the colour similarity between ``self.image`` and another image.

    For each image an 8x8x8 histogram over its three channels is computed
    and min-max normalised to [0, 1]. The two histograms are then compared
    with ``cv2.HISTCMP_CORREL``.

    Args:
        other_image: Three-channel image to compare against.

    Returns:
        float: Correlation score; 1.0 means identical histograms.
    """
    histograms = []
    for img in (self.image, other_image):
        hist = cv2.calcHist(
            [img], [0, 1, 2], None, [8, 8, 8], [0, 256, 0, 256, 0, 256]
        )
        # Normalise in place so both histograms share the same scale.
        cv2.normalize(hist, hist, alpha=0, beta=1, norm_type=cv2.NORM_MINMAX)
        histograms.append(hist)

    own, other = histograms
    return float(cv2.compareHist(own, other, cv2.HISTCMP_CORREL))
|
|
1083
|
+
|
|
1084
|
+
def to_base64(self) -> str:
    """Serialise ``self.image`` as base64 text.

    The image is PNG-encoded with ``cv2.imencode`` and the resulting bytes
    are base64-encoded, which is handy for JSON payloads or HTML data URIs.

    Returns:
        str: Base64 representation of the PNG data.
    """
    from base64 import b64encode

    png_buffer = cv2.imencode(".png", self.image)[1]
    return str(b64encode(png_buffer), "utf-8")
|