openvisionkit 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1095 @@
1
+ import json
2
+
3
+ import cv2
4
+ import numpy as np
5
+
6
+ try:
7
+ import pytesseract
8
+ except ImportError:
9
+ pytesseract = None # type: ignore
10
+ try:
11
+ from skimage.metrics import structural_similarity as ssim
12
+ except ImportError:
13
+ ssim = None # type: ignore
14
+
15
+
16
+ class ImageDetector:
17
def __init__(self, image, pre_process=False):
    """Store the working image, optionally replacing it with its preprocessed form.

    Args:
        image: Image array kept on the instance as ``self.image``.
        pre_process: When truthy, ``self.image`` is overwritten with the
            result of ``self._preprocess(image)``.
    """
    self.image = image
    if not pre_process:
        return
    self.image = self._preprocess(image)
21
+
22
def _preprocess(self, image: np.ndarray) -> np.ndarray:
    """Turn a BGR image into a binarized image intended for OCR.

    The pipeline is: grayscale conversion, global histogram equalization,
    a 3x3 Gaussian blur, then Gaussian adaptive thresholding
    (block size 11, constant 2).

    Args:
        image (np.ndarray): Input image; it is converted with
            ``cv2.COLOR_BGR2GRAY``.

    Returns:
        np.ndarray: The thresholded single-channel image.
    """
    # Grayscale + contrast stretch in one step.
    equalized = cv2.equalizeHist(cv2.cvtColor(image, cv2.COLOR_BGR2GRAY))

    # Light smoothing before thresholding.
    smoothed = cv2.GaussianBlur(equalized, (3, 3), 0)

    return cv2.adaptiveThreshold(
        smoothed,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        11,
        2,
    )
46
+
47
def fallback_ssim(self, image1, image2, form_name, draw_frame=False):
    """Compare two images with SSIM when feature-based alignment is unavailable.

    ``image2`` is resized to the width/height of ``image1``, both are
    converted to grayscale, and the structural similarity score is computed.

    Args:
        image1: Reference image (converted with ``cv2.COLOR_BGR2GRAY``).
        image2: Image to compare; resized to ``image1``'s size first.
        form_name: Label used in the title of the optional debug window.
        draw_frame: When True, show the SSIM difference map in an OpenCV
            window and block until a key is pressed.

    Returns:
        dict: ``{"matches": 0, "homography": None,
        "aligned_image": <resized image2>, "ssim_score": <score>}``.

    Raises:
        ImportError: If scikit-image could not be imported at module load,
            in which case the module-level ``ssim`` is ``None``.
    """
    # The module imports ssim optionally; fail with a clear message instead
    # of "'NoneType' object is not callable".
    if ssim is None:
        raise ImportError(
            "scikit-image is required for the SSIM fallback "
            "(install it with 'pip install scikit-image')"
        )

    image2_resized = cv2.resize(image2, (image1.shape[1], image1.shape[0]))

    gray1 = cv2.cvtColor(image1, cv2.COLOR_BGR2GRAY)
    gray2 = cv2.cvtColor(image2_resized, cv2.COLOR_BGR2GRAY)

    score, diff = ssim(gray1, gray2, full=True)

    if draw_frame:
        # The difference map is only needed for display, so scale it lazily.
        diff = (diff * 255).astype("uint8")
        cv2.imshow(f"{form_name} - SSIM Diff (score={score:.3f})", diff)
        cv2.waitKey(0)
        cv2.destroyAllWindows()

    return {
        "matches": 0,
        "homography": None,
        "aligned_image": image2_resized,
        "ssim_score": score,
    }
68
+
69
def compare_matches_knn_matcher(
    self,
    image2,
    form_name,
    no_of_feature=500,
    matched_amount=50,
    percentage_of_matches=20,
    draw_matches=False,
    draw_aligned=False,
):
    """Align ``image2`` onto ``self.image`` using KNN matching plus a ratio test.

    Keypoints/descriptors are obtained from ``detect_keypoints`` on both
    images, matched with a Hamming brute-force matcher (``k=2``), filtered
    with a 0.75 ratio test, trimmed to the best ``percentage_of_matches``
    percent (never fewer than 4), and used to estimate a RANSAC homography
    that warps ``image2`` into the frame of ``self.image``.

    Whenever descriptors are missing, fewer than 4 good matches survive, or
    no homography is found, the result of ``fallback_ssim`` is returned
    instead.

    Args:
        image2: Image to align against ``self.image``.
        form_name: Label used in debug window titles.
        no_of_feature: Feature budget forwarded to ``detect_keypoints``.
        matched_amount: Maximum number of matches drawn in the match image.
        percentage_of_matches: Percentage of the sorted good matches kept.
        draw_matches: Show a downscaled match visualization window.
        draw_aligned: Show a downscaled window of the warped image.

    Returns:
        dict: ``{"matches", "homography", "matched_image", "aligned_image"}``
        on success, or the ``fallback_ssim`` dictionary otherwise.
    """
    image_detector = ImageDetector(image2)

    keypoints1, descriptors1, _ = self.detect_keypoints(
        features=no_of_feature, draw_keypoints=False
    )
    keypoints2, descriptors2, _ = image_detector.detect_keypoints(
        features=no_of_feature, draw_keypoints=False
    )

    if descriptors1 is None or descriptors2 is None:
        print("Feature detection failed → using SSIM fallback")
        return self.fallback_ssim(self.image, image2, form_name)

    bf = cv2.BFMatcher(cv2.NORM_HAMMING)
    matches = bf.knnMatch(descriptors1, descriptors2, k=2)

    # Ratio test. knnMatch can return fewer than two neighbours for a query
    # descriptor (e.g. when the other image has a single descriptor), so
    # incomplete pairs are skipped instead of failing on unpacking.
    good_matches = [
        pair[0]
        for pair in matches
        if len(pair) == 2 and pair[0].distance < 0.75 * pair[1].distance
    ]

    if len(good_matches) < 4:
        print("Not enough matches → using fallback")
        return self.fallback_ssim(self.image, image2, form_name)

    # Best matches first, then keep the requested top percentage.
    good_matches = sorted(good_matches, key=lambda x: x.distance)
    keep_n = int(len(good_matches) * (percentage_of_matches / 100))
    good_matches = good_matches[: max(keep_n, 4)]  # ensure at least 4

    matchedImage = cv2.drawMatches(
        self.image,
        keypoints1,
        image2,
        keypoints2,
        good_matches[:matched_amount],
        None,
        flags=cv2.DrawMatchesFlags_NOT_DRAW_SINGLE_POINTS,
    )

    sourcePoints = np.float32(
        [keypoints1[m.queryIdx].pt for m in good_matches]
    ).reshape(-1, 1, 2)
    destinationPoints = np.float32(
        [keypoints2[m.trainIdx].pt for m in good_matches]
    ).reshape(-1, 1, 2)

    # Maps points of image2 onto self.image.
    M, mask = cv2.findHomography(destinationPoints, sourcePoints, cv2.RANSAC, 5.0)

    if M is None:
        print("Homography could not be computed so it will be using fallback")
        return self.fallback_ssim(self.image, image2, form_name)

    h, w = self.image.shape[:2]
    imageTransformed = cv2.warpPerspective(image2, M, (w, h))

    # Thumbnails are built only when they are displayed; the max(1, ...)
    # keeps cv2.resize valid for images smaller than 3 pixels per side.
    preview_size = (max(1, w // 3), max(1, h // 3))

    if draw_matches:
        matchedImage_small = cv2.resize(matchedImage, preview_size)
        cv2.imshow(f"{form_name} - Matches (Inliers)", matchedImage_small)
        cv2.waitKey(0)
        cv2.destroyAllWindows()

    if draw_aligned:
        imageTransformed_small = cv2.resize(imageTransformed, preview_size)
        cv2.imshow(f"{form_name} - Aligned", imageTransformed_small)
        cv2.waitKey(0)
        cv2.destroyAllWindows()

    return {
        "matches": len(good_matches),
        "homography": M,
        "matched_image": matchedImage,
        "aligned_image": imageTransformed,
    }
167
+
168
def compare_matches_bf_matcher(
    self,
    image2,
    form_name,
    no_of_feature=500,
    matched_amount=50,
    percentage_of_matches=20,
    draw_matches=False,
    draw_aligned=False,
):
    """Align ``image2`` onto ``self.image`` using plain brute-force matching.

    Descriptors from ``detect_keypoints`` are matched one-to-one with a
    Hamming ``BFMatcher.match`` call (no ratio test), sorted by distance,
    trimmed to the best ``percentage_of_matches`` percent (never fewer than
    4), and used to estimate a RANSAC homography that warps ``image2`` into
    the frame of ``self.image``.

    Whenever descriptors are missing, fewer than 4 matches exist, or no
    homography is found, the result of ``fallback_ssim`` is returned.

    Args:
        image2: Image to align against ``self.image``.
        form_name: Label used in debug window titles.
        no_of_feature: Feature budget forwarded to ``detect_keypoints``.
        matched_amount: Maximum number of matches drawn in the match image.
        percentage_of_matches: Percentage of the sorted matches kept.
        draw_matches: Show a downscaled match visualization window.
        draw_aligned: Show a downscaled window of the warped image.

    Returns:
        dict: ``{"matches", "homography", "matched_image", "aligned_image"}``
        on success, or the ``fallback_ssim`` dictionary otherwise.
    """
    image_detector = ImageDetector(image2)

    keypoints1, descriptors1, _ = self.detect_keypoints(
        features=no_of_feature, draw_keypoints=False
    )
    keypoints2, descriptors2, _ = image_detector.detect_keypoints(
        features=no_of_feature, draw_keypoints=False
    )

    if descriptors1 is None or descriptors2 is None:
        print("Feature detection failed → using SSIM fallback")
        return self.fallback_ssim(self.image, image2, form_name)

    # Plain brute-force matching: one best match per query descriptor.
    bf = cv2.BFMatcher(cv2.NORM_HAMMING)
    matches = bf.match(descriptors1, descriptors2)

    # A homography needs at least 4 correspondences; bail out early rather
    # than letting cv2.findHomography raise.
    if len(matches) < 4:
        print("Not enough matches → using fallback")
        return self.fallback_ssim(self.image, image2, form_name)

    # Best matches first, then keep the requested top percentage.
    good_matches = sorted(matches, key=lambda x: x.distance)
    keep_n = int(len(good_matches) * (percentage_of_matches / 100))
    good_matches = good_matches[: max(keep_n, 4)]  # ensure at least 4

    matchedImage = cv2.drawMatches(
        self.image,
        keypoints1,
        image2,
        keypoints2,
        good_matches[:matched_amount],
        None,
        flags=cv2.DrawMatchesFlags_NOT_DRAW_SINGLE_POINTS,
    )

    sourcePoints = np.float32(
        [keypoints1[m.queryIdx].pt for m in good_matches]
    ).reshape(-1, 1, 2)
    destinationPoints = np.float32(
        [keypoints2[m.trainIdx].pt for m in good_matches]
    ).reshape(-1, 1, 2)

    # Maps points of image2 onto self.image.
    M, mask = cv2.findHomography(destinationPoints, sourcePoints, cv2.RANSAC, 5.0)

    if M is None:
        print("Homography could not be computed so it will be using fallback")
        return self.fallback_ssim(self.image, image2, form_name)

    h, w = self.image.shape[:2]
    imageTransformed = cv2.warpPerspective(image2, M, (w, h))

    # Thumbnails are built only when they are displayed; the max(1, ...)
    # keeps cv2.resize valid for images smaller than 3 pixels per side.
    preview_size = (max(1, w // 3), max(1, h // 3))

    # Shows the keypoint correspondences between the form and the template.
    if draw_matches:
        matchedImage_small = cv2.resize(matchedImage, preview_size)
        cv2.imshow(f"{form_name} - Matches (Inliers)", matchedImage_small)
        cv2.waitKey(0)
        cv2.destroyAllWindows()

    # Shows image2 after it has been warped with the homography.
    if draw_aligned:
        imageTransformed_small = cv2.resize(imageTransformed, preview_size)
        cv2.imshow(f"{form_name} - Aligned", imageTransformed_small)
        cv2.waitKey(0)
        cv2.destroyAllWindows()

    return {
        "matches": len(good_matches),
        "homography": M,
        "matched_image": matchedImage,
        "aligned_image": imageTransformed,
    }
258
+
259
def compute_tolerance_percentile(self, pixels):
    """Derive per-channel tolerances from the spread of a pixel sample.

    For each of the first three columns of ``pixels`` the tolerance is the
    distance between the 90th and the 10th percentile, truncated to ``int``.

    Args:
        pixels: 2-D array indexed as ``pixels[:, channel]``; columns 0, 1
            and 2 are read (documented by the caller as H, S and V).

    Returns:
        Tuple ``(h_tol, s_tol, v_tol)`` of integers.
    """

    def spread(channel):
        column = pixels[:, channel]
        return int(np.percentile(column, 90) - np.percentile(column, 10))

    return spread(0), spread(1), spread(2)
278
+
279
def detect_highlighted_text(
    self,
    hsv_colors=None,  # seed HSV values
    h_tol=8,
    s_tol=80,
    v_tol=80,
    show_mask=False,
    show_combined_mask=False,
    show_image_with_mask=False,
):
    """Mask the regions of ``self.image`` whose HSV color is near any seed color.

    The image is blurred (5x5 Gaussian), converted from BGR to HSV, and one
    ``cv2.inRange`` mask is built per seed using the given tolerances. The
    lower bounds are clamped at 0; the upper bounds at 179 (hue) and 255
    (saturation, value). All masks are OR-ed into a combined mask.

    Args:
        hsv_colors: Iterable of ``(h, s, v)`` seeds. Defaults to
            ``[(27, 167, 251)]``.
        h_tol, s_tol, v_tol: Half-widths of the accepted range per channel.
        show_mask: Show every individual mask in its own window.
        show_combined_mask: Show the combined mask.
        show_image_with_mask: Show the image restricted to the combined mask.
            If any ``show_*`` flag is set the call blocks on ``cv2.waitKey(0)``.

    Returns:
        Tuple ``(image_with_mask, combined_mask, masks)``: the masked image,
        the combined binary mask, and the list of per-seed masks.

    Usage:
        detector = ImageDetector(cv2.imread("doc.jpg"))
        highlight_colors = [
            (30, 200, 250),   # yellow
            (60, 200, 250),   # green
            (150, 200, 250),  # pink
            (15, 200, 250),   # orange
        ]
        masked, combined, masks = detector.detect_highlighted_text(highlight_colors)
    """
    seeds = [(27, 167, 251)] if hsv_colors is None else hsv_colors

    hsv = cv2.cvtColor(cv2.GaussianBlur(self.image, (5, 5), 0), cv2.COLOR_BGR2HSV)
    combined_mask = np.zeros(hsv.shape[:2], dtype=np.uint8)
    masks = []

    for idx, (hue, sat, val) in enumerate(seeds):
        lower = np.array(
            [max(0, hue - h_tol), max(0, sat - s_tol), max(0, val - v_tol)]
        )
        upper = np.array(
            [min(179, hue + h_tol), min(255, sat + s_tol), min(255, val + v_tol)]
        )

        seed_mask = cv2.inRange(hsv, lower, upper)
        masks.append(seed_mask)

        # Accumulate into the union of all seed masks.
        combined_mask = cv2.bitwise_or(combined_mask, seed_mask)

        if show_mask:
            cv2.imshow(f"Mask {idx}", seed_mask)

    img_with_mask = cv2.bitwise_and(self.image, self.image, mask=combined_mask)

    if show_combined_mask:
        cv2.imshow("Combined Mask", combined_mask)
    if show_image_with_mask:
        cv2.imshow("Image with Mask", img_with_mask)

    if any((show_mask, show_combined_mask, show_image_with_mask)):
        cv2.waitKey(0)
        cv2.destroyAllWindows()

    return img_with_mask, combined_mask, masks
364
+
365
def get_dominant_hsv_colors(self, k=10):
    """Cluster the colored pixels of ``self.image`` in HSV space.

    Pixels with saturation <= 25 or value <= 60 are dropped before running
    ``cv2.kmeans``. Cluster centers are kept unless the cluster has fewer
    than 30 pixels, is dark (v < 60) or near-white (v > 200 and s < 40).
    Centers whose hue is within 4 of an already kept center are discarded.

    Args:
        k: Requested number of clusters; capped at the number of pixels left.

    Returns:
        ``[]`` when no pixel survives the pre-filter; otherwise the kept
        ``(h, s, v)`` centers followed by six predefined highlighter colors.
    """
    hsv = cv2.cvtColor(cv2.GaussianBlur(self.image, (5, 5), 0), cv2.COLOR_BGR2HSV)

    # Drop white/grey/dark pixels before clustering (low threshold keeps light blue).
    colorful = (hsv[:, :, 1] > 25) & (hsv[:, :, 2] > 60)
    pixels = hsv[colorful]
    if len(pixels) == 0:
        return []

    samples = pixels.reshape(-1, 3).astype(np.float32)
    k = min(k, len(samples))

    criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 100, 0.2)
    _, labels, centers = cv2.kmeans(
        samples, k, None, criteria, 10, cv2.KMEANS_RANDOM_CENTERS
    )
    counts = np.bincount(labels.flatten())

    candidates = []
    for idx, center in enumerate(centers):
        hue, sat, val = int(center[0]), int(center[1]), int(center[2])
        # tiny cluster, dark, or near-white -> not a highlight color
        if counts[idx] < 30 or val < 60 or (val > 200 and sat < 40):
            continue
        candidates.append((hue, sat, val))

    # Soft de-duplication on hue only.
    filtered = []
    for color in candidates:
        if all(abs(color[0] - kept[0]) >= 4 for kept in filtered):
            filtered.append(color)

    predefined = [
        (95, 120, 255),  # light blue
        (30, 200, 250),  # yellow
        (60, 200, 250),  # green
        (100, 200, 250),  # blue
        (150, 200, 250),  # pink
        (20, 200, 250),  # orange
    ]
    return filtered + predefined
428
+
429
+ # def get_dominant_hsv_colors(self, k=4):
430
+ # """
431
+ # auto-detect highlight colors in the image by clustering pixel colors in HSV space using K-means.
432
+ # Get dominant HSV colors from the image using K-means clustering.
433
+
434
+ # Args:
435
+ # k: Number of dominant colors to detect (default is 4)
436
+
437
+ # Returns:
438
+ # List of dominant HSV color tuples (h, s, v) detected in the image.
439
+ # Usage:
440
+
441
+ # """
442
+ # mg_blur = cv2.GaussianBlur(self.image, (5,5), 0)
443
+ # hsv = cv2.cvtColor(mg_blur, cv2.COLOR_BGR2HSV)
444
+ # pixels = hsv.reshape(-1, 3).astype(np.float32)
445
+
446
+ # _, labels, centers = cv2.kmeans(
447
+ # pixels, k, None,
448
+ # (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 100, 0.2),
449
+ # 10,
450
+ # cv2.KMEANS_RANDOM_CENTERS
451
+ # )
452
+ # # Filter likely highlight colors
453
+ # highlight_colors = []
454
+ # for (h, s, v) in centers:
455
+ # if s > 80 and v > 150: # high saturation + brightness
456
+ # highlight_colors.append((int(h), int(s), int(v)))
457
+
458
+ # return highlight_colors
459
+
460
def detect_single_highlighted_text(self, image, hsv_colors=None):
    """Mask the pixels of ``image`` that lie inside one fixed HSV range.

    Args:
        image: Input image; converted with ``cv2.COLOR_BGR2HSV``.
        hsv_colors: Sequence whose first three items form the LOWER HSV
            bound. Defaults to ``[27, 167, 251]``. The upper bound is
            always ``[140, 255, 255]``.

    Returns:
        Tuple ``(image_with_mask, mask)``: the input restricted to the mask,
        and the binary mask itself.
    """
    if hsv_colors is None:
        hsv_colors = [27, 167, 251]

    h_min, s_min, v_min = hsv_colors[0], hsv_colors[1], hsv_colors[2]
    hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)

    mask = cv2.inRange(
        hsv,
        np.array([h_min, s_min, v_min]),
        np.array([140, 255, 255]),
    )
    return cv2.bitwise_and(image, image, mask=mask), mask
478
+
479
def refine_mask(self, mask, merge=True):
    """Clean up a binary mask with morphological operations.

    A 3x3 opening (1 iteration) removes specks, a 3x3 closing
    (2 iterations) fills small gaps, and - when ``merge`` is truthy - a
    dilation with a ``np.ones((15, 5))`` kernel fuses nearby regions.

    Args:
        mask: Binary mask to refine.
        merge: ``True`` to group neighbouring highlights into one region,
            ``False`` to keep each highlight separate.

    Returns:
        The refined binary mask.
    """
    small = np.ones((3, 3), np.uint8)

    # Opening removes noise, closing fills gaps - applied in that order.
    for operation, passes in ((cv2.MORPH_OPEN, 1), (cv2.MORPH_CLOSE, 2)):
        mask = cv2.morphologyEx(mask, operation, small, iterations=passes)

    if not merge:
        return mask

    # NOTE: this merges multiple highlights into ONE contour.
    return cv2.dilate(mask, np.ones((15, 5), np.uint8), iterations=1)
508
+
509
def get_cannty_edges(self, low_threshold=50, high_threshold=150):
    """Return a Canny edge map of ``self.image``.

    The image copy is converted to grayscale, blurred (5x5 Gaussian,
    sigma 1), passed through ``cv2.Canny`` and finally dilated then eroded
    once with a 3x3 kernel.

    Args:
        low_threshold: Lower hysteresis threshold for ``cv2.Canny``.
        high_threshold: Upper hysteresis threshold for ``cv2.Canny``.

    Returns:
        The cleaned edge image.
    """
    gray = cv2.cvtColor(self.image.copy(), cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(
        cv2.GaussianBlur(gray, (5, 5), 1), low_threshold, high_threshold
    )

    # Dilate then erode with the same kernel to tidy the edge map.
    kernel = np.ones((3, 3), np.uint8)
    thickened = cv2.dilate(edges, kernel, iterations=1)
    return cv2.erode(thickened, kernel, iterations=1)
522
+
523
def find_contours(
    self,
    mask,
    min_area=200,
    get_canny_edges=False,
    canny_threshold: list[int] | None = None,
    debug=False,
    dilate_merge=True,
    filter_shapes: (
        list | None
    ) = None,  # to detect for triangle or rectangle/square shapes
    sort_contours_smallest_to_largest=False,
    sort_countours_largest_to_smallest=False,
    sort_bbox_smaller_to_largest=False,
    sort_bbox_top_to_bottom=False,
    sort_bbox_left_to_right=False,
    sort_bbox_area_largest_to_smallest=False,
    sort_bbox_grid_wise=False,
    bbox_grid_tolerence=10,
    retrieval_type=cv2.RETR_EXTERNAL,
    approximation_method=cv2.CHAIN_APPROX_SIMPLE,
    draw_contours=False,
    contour_box_color=(0, 255, 0),
    contour_box_thickness=2,
    contour_text_color=(255, 255, 0),
    contour_text_thickness=2,
):
    """Find, filter, annotate and optionally sort contours.

    The contour source is either the Canny edge map of ``self.image``
    (``get_canny_edges=True``) or ``mask`` after ``refine_mask``. Contours
    with area below ``min_area``, or with a bounding box narrower than 20 px
    or shorter than 10 px, are discarded, as are contours whose simplified
    vertex count is not in ``filter_shapes`` (when that list is non-empty).

    Args:
        mask: Binary mask on which contours are searched (ignored when
            ``get_canny_edges`` is True).
        min_area: Minimum contour area to keep.
        get_canny_edges: Use ``get_cannty_edges`` instead of ``mask``.
        canny_threshold: ``[low, high]`` Canny thresholds; defaults to
            ``[100, 100]``.
        debug: Print diagnostics about the contours found and kept.
        dilate_merge: Forwarded to ``refine_mask`` as ``merge``.
        filter_shapes: Accepted vertex counts (e.g. ``[3, 4]``); empty or
            ``None`` disables shape filtering.
        sort_contours_smallest_to_largest: Sort results by ascending area.
        sort_countours_largest_to_smallest: Sort results by descending area.
        sort_bbox_smaller_to_largest: Sort by ascending bounding-box area.
        sort_bbox_top_to_bottom: Sort by bounding-box ``y`` then ``x``.
        sort_bbox_left_to_right: Sort by bounding-box ``x``.
        sort_bbox_area_largest_to_smallest: Sort by descending contour area.
        sort_bbox_grid_wise: Sort row by row, rows being buckets of
            ``bbox_grid_tolerence`` pixels in ``y``.
        bbox_grid_tolerence: Row bucket height for grid-wise sorting; values
            that are not positive are treated as 1.
        retrieval_type: Retrieval mode passed to ``cv2.findContours``.
        approximation_method: Approximation passed to ``cv2.findContours``.
        draw_contours: Also draw the vertex count and the contour outline.
        contour_box_color: Color of the bounding box and center dot.
        contour_box_thickness: Line thickness of the bounding box.
        contour_text_color: Color of the vertex-count label.
        contour_text_thickness: Thickness of the vertex-count label.

    When several ``sort_*`` flags are set, the sorts are applied one after
    another in the order they appear in the code, so the last one applied
    determines the primary order.

    Returns:
        Tuple ``(contours_results, countour_img)``. ``contours_results`` is a
        list of dicts with keys ``contour``, ``area``, ``bounding_box``,
        ``approx_vertices`` and ``center``; ``countour_img`` is an annotated
        copy of ``self.image``.
    """
    if canny_threshold is None:
        canny_threshold = [100, 100]
    if filter_shapes is None:
        filter_shapes = []

    if get_canny_edges:
        cleaned = self.get_cannty_edges(
            low_threshold=canny_threshold[0], high_threshold=canny_threshold[1]
        )
    else:
        # Clean noise first (very important for stable contours).
        cleaned = self.refine_mask(mask, merge=dilate_merge)

    contours, _ = cv2.findContours(cleaned, retrieval_type, approximation_method)

    if debug:
        print(f"Found {len(contours)} contours before filtering")

    contours_results = []
    countour_img = self.image.copy()

    for cnt in contours:
        area = cv2.contourArea(cnt)
        if area < min_area:
            continue

        # Douglas-Peucker simplification: reduces the contour to a coarse
        # polygon so the vertex count can be used as a shape hint.
        peri = cv2.arcLength(cnt, True)  # True -> contour is closed
        approx = cv2.approxPolyDP(cnt, 0.02 * peri, True)

        x, y, w, h = cv2.boundingRect(cnt)

        # Per-contour diagnostics are only emitted on request.
        if debug:
            print("Approx vertices:", len(approx))
            print(f"Contour area: {area}, Bounding box: (x={x}, y={y}, w={w}, h={h})")

        if w < 20 or h < 10:
            continue

        if debug:
            print(
                f"Accepted contour with area {area} and bounding box (x={x}, y={y}, w={w}, h={h})"
            )

        if draw_contours:
            cv2.putText(
                countour_img,
                str(len(approx)),
                (x, y - 10),
                cv2.FONT_HERSHEY_SIMPLEX,
                1,
                contour_text_color,
                contour_text_thickness,
            )

        # The bounding box and center dot are drawn for every contour that
        # passes the size filters, before shape filtering.
        center_x, center_y = x + (w // 2), y + (h // 2)
        cv2.rectangle(
            countour_img,
            (x, y),
            (x + w, y + h),
            contour_box_color,
            contour_box_thickness,
        )
        cv2.circle(
            countour_img, (center_x, center_y), 5, contour_box_color, cv2.FILLED
        )

        # Keep only the requested shapes, if a filter was provided.
        if len(filter_shapes) != 0 and len(approx) not in filter_shapes:
            continue

        contours_results.append(
            {
                "contour": cnt,
                "area": area,
                "bounding_box": (x, y, w, h),
                "approx_vertices": approx,
                "center": (center_x, center_y),
            }
        )

        if draw_contours:
            cv2.drawContours(countour_img, [cnt], -1, (0, 255, 0), 2)

    if debug:
        print(f"Total contours (raw): {len(contours)}")
        print(f"Filtered contours: {len(contours_results)}")
        for i, c in enumerate(contours_results):
            x, y, w, h = c["bounding_box"]
            print(
                f"[{i}] Area={c['area']:.2f}, Box=({x},{y},{w},{h}), Center={c['center']}"
            )

    if sort_bbox_area_largest_to_smallest:
        contours_results = sorted(
            contours_results, key=lambda c: c["area"], reverse=True
        )

    if sort_contours_smallest_to_largest:
        contours_results = sorted(contours_results, key=lambda c: c["area"])

    if sort_countours_largest_to_smallest:
        contours_results = sorted(
            contours_results, key=lambda c: c["area"], reverse=True
        )

    if sort_bbox_smaller_to_largest:
        contours_results = sorted(
            contours_results,
            key=lambda c: c["bounding_box"][2] * c["bounding_box"][3],  # w * h
        )

    if sort_bbox_top_to_bottom:
        contours_results = sorted(
            contours_results,
            key=lambda c: (c["bounding_box"][1], c["bounding_box"][0]),  # y, then x
        )

    if sort_bbox_left_to_right:
        contours_results = sorted(
            contours_results,
            key=lambda c: c["bounding_box"][0],  # x
        )

    if sort_bbox_grid_wise:
        # Guard against a zero/negative tolerance, which would otherwise
        # raise ZeroDivisionError or invert the row order.
        row_height = bbox_grid_tolerence if bbox_grid_tolerence > 0 else 1
        contours_results = sorted(
            contours_results,
            key=lambda c: (
                c["bounding_box"][1] // row_height,  # row bucket
                c["bounding_box"][0],  # then left-to-right inside the row
            ),
        )

    return contours_results, countour_img
740
+
741
def detect_contours(self, min_area=500):
    """Return the external contours of the Canny edge map, largest first.

    Args:
        min_area: Contours whose area is not strictly greater than this
            value are dropped.

    Returns:
        List of contours sorted by descending ``cv2.contourArea``.
    """
    found, _ = cv2.findContours(
        self.get_cannty_edges(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )
    large_enough = []
    for candidate in found:
        if cv2.contourArea(candidate) > min_area:
            large_enough.append(candidate)
    large_enough.sort(key=cv2.contourArea, reverse=True)
    return large_enough
749
+
750
+ def export_measurements(data, path="measurements.json"):
751
+ with open(path, "w") as f:
752
+ json.dump(data, f, indent=2)
753
+
754
def draw_grid(self, pixels_per_cm, color=(200, 200, 200)):
    """Draw a square grid directly onto ``self.image`` and return it.

    Vertical and horizontal 1-pixel lines are drawn every
    ``int(pixels_per_cm)`` pixels, starting at 0. The image is modified in
    place.

    Args:
        pixels_per_cm: Grid spacing in pixels; truncated to an integer.
        color: Line color passed to ``cv2.line``.

    Returns:
        ``self.image`` with the grid drawn on it.

    Raises:
        ValueError: If the truncated spacing is smaller than 1 pixel.
    """
    step = int(pixels_per_cm)
    if step < 1:
        # A zero step would make range() fail with an unrelated message and
        # a negative step would silently draw nothing.
        raise ValueError(
            f"pixels_per_cm must be at least 1 pixel, got {pixels_per_cm!r}"
        )

    h, w = self.image.shape[:2]
    for x in range(0, w, step):
        cv2.line(self.image, (x, 0), (x, h), color, 1)

    for y in range(0, h, step):
        cv2.line(self.image, (0, y), (w, y), color, 1)

    return self.image
764
+
765
def detect_reference(self, contours):
    """Pick the largest contour whose aspect ratio looks like a known reference.

    Contours with area below 3000 or a degenerate min-area rectangle are
    ignored. An aspect ratio strictly between 1.3 and 1.5 is labelled
    ``"A4"`` (real size 21.0); strictly between 1.5 and 1.7 is labelled
    ``"CARD"`` (real size 8.56).

    Args:
        contours: Iterable of OpenCV contours.

    Returns:
        Tuple ``(contour, real_size, label)`` for the largest candidate, or
        ``(None, None, None)`` when nothing qualifies.
    """
    candidates = []

    for cnt in contours:
        if cv2.contourArea(cnt) < 3000:
            continue

        w, h = cv2.minAreaRect(cnt)[1]
        if w == 0 or h == 0:
            continue

        aspect = max(w, h) / min(w, h)
        if 1.3 < aspect < 1.5:
            candidates.append(("A4", cnt, 21.0))
        elif 1.5 < aspect < 1.7:
            candidates.append(("CARD", cnt, 8.56))

    # No match -> caller may fall back to another detection strategy.
    if not candidates:
        return None, None, None

    # Largest candidate wins; on ties the earliest one is kept.
    label, cnt, real_size = max(candidates, key=lambda item: cv2.contourArea(item[1]))
    return cnt, real_size, label
796
+
797
+ # ─────────────────────────── NEW METHODS ───────────────────────────
798
+
799
def get_image_info(self) -> dict:
    """Summarize the shape and storage of ``self.image``.

    Returns:
        dict: ``height`` and ``width`` (first two shape entries),
        ``channels`` (third shape entry for 3-D arrays, otherwise 1),
        ``dtype`` (as a string) and ``size_bytes`` (``nbytes``).
    """
    img = self.image
    rows, cols = img.shape[:2]
    info = {"height": rows, "width": cols}
    info["channels"] = 1 if img.ndim != 3 else img.shape[2]
    info["dtype"] = str(img.dtype)
    info["size_bytes"] = img.nbytes
    return info
814
+
815
def apply_clahe(self, clip_limit=2.0, tile_grid_size=(8, 8)) -> np.ndarray:
    """Return a CLAHE-enhanced grayscale version of ``self.image``.

    CLAHE (Contrast-Limited Adaptive Histogram Equalization) equalizes the
    histogram per tile, which copes better with uneven lighting than a
    single global equalization. ``self.image`` itself is not modified.

    Args:
        clip_limit: Contrast-limiting threshold (``clipLimit``).
        tile_grid_size: Tile grid used for the local histograms.

    Returns:
        Grayscale numpy array with enhanced contrast.
    """
    luminance = cv2.cvtColor(self.image, cv2.COLOR_BGR2GRAY)
    equalizer = cv2.createCLAHE(clipLimit=clip_limit, tileGridSize=tile_grid_size)
    enhanced = equalizer.apply(luminance)
    return enhanced
828
+
829
def detect_blur(self, threshold=100.0) -> bool:
    """Tell whether ``self.image`` is blurry.

    The image is considered blurry when the variance of its Laplacian
    (computed on the grayscale image) is below ``threshold``. Useful as a
    quality gate for OCR or capture pipelines.

    Args:
        threshold: Variance below this value means blurry.

    Returns:
        bool: ``True`` when blurry. The NumPy comparison result is converted
        to a built-in ``bool`` so identity checks and JSON serialization
        behave as the annotation promises.
    """
    gray = cv2.cvtColor(self.image, cv2.COLOR_BGR2GRAY)
    variance = cv2.Laplacian(gray, cv2.CV_64F).var()
    return bool(variance < threshold)
841
+
842
def get_blur_score(self) -> float:
    """Return the variance of the Laplacian of the grayscale image.

    Higher values mean a sharper image, which makes the score usable for
    ranking several captures of the same scene.

    Returns:
        float
    """
    laplacian = cv2.Laplacian(
        cv2.cvtColor(self.image, cv2.COLOR_BGR2GRAY), cv2.CV_64F
    )
    sharpness = laplacian.var()
    return float(sharpness)
851
+
852
def get_brightness(self) -> float:
    """Return the mean intensity of the grayscale version of ``self.image``.

    Handy as auto-exposure feedback or as a simple quality check.

    Returns:
        float
    """
    luminance = cv2.cvtColor(self.image, cv2.COLOR_BGR2GRAY)
    average = np.mean(luminance)
    return float(average)
861
+
862
def crop(self, x: int, y: int, w: int, h: int) -> np.ndarray:
    """Return a copy of a rectangular region of ``self.image``.

    The requested rectangle is clipped to the image bounds, so a region
    that partly lies outside the image yields a smaller crop.

    Args:
        x, y: Top-left corner of the rectangle.
        w, h: Width and height of the rectangle.

    Returns:
        A new array holding the cropped pixels.
    """
    img_h, img_w = self.image.shape[:2]

    # Clip each edge to the image bounds.
    left = x if x > 0 else 0
    top = y if y > 0 else 0
    right = x + w if x + w < img_w else img_w
    bottom = y + h if y + h < img_h else img_h

    region = self.image[top:bottom, left:right]
    return region.copy()
875
+
876
def flip(self, direction="horizontal") -> np.ndarray:
    """Flip ``self.image`` and store the result back on the instance.

    Args:
        direction: 'horizontal' | 'vertical' | 'both'. Any other value is
            treated like 'horizontal'.

    Returns:
        The flipped image, which is also the new ``self.image``.
    """
    codes = {"horizontal": 1, "vertical": 0, "both": -1}
    flip_code = codes[direction] if direction in codes else 1
    flipped = cv2.flip(self.image, flip_code)
    self.image = flipped
    return self.image
887
+
888
def adjust_brightness_contrast(self, alpha=1.0, beta=0) -> np.ndarray:
    """Apply ``cv2.convertScaleAbs`` to ``self.image`` and keep the result.

    The transformation is linear: ``output = alpha * input + beta``.

    Args:
        alpha: Contrast multiplier (1.0 = no change, > 1 = more contrast).
        beta: Brightness offset added to every pixel.

    Returns:
        The adjusted image, which also replaces ``self.image``.
    """
    adjusted = cv2.convertScaleAbs(self.image, alpha=alpha, beta=beta)
    self.image = adjusted
    return self.image
899
+
900
def denoise(self, strength=10) -> np.ndarray:
    """Denoise ``self.image`` with fast non-local means and keep the result.

    ``strength`` is used for both the luminance and the color filter
    strength; the template window is 7 and the search window is 21.

    Args:
        strength: Filter strength; higher removes more noise but blurs more.

    Returns:
        The denoised image, which also replaces ``self.image``.
    """
    template_window, search_window = 7, 21
    cleaned = cv2.fastNlMeansDenoisingColored(
        self.image, None, strength, strength, template_window, search_window
    )
    self.image = cleaned
    return self.image
912
+
913
def extract_text_regions(self, boxes):
    """Run OCR on rectangular regions of ``self.image``.

    Each box is clipped to the image bounds, converted to grayscale if it
    is not already single-channel, binarised with Otsu's threshold and
    passed to Tesseract (page segmentation mode 6). Boxes that do not
    intersect the image produce an empty string.

    Args:
        boxes: Iterable of (x, y, width, height) tuples in pixel coordinates.

    Returns:
        A list of dictionaries, one per box, in input order. Each has the
        requested box as ``"bbox": (x, y, x + width, y + height)`` and the
        stripped OCR output as ``"text"``. For example:
        ``[{"bbox": (x1, y1, x2, y2), "text": "Extracted text"}, ...]``

    Raises:
        ImportError: If boxes were supplied but pytesseract is not installed.
    """
    boxes = list(boxes)
    if not boxes:
        # Nothing to OCR, so the optional dependency is not needed.
        return []

    if pytesseract is None:
        raise ImportError(
            "pytesseract is required for extract_text_regions() but is not installed"
        )

    img_h, img_w = self.image.shape[:2]
    results = []

    for x, y, w, h in boxes:
        # Clip to the image; a negative start index would otherwise wrap
        # around and select the wrong pixels.
        x1, y1 = min(max(0, x), img_w), min(max(0, y), img_h)
        x2, y2 = max(x1, min(img_w, x + w)), max(y1, min(img_h, y + h))
        roi = self.image[y1:y2, x1:x2]

        if roi.size == 0:
            text = ""
        else:
            gray = roi if roi.ndim == 2 else cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
            _, thresh = cv2.threshold(
                gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
            )
            text = pytesseract.image_to_string(thresh, config="--psm 6").strip()

        results.append({"bbox": (x, y, x + w, y + h), "text": text})

    return results
941
+
942
+ # ─────────────────────────── UTILITY METHODS ───────────────────────────
943
+
944
def resize_to_fit(self, max_width: int, max_height: int) -> np.ndarray:
    """Resize the image to fit within max_width x max_height, preserving aspect ratio.

    The image is scaled by a single factor so that neither side exceeds
    its limit; smaller images are scaled up by the same rule. Each output
    side is at least one pixel. ``self.image`` is not modified.

    Args:
        max_width: Maximum output width in pixels (must be positive).
        max_height: Maximum output height in pixels (must be positive).
    Returns:
        Resized numpy array.
    Raises:
        ValueError: If ``max_width`` or ``max_height`` is not positive.
    """
    if max_width <= 0 or max_height <= 0:
        raise ValueError("max_width and max_height must be positive")

    h, w = self.image.shape[:2]
    scale = min(max_width / w, max_height / h)
    # Very elongated images can truncate to 0 on the short side, which
    # cv2.resize rejects; keep at least one pixel.
    new_w = max(1, int(w * scale))
    new_h = max(1, int(h * scale))
    return cv2.resize(self.image, (new_w, new_h), interpolation=cv2.INTER_AREA)
957
+
958
def pad_to_square(self, fill: int = 0) -> np.ndarray:
    """Pad the image with a constant border to make it square.

    The shorter dimension is padded symmetrically (any odd pixel goes to the
    bottom/right); the longer dimension is unchanged. ``self.image`` is not
    modified.

    Args:
        fill: Constant pixel value used for padding (0 = black). A scalar
            is applied to every channel; a per-channel sequence such as
            ``(b, g, r)`` is also accepted. Values should fit the image dtype.
    Returns:
        Square numpy array with the same dtype and channel count as the image.
    """
    h, w = self.image.shape[:2]
    size = max(h, w)
    top, left = (size - h) // 2, (size - w) // 2

    channels = self.image.shape[2:]  # () for grayscale, (C,) otherwise
    fill_value = np.asarray(fill)
    if fill_value.ndim:
        # Per-channel fill: keep only as many entries as there are channels.
        fill_value = fill_value[: channels[0]] if channels else fill_value[0]

    # Build the canvas with NumPy so that a scalar fill reaches every
    # channel, then paste the original image into the centre.
    canvas = np.full((size, size) + channels, fill_value, dtype=self.image.dtype)
    canvas[top : top + h, left : left + w] = self.image
    return canvas
982
+
983
def normalize(self, mean=(0, 0, 0), std=(1, 1, 1)) -> np.ndarray:
    """Scale pixels by 1/255, then standardise with the given mean and std.

    With the default arguments the result is simply the image divided by
    255. ``self.image`` is left untouched.

    Args:
        mean: Per-channel mean subtracted after the 1/255 scaling.
        std: Per-channel std the centred values are divided by.
    Returns:
        float32 numpy array.
    """
    scaled = self.image.astype(np.float32) / 255.0

    offsets = np.array(mean)
    centered = scaled - offsets

    divisors = np.array(std)
    standardised = centered / divisors

    # The arithmetic above may promote to a wider float; cast back down.
    return standardised.astype(np.float32)
994
+
995
def create_thumbnail(self, size=(128, 128)) -> np.ndarray:
    """Produce a fixed-size thumbnail of the image.

    The image is stretched or squeezed to exactly ``size``; the aspect
    ratio is not preserved. ``self.image`` is not modified.

    Args:
        size: (width, height) tuple for the output thumbnail.
    Returns:
        Resized numpy array with ``size[1]`` rows and ``size[0]`` columns.
    """
    thumbnail = cv2.resize(self.image, size, interpolation=cv2.INTER_AREA)
    return thumbnail
1004
+
1005
def batch_crop(self, boxes) -> list:
    """Crop multiple regions from the image at once.

    Args:
        boxes: Iterable of (x, y, w, h) tuples in pixel coordinates.
            Coordinates are clamped to image boundaries; a box that does
            not intersect the image yields an empty array.
    Returns:
        List of numpy array crops (copies), one per box, in input order.
    """
    img_h, img_w = self.image.shape[:2]
    crops = []
    for x, y, bw, bh in boxes:
        x1 = min(max(0, x), img_w)
        y1 = min(max(0, y), img_h)
        # Keep end >= start so a box lying entirely left of / above the
        # image cannot turn into a negative (wrap-around) slice end.
        x2 = max(x1, min(img_w, x + bw))
        y2 = max(y1, min(img_h, y + bh))
        crops.append(self.image[y1:y2, x1:x2].copy())
    return crops
1021
+
1022
def get_dominant_colors(self, k: int = 5) -> list:
    """Cluster all pixels with K-means and return the cluster centres.

    Pixels are flattened to rows of three values before clustering, and
    each centre is converted to integers by truncation. Initial centres
    are chosen randomly by OpenCV.

    Args:
        k: Number of clusters / dominant colors to return.
    Returns:
        List of k (B, G, R) tuples as integers.
    """
    samples = self.image.reshape(-1, 3).astype(np.float32)

    # Stop after 100 iterations or once centres move by less than 0.2.
    stop_rule = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 100, 0.2)
    attempts = 10

    _compactness, _labels, centers = cv2.kmeans(
        samples, k, None, stop_rule, attempts, cv2.KMEANS_RANDOM_CENTERS
    )

    return [tuple(map(int, center)) for center in centers]
1036
+
1037
def overlay_image(
    self, overlay: np.ndarray, x: int, y: int, alpha: float = 1.0
) -> np.ndarray:
    """Blend an overlay image onto a copy of self.image at position (x, y).

    The overlay is clipped to the image on all four sides, so ``x`` and
    ``y`` may be negative or place the overlay partly (or wholly) outside
    the image. ``self.image`` is not modified.

    Args:
        overlay: Numpy array to blend in.
        x: Left edge of the overlay region (pixels).
        y: Top edge of the overlay region (pixels).
        alpha: Opacity of the overlay (0.0 = invisible, 1.0 = fully opaque).
    Returns:
        New numpy array with the overlay blended in.
    """
    out = self.image.copy()
    ov_h, ov_w = overlay.shape[:2]

    # Visible part of the overlay in image coordinates.
    x1, y1 = max(x, 0), max(y, 0)
    x2 = min(x + ov_w, out.shape[1])
    y2 = min(y + ov_h, out.shape[0])

    if x2 > x1 and y2 > y1:
        # Matching window inside the overlay; the offsets are non-zero
        # only when the overlay starts above/left of the image.
        patch = overlay[y1 - y : y2 - y, x1 - x : x2 - x]
        out[y1:y2, x1:x2] = cv2.addWeighted(
            out[y1:y2, x1:x2], 1 - alpha, patch, alpha, 0
        )
    return out
1061
+
1062
def compare_histograms(self, other_image: np.ndarray) -> float:
    """Score how similar the colour distribution of another image is.

    An 8x8x8-bin histogram over the three channels is computed for
    ``self.image`` and for ``other_image``; each is min-max normalised to
    [0, 1] and the two are compared by correlation.

    Args:
        other_image: BGR numpy array to compare against.
    Returns:
        Correlation score in [-1, 1]; 1.0 = identical histogram.
    """
    channels = [0, 1, 2]
    bins = [8, 8, 8]
    ranges = [0, 256, 0, 256, 0, 256]

    histograms = []
    for picture in (self.image, other_image):
        hist = cv2.calcHist([picture], channels, None, bins, ranges)
        # Normalise in place so image size does not affect the score.
        cv2.normalize(hist, hist, alpha=0, beta=1, norm_type=cv2.NORM_MINMAX)
        histograms.append(hist)

    own_hist, other_hist = histograms
    score = cv2.compareHist(own_hist, other_hist, cv2.HISTCMP_CORREL)
    return float(score)
1083
+
1084
def to_base64(self) -> str:
    """Return the image PNG-encoded and wrapped in base64 text.

    Handy for embedding images in JSON payloads or HTML data URIs.

    Returns:
        Base64 string (decoded as UTF-8) of the PNG-encoded image.
    """
    from base64 import b64encode

    # imencode returns (success_flag, buffer); only the buffer is used.
    _ok, png_buffer = cv2.imencode(".png", self.image)
    encoded = b64encode(png_buffer)
    return encoded.decode("utf-8")