openvisionkit 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1230 @@
1
+ import math
2
+ import time
3
+ from collections import deque
4
+ from itertools import combinations
5
+ from pathlib import Path
6
+
7
+ import cv2
8
+ import mediapipe as mp
9
+ import numpy as np
10
+ from mediapipe.tasks import python
11
+ from mediapipe.tasks.python import vision
12
+
13
# NOTE(review): this opens capture device 0 as soon as the module is imported.
# Nothing in the visible part of the file reads `cap` - confirm it is still
# needed before relying on (or removing) this import-time side effect.
cap = cv2.VideoCapture(0)

# Folder named "models" located next to this source file.
_MODEL_DIR = Path(__file__).parent.joinpath("models")
# String path of the hand landmarker asset; used as HandDetector's default model_path.
_DEFAULT_MODEL = str(_MODEL_DIR.joinpath("hand_landmarker.task"))
17
+
18
+
19
+ # There are 21 hand landmarks in total, and the tips of the fingers are represented by the following landmark indices: 4 (thumb), 8 (index finger), 12 (middle finger), 16 (ring finger), and 20 (little finger). These indices correspond to the specific landmarks that represent the tips of each finger in the hand landmark detection model. By accessing these landmarks, you can determine the position and state of each finger for various applications such as gesture recognition or hand tracking.
20
+ class HandDetector:
21
def __init__(
    self,
    model_path=_DEFAULT_MODEL,
    running_mode="IMAGE",
    max_hands=2,
    detection_confidence=0.5,
    hand_presence_confidence=0.5,
    tracking_confidence=0.5,
    smoothing_window=8,
    calibration_samples=None,
):
    """Create the MediaPipe hand landmarker and initialise detector state.

    Args:
        model_path (str): Path to the hand landmarker model file.
        running_mode (str): Name of a ``vision.RunningMode`` member; it is
            resolved with ``getattr``.
            NOTE(review): ``set_landmarks_image`` always calls ``detect()``;
            confirm that modes other than "IMAGE" are really supported.
        max_hands (int): Maximum number of hands to detect (``num_hands``).
        detection_confidence (float): Passed as ``min_hand_detection_confidence``.
        hand_presence_confidence (float): Passed as ``min_hand_presence_confidence``.
        tracking_confidence (float): Passed as ``min_tracking_confidence``.
        smoothing_window (int): Maximum number of distance estimates kept in
            ``distance_history`` and averaged by ``estimate_distance_cm``.
        calibration_samples (list | None): Optional sequence of
            ``(palm_width_px, distance_cm)`` pairs. When truthy it is handed to
            ``fit_polynomial``; otherwise no distance model is fitted and
            ``self.model`` stays ``None``.
    """
    self.running_mode = getattr(vision.RunningMode, running_mode)
    base_options = python.BaseOptions(model_asset_path=model_path)
    options = vision.HandLandmarkerOptions(
        base_options=base_options,
        running_mode=self.running_mode,
        num_hands=max_hands,
        min_hand_detection_confidence=detection_confidence,
        min_hand_presence_confidence=hand_presence_confidence,
        min_tracking_confidence=tracking_confidence,
    )
    self.detector = vision.HandLandmarker.create_from_options(options)

    # Drawing helpers used by draw_landmarks().
    self.mp_hands = mp.tasks.vision.HandLandmarksConnections
    self.mp_drawing_styles = mp.tasks.vision.drawing_styles
    self.mp_drawing_utils = mp.tasks.vision.drawing_utils
    self.MARGIN = 5
    self.HANDEDNESS_TEXT_COLOR = (0, 165, 255)

    # Landmark indices within the 21-point hand model.
    self.fingerTips = [4, 8, 12, 16, 20]  # thumb, index, middle, ring, little
    self.fingerPips = [6, 10, 14, 18]
    self.fingerDips = [7, 11, 15, 19]
    self.wrist = 0
    self.finger_mcp = [2, 5, 9, 13, 17]

    # Rolling window of accepted distance estimates.
    self.distance_history = deque(maxlen=smoothing_window)
    # Landmarks of the first hand from the last get_landmarks() call.
    self._lm_list = []

    # Always define the distance model so estimate_distance_cm() can test it
    # for None instead of failing with AttributeError when no calibration
    # samples were supplied.
    self.model = None
    if calibration_samples:
        self.fit_polynomial(calibration_samples)
71
+
72
def _to_mp_image(self, image):
    """Wrap an OpenCV BGR frame as a MediaPipe SRGB image.

    Args:
        image: Frame in BGR channel order (as produced by OpenCV).

    Returns:
        An ``mp.Image`` built from the RGB-converted pixel data.
    """
    return mp.Image(
        image_format=mp.ImageFormat.SRGB,
        data=cv2.cvtColor(image, cv2.COLOR_BGR2RGB),
    )
82
+
83
def set_landmarks_image(self, image):
    """Run hand detection on ``image`` and cache the results on the instance.

    After the call:
        * ``self.mp_image`` holds the MediaPipe image built from ``image``;
        * ``self.hand_landmarks_list`` holds the detector's ``hand_landmarks``
          (one entry per detected hand);
        * ``self.handedness_list`` holds the detector's ``handedness`` result.

    Args:
        image: Frame in BGR channel order.
    """
    mp_frame = self._to_mp_image(image)
    self.mp_image = mp_frame

    result = self.detector.detect(mp_frame)
    self.hand_landmarks_list = result.hand_landmarks
    self.handedness_list = result.handedness
93
+
94
def draw_landmarks(
    self,
    img_bgr,
    to_draw_landmark=True,
    to_draw_center_point=True,
    to_draw_bounding_box=True,
    to_put_handle_label=True,
    flip_hands=False,
):
    """Detect hands in a BGR frame and return an annotated copy plus hand data.

    Args:
        img_bgr (numpy.ndarray): Input frame in BGR channel order.
        to_draw_landmark (bool): Draw the landmarks and their connections.
        to_draw_center_point (bool): Draw a filled circle at the bounding-box
            centre of each hand.
        to_draw_bounding_box (bool): Draw a rectangle around each hand, padded
            by 20 px on every side.
        to_put_handle_label (bool): Write the hand label above the box when a
            label is available.
        flip_hands (bool): Swap the "Left"/"Right" labels (forwarded to
            ``get_landmarks``); useful for mirrored webcam feeds.

    Returns:
        tuple: ``(annotated_image, output_landmarks)``.

        ``annotated_image`` is a BGR image with the requested overlays. When no
        hand is detected the input ``img_bgr`` object itself is returned.

        ``output_landmarks`` holds one tuple per detected hand:
        ``(landmarks_list, bounding_box, landmark_params, hand_type)`` where
            - ``landmarks_list`` is the list of ``[id, x, y, z]`` entries
              produced by ``get_landmarks``;
            - ``bounding_box`` is ``(xmin, ymin, width, height)``;
            - ``landmark_params`` is a dict with the keys ``center_point``,
              ``width``, ``height`` and ``bbox``;
            - ``hand_type`` is the hand label, or ``None`` when unavailable.
    """
    # Always re-run detection so the cached results belong to this frame.
    self.set_landmarks_image(img_bgr)

    if not self.hand_landmarks_list:
        return img_bgr, []

    # Overlays are drawn on a copy of the RGB frame held by MediaPipe and the
    # whole image is converted back to BGR once at the end.
    annotated_image = np.copy(self.mp_image.numpy_view())

    all_hands = self.get_landmarks(img_bgr.copy(), flip_hands=flip_hands)
    output_landmarks = []
    for idx, hand_data in enumerate(all_hands):
        hand_landmarks = self.hand_landmarks_list[idx]
        bbox = hand_data["bounding_box"]
        center = hand_data["center_point"]
        label = hand_data["hand_type"]

        xmin, ymin, w, h = bbox
        center_x, center_y = center

        if to_draw_landmark:
            self.mp_drawing_utils.draw_landmarks(
                annotated_image,
                hand_landmarks,
                self.mp_hands.HAND_CONNECTIONS,
                self.mp_drawing_styles.get_default_hand_landmarks_style(),
                self.mp_drawing_styles.get_default_hand_connections_style(),
            )

        if to_draw_bounding_box:
            cv2.rectangle(
                annotated_image,
                (xmin - 20, ymin - 20),
                (xmin + w + 20, ymin + h + 20),
                (0, 255, 0),
                2,
            )

        if to_draw_center_point:
            cv2.circle(
                annotated_image, (center_x, center_y), 8, (0, 0, 255), cv2.FILLED
            )

        if to_put_handle_label and label:
            cv2.putText(
                annotated_image,
                label,
                (xmin, ymin - 10),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.7,
                self.HANDEDNESS_TEXT_COLOR,
                2,
                cv2.LINE_AA,
            )

        landmark_params = {
            "center_point": center,
            "width": w,
            "height": h,
            "bbox": (xmin, ymin, w, h),
        }
        output_landmarks.append(
            (hand_data["landmarks_list"], bbox, landmark_params, label)
        )

    annotated_image = cv2.cvtColor(annotated_image, cv2.COLOR_RGB2BGR)

    return annotated_image, output_landmarks
199
+
200
def get_landmarks(self, img, flip_hands=False):
    """Build per-hand landmark, bounding-box and label data in pixel units.

    Detection is only run here when no non-empty ``hand_landmarks_list`` is
    cached on the instance; otherwise the cached detection is reused.

    Args:
        img (numpy.ndarray): Image whose ``shape`` is ``(height, width, channels)``.
        flip_hands (bool): Swap the "Left"/"Right" labels. Useful when
            displaying mirrored webcam feeds.

    Returns:
        list: One dictionary per detected hand::

            {
                "landmarks_list": [[0, x0, y0, z0], ..., [20, x20, y20, z20]],
                "bounding_box": (xmin, ymin, width, height),
                "center_point": (center_x, center_y),
                "hand_type": "Left" | "Right" | None,
            }

        ``x`` and ``z`` are scaled by the image width and ``y`` by the image
        height, then truncated to ``int``. The landmarks of the first hand are
        also cached in ``self._lm_list``.
    """
    height, width, _ = img.shape

    # Detect only when nothing (or an empty result) is cached.
    if not getattr(self, "hand_landmarks_list", None):
        self.set_landmarks_image(img.copy())

    swapped_label = {"Right": "Left", "Left": "Right"}
    hands = []

    for hand_index, raw_landmarks in enumerate(self.hand_landmarks_list):
        points = [
            [
                point_id,
                int(point.x * width),
                int(point.y * height),
                int(point.z * width),
            ]
            for point_id, point in enumerate(raw_landmarks)
        ]

        # Axis-aligned bounding box around all landmarks of this hand.
        xs = [entry[1] for entry in points]
        ys = [entry[2] for entry in points]
        left, top = min(xs), min(ys)
        box_w = max(xs) - left
        box_h = max(ys) - top

        # Handedness may be missing for this index; keep None in that case.
        label = None
        if hand_index < len(self.handedness_list):
            label = self.handedness_list[hand_index][0].category_name
        if flip_hands:
            label = swapped_label.get(label, label)

        if hand_index == 0:
            self._lm_list = points

        hands.append(
            {
                "landmarks_list": points,
                "bounding_box": (left, top, box_w, box_h),
                "center_point": (left + box_w // 2, top + box_h // 2),
                "hand_type": label,
            }
        )

    return hands
286
+
287
def is_finger_point_inside_rect(self, point, rect):
    """Report whether ``point`` lies inside ``rect`` (edges included).

    Args:
        point: ``(x, y)`` pair.
        rect: ``(x, y, w, h)`` rectangle; this is the only supported layout.

    Returns:
        bool: ``False`` when the width or height is negative, otherwise
        whether the point is within the rectangle bounds.
    """
    x, y = point
    left, top, width, height = rect

    # A rectangle with a negative extent can never contain a point.
    if width < 0 or height < 0:
        return False

    right = left + width
    bottom = top + height
    return (left <= x <= right) and (top <= y <= bottom)
301
+
302
def finger_joined(self, p1, p2, image, landmarks, threshold=0.25):
    """Decide whether two landmarks are close enough to count as joined.

    The distance comes from ``self._normalize(landmarks, p1, p2)`` and is
    compared against ``threshold``.

    Args:
        p1 (int): Index of the first landmark.
        p2 (int): Index of the second landmark.
        image (numpy.ndarray): Frame to copy for the returned image.
        landmarks (list): Landmarks of one hand, in the form expected by
            ``_normalize``.
        threshold (float): Fingers are joined when the normalized distance is
            strictly below this value.

    Returns:
        tuple: ``(is_joined, annotated)`` where ``is_joined`` is a ``bool`` and
        ``annotated`` is a copy of ``image``. Nothing is drawn on the copy at
        present. With empty ``landmarks`` the result is ``(False, copy)``.
    """
    # Local import keeps this block self-contained; the measurement used to be
    # written to stdout on every call, which floods per-frame loops.
    import logging

    annotated = image.copy()

    if not landmarks:
        return False, annotated

    normalized = self._normalize(landmarks, p1, p2)
    logging.getLogger(__name__).debug(
        "Normalized distance between landmarks %s and %s: %.4f",
        p1,
        p2,
        normalized,
    )
    return bool(normalized < threshold), annotated
325
+
326
def get_distance_between_landmarks(
    self,
    landmark_id_1: int,
    landmark_id_2: int,
    hand_landmarks,
    frame_shape=None,
    return_points: bool = True,
):
    """Measure the 2D distance between two landmarks of one hand.

    Args:
        landmark_id_1: Index of the first landmark.
        landmark_id_2: Index of the second landmark.
        hand_landmarks: Either a sequence of landmark objects with ``x``/``y``
            attributes (as stored in ``self.hand_landmarks_list``) or an
            object exposing such a sequence through a ``landmark`` attribute.
        frame_shape: ``frame.shape`` to get a pixel distance; the first two
            entries are read as ``(height, width)``. When ``None`` the
            distance is computed on the raw landmark coordinates.
        return_points: When true, the two points are returned as well.

    Returns:
        ``(distance, point1, point2)`` when ``return_points`` is true,
        otherwise just ``distance``. Points are ``(x, y)`` tuples, truncated
        to ``int`` pixels when ``frame_shape`` is given.
    """
    # Accept both container styles: an object with a `.landmark` sequence and
    # the plain per-hand sequence this class caches after detection.
    points = getattr(hand_landmarks, "landmark", hand_landmarks)
    lm1 = points[landmark_id_1]
    lm2 = points[landmark_id_2]

    if frame_shape is not None:
        h, w = frame_shape[:2]
        p1 = (int(lm1.x * w), int(lm1.y * h))
        p2 = (int(lm2.x * w), int(lm2.y * h))
    else:
        p1 = (lm1.x, lm1.y)
        p2 = (lm2.x, lm2.y)

    distance = math.hypot(p1[0] - p2[0], p1[1] - p2[1])

    if return_points:
        return distance, p1, p2

    return distance
372
+
373
def fingers_up(self, hand_landmarks=None, hand_type=None):
    """Report which of the five fingers of one hand are raised.

    Args:
        hand_landmarks (list, optional): Landmarks of a single hand as
            ``[id, x, y, z]`` entries (the ``landmarks_list`` format of
            ``get_landmarks``). Defaults to the landmarks cached by the last
            ``get_landmarks`` call (first detected hand).
        hand_type (str, optional): "Left" or "Right" for the hand being
            analysed. Defaults to the label of the first entry in
            ``self.handedness_list``, matching the cached landmarks.

    Returns:
        list: ``[thumb, index, middle, ring, little]`` with 1 for a raised
        finger and 0 otherwise. An empty list is returned when there are no
        landmarks to analyse.

    The thumb is judged on the x axis because it folds sideways: for a
    "Right" hand it counts as raised when the tip (4) has a larger x than the
    joint before it (3); for any other label the comparison is reversed. The
    remaining fingers are raised when the tip has a smaller y than the joint
    two indices below it.
    """
    if hand_landmarks is None:
        hand_landmarks = self._lm_list
    if hand_landmarks is None or len(hand_landmarks) == 0:
        return []

    # Use exactly one handedness label so the thumb contributes exactly one
    # entry regardless of how many hands were detected in the frame.
    if hand_type is None:
        handedness_list = getattr(self, "handedness_list", None) or []
        if handedness_list:
            hand_type = handedness_list[0][0].category_name

    thumb_tip = self.fingerTips[0]
    tip_x = hand_landmarks[thumb_tip][1]
    joint_x = hand_landmarks[thumb_tip - 1][1]
    if hand_type == "Right":
        thumb_raised = tip_x > joint_x
    else:
        thumb_raised = tip_x < joint_x
    fingers = [1 if thumb_raised else 0]

    # Index, middle, ring and little finger: compare tip y with the y of the
    # joint two landmarks earlier.
    for tip in self.fingerTips[1:]:
        raised = hand_landmarks[tip][2] < hand_landmarks[tip - 2][2]
        fingers.append(1 if raised else 0)

    return fingers
425
+
426
def get_distance(
    self, p1, p2, img, to_draw_circle_key_point=True, to_draw_line=True
):
    """Measure the distance between two pixel points and optionally draw them.

    Args:
        p1: ``(x, y)`` of the first point.
        p2: ``(x, y)`` of the second point.
        img: Image drawn on in place when a drawing flag is set.
        to_draw_circle_key_point (bool): Draw filled circles at both points
            and at their midpoint.
        to_draw_line (bool): Draw a line connecting the two points.

    Returns:
        tuple: ``(length, img, [x1, y1, x2, y2, cx, cy])`` where ``(cx, cy)``
        is the floor-divided midpoint.
    """
    (ax, ay), (bx, by) = p1, p2
    mid_x = (ax + bx) // 2
    mid_y = (ay + by) // 2

    length = math.hypot(bx - ax, by - ay)

    endpoint_color = (255, 0, 255)
    if to_draw_line:
        cv2.line(img, (ax, ay), (bx, by), endpoint_color, 3)

    if to_draw_circle_key_point:
        markers = (
            ((ax, ay), endpoint_color),
            ((bx, by), endpoint_color),
            ((mid_x, mid_y), (0, 0, 255)),
        )
        for marker_center, marker_color in markers:
            cv2.circle(img, marker_center, 8, marker_color, cv2.FILLED)

    return length, img, [ax, ay, bx, by, mid_x, mid_y]
449
+
450
def euclidean_distance(self, p1, p2):
    """Return the straight-line distance between two points.

    Args:
        p1: First point as a sequence of coordinates, e.g. ``(x, y)``.
        p2: Second point with the same number of coordinates as ``p1``.

    Returns:
        float: Norm of the element-wise difference ``p1 - p2``.
    """
    delta = np.array(p1) - np.array(p2)
    return float(np.linalg.norm(delta))
463
+
464
+ def compute_real_palm_width(pixel_width, distance_cm, focal_length_px):
465
+ """
466
+ Compute the real palm width in centimeters based on the detected palm width in pixels, the known distance from the camera to the hand, and the focal length of the camera in pixels.
467
+ The function uses the formula: real_palm_width_cm = (pixel_width * distance_cm) / focal_length_px, where pixel_width is the measured width of the palm in pixels, distance_cm is the known distance from the camera to the hand in centimeters, and focal_length_px is the focal length of the camera in pixels. This calculation allows for estimating the actual size of the palm in real-world units (centimeters) based on the detected size in pixels and the known distance from the camera. This can be useful for applications that require understanding the physical dimensions of the hand or for distance estimation based on the size of the detected palm in the image.
468
+
469
+ Args:
470
+ pixel_width (float): The measured width of the palm in pixels as detected by the hand landmark detection model. This value is used in the calculation of the real palm width in centimeters.
471
+ distance_cm (float): The known distance from the camera to the hand in centimeters. This value is used in the calculation of the real palm width in centimeters.
472
+ focal_length_px (float): The focal length of the camera in pixels. This value is used in the calculation of the real palm width in centimeters.
473
+
474
+ Returns:
475
+ float: The calculated real palm width in centimeters based on the detected pixel width, known distance, and focal length. If the pixel width is zero or negative, or if the focal length is zero or negative, the function returns None to indicate that the real palm width cannot be computed with the given input.
476
+ """
477
+ if pixel_width <= 0 or focal_length_px <= 0:
478
+ return None
479
+
480
+ return (pixel_width * distance_cm) / focal_length_px
481
+
482
def landmark_to_pixel(self, landmark, image_width, image_height):
    """Scale a landmark's ``x``/``y`` attributes to integer pixel coordinates.

    Args:
        landmark: Object with ``x`` and ``y`` attributes.
        image_width (int): Width the ``x`` value is multiplied by.
        image_height (int): Height the ``y`` value is multiplied by.

    Returns:
        tuple: ``(x_px, y_px)``, each truncated with ``int``.
    """
    x_px = int(landmark.x * image_width)
    y_px = int(landmark.y * image_height)
    return x_px, y_px
495
+
496
def palm_width_px(self, img, hand_landmarks, drawLandmarks=False):
    """Measure the palm width as the index-MCP to pinky-MCP distance.

    Args:
        img: Image drawn on in place when ``drawLandmarks`` is true.
        hand_landmarks (list): Landmarks of one hand; entries are indexed as
            ``entry[1]`` (x) and ``entry[2]`` (y), i.e. the ``[id, x, y, z]``
            layout.
        drawLandmarks (bool): Draw a line between the two MCP points and a
            filled circle on each of them.

    Returns:
        tuple: ``(width, index_mcp, pinky_mcp)`` where ``width`` is the
        Euclidean distance and the other two items are ``(x, y)`` tuples.
    """

    def xy_of(landmark_index):
        # Pick the x/y columns of the requested landmark entry.
        entry = hand_landmarks[landmark_index]
        return entry[1], entry[2]

    index_mcp = xy_of(self.finger_mcp[1])
    pinky_mcp = xy_of(self.finger_mcp[4])

    if drawLandmarks:
        cv2.line(img, index_mcp, pinky_mcp, (0, 255, 0), 2)
        for mcp_point in (index_mcp, pinky_mcp):
            cv2.circle(img, mcp_point, 6, (255, 0, 0), -1)

    width = self.euclidean_distance(index_mcp, pinky_mcp)
    return width, index_mcp, pinky_mcp
524
+
525
+ def _palm_scale(self, hand):
526
+ """
527
+ Calculate the palm scale of a hand based on specific landmarks.
528
+
529
+ Args:
530
+ hand (list): A list of hand landmarks. Each landmark is expected to be a tuple of (x, y) coordinates.
531
+
532
+ Returns:
533
+ float: The calculated palm scale of the hand, which is an average of the distances between
534
+ """
535
+ # stable reference (best practice)
536
+ return (
537
+ self.euclidean_distance(
538
+ hand[self.wrist], hand[self.finger_mcp[2]]
539
+ ) # wrist → middle MCP
540
+ + self.euclidean_distance(
541
+ hand[self.finger_mcp[1]], hand[self.finger_mcp[4]]
542
+ ) # index MCP → pinky MCP
543
+ ) / 2
544
+
545
def calibrate_focal_length(self, palm_width_px):
    """Derive and store the camera focal length from one reference measurement.

    Uses ``focal_length = (palm_width_px * known_distance_cm) / real_palm_width_cm``
    and saves the result in ``self.focal_length_px``.

    NOTE(review): ``self.known_distance_cm`` and ``self.real_palm_width_cm``
    are read here but are not assigned in the visible part of the class -
    confirm where they are set.

    Args:
        palm_width_px (float): Palm width in pixels measured at the known
            distance.

    Returns:
        float | None: The focal length in pixels, or ``None`` (leaving the
        stored value untouched) when ``palm_width_px`` is zero or negative.
    """
    if palm_width_px <= 0:
        return None

    width_times_distance = palm_width_px * self.known_distance_cm
    self.focal_length_px = width_times_distance / self.real_palm_width_cm
    return self.focal_length_px
564
+
565
def get_dynamic_palm_width(self, hand, image_shape, distance_cm, focal_length_px):
    """Estimate the real palm width from the index-MCP to pinky-MCP span.

    The pixel span between the two MCP landmarks is converted with
    ``real_width_cm = (pixel_width * distance_cm) / focal_length_px``.

    Args:
        hand: Either a sequence of landmark objects with ``x``/``y``
            attributes (as stored in ``self.hand_landmarks_list``) or an
            object exposing such a sequence through a ``landmark`` attribute.
        image_shape (tuple): Image shape; only the first two entries are
            read, as ``(height, width)``.
        distance_cm (float): Distance between camera and hand in centimetres.
        focal_length_px (float): Camera focal length in pixels.

    Returns:
        tuple: ``(real_width_cm, pixel_width, p1, p2)`` where ``p1``/``p2``
        are the integer pixel positions of the index and pinky MCP.
        ``real_width_cm`` is ``None`` when ``focal_length_px`` is zero or
        negative, because the conversion is undefined in that case.
    """
    # Only height and width matter, so 2D (grayscale) shapes work as well.
    h, w = image_shape[:2]

    # Accept both container styles: an object with a `.landmark` sequence and
    # the plain per-hand sequence this class caches after detection.
    lm = getattr(hand, "landmark", hand)
    index_mcp = lm[self.finger_mcp[1]]
    pinky_mcp = lm[self.finger_mcp[4]]

    p1 = (int(index_mcp.x * w), int(index_mcp.y * h))
    p2 = (int(pinky_mcp.x * w), int(pinky_mcp.y * h))

    pixel_width = float(np.linalg.norm(np.array(p1) - np.array(p2)))

    if focal_length_px <= 0:
        real_width_cm = None
    else:
        real_width_cm = (pixel_width * distance_cm) / focal_length_px

    return real_width_cm, pixel_width, p1, p2
592
+
593
def fit_polynomial(self, calibration_samples, polynomial_degree=2):
    """Fit a polynomial mapping palm width (px) to distance (cm).

    The fitted model is stored in ``self.model`` as a ``numpy.poly1d``.

    Args:
        calibration_samples: Sequence of ``(palm_width_px, distance_cm)``
            pairs::

                [
                    (palm_width_px, distance_cm),
                    (palm_width_px, distance_cm),
                    ...
                ]
        polynomial_degree (int): Degree of the fitted polynomial
            (2 gives ``y = Ax^2 + Bx + C``).

    Returns:
        numpy.ndarray: Polynomial coefficients, highest power first.
    """
    widths_px = np.array(
        [pair[0] for pair in calibration_samples], dtype=np.float32
    )
    distances_cm = np.array(
        [pair[1] for pair in calibration_samples], dtype=np.float32
    )

    coeffs = np.polyfit(widths_px, distances_cm, polynomial_degree)
    self.model = np.poly1d(coeffs)
    return coeffs
610
+
611
def adaptive_distance_cm(
    self,
    palm_width_px,
    frame_width_px,
    horizontal_fov_deg=60,
    estimated_palm_width_cm=8.5,
):
    """Estimate the camera-to-hand distance without prior calibration.

    The focal length is derived from the frame width and horizontal field of
    view, ``focal = frame_width_px / (2 * tan(fov / 2))``, and the distance
    follows as ``estimated_palm_width_cm * focal / palm_width_px``.

    Args:
        palm_width_px (float): Measured palm width in pixels.
        frame_width_px (float): Width of the frame in pixels.
        horizontal_fov_deg (float): Horizontal field of view in degrees.
        estimated_palm_width_cm (float): Assumed real palm width in
            centimetres.

    Returns:
        float | None: Estimated distance in centimetres, or ``None`` when
        ``palm_width_px`` is zero or negative.
    """
    if palm_width_px <= 0:
        return None

    half_fov_rad = math.radians(horizontal_fov_deg / 2)
    focal_length_px = frame_width_px / (2 * math.tan(half_fov_rad))

    return (estimated_palm_width_cm * focal_length_px) / palm_width_px
628
+
629
def estimate_distance_cm(
    self,
    palm_width_px,
):
    """Estimate the smoothed camera-to-hand distance from the palm width.

    The fitted model in ``self.model`` (see ``fit_polynomial``) is evaluated
    at ``palm_width_px``. Accepted values are appended to
    ``self.distance_history`` and the mean of that window is returned.

    Args:
        palm_width_px (float): Measured palm width in pixels.

    Returns:
        float | None: Mean of the recent accepted estimates in centimetres.
        ``None`` is returned, and the history left untouched, when no model
        has been fitted, when ``palm_width_px`` is ``None``, zero or negative,
        or when the prediction is not within ``(0, 300]`` cm.
    """
    # The model attribute may be absent when no calibration was ever run.
    model = getattr(self, "model", None)
    if model is None:
        return None

    if palm_width_px is None or palm_width_px <= 0:
        return None

    distance = float(model(palm_width_px))
    # Reject impossible values; written as a positive range test so that a
    # NaN prediction is rejected too.
    if not 0 < distance <= 300:
        return None

    self.distance_history.append(distance)
    return float(np.mean(self.distance_history))
655
+
656
def _normalize(self, hand, p1, p2):
    """Return the distance between two landmarks in palm-scale units.

    The distance reported by ``self.euclidean_distance`` for ``hand[p1]`` and
    ``hand[p2]`` is divided by ``self._palm_scale(hand)`` so the value does
    not depend on how large the hand appears in the frame.

    Args:
        hand (list): Landmarks of a single hand, indexable by landmark id.
        p1 (int): Index of the first landmark.
        p2 (int): Index of the second landmark.

    Returns:
        float: ``distance / palm_scale``, or 0 when the palm scale is zero.
    """
    separation = self.euclidean_distance(hand[p1], hand[p2])
    palm_scale = self._palm_scale(hand)
    # NOTE(review): a zero palm scale yields 0, which callers comparing
    # against a threshold will read as "touching" - confirm this is intended.
    return separation / palm_scale if palm_scale != 0 else 0
672
+
673
def is_fingers_joined(
    self,
    p1,
    p2,
    image,
    landmarks,
    threshold=0.25,
    draw_intersection_point=True,
    debug=False,
):
    """Report whether two landmarks are close enough to count as joined.

    The palm-normalized distance from ``self._normalize`` is compared with
    ``threshold``. When the landmarks are joined and drawing is enabled, a
    filled green dot is drawn on ``image`` (in place) at their midpoint.

    Args:
        p1 (int): Index of the first landmark.
        p2 (int): Index of the second landmark.
        image (numpy.ndarray): Image to annotate in place.
        landmarks (list): Landmarks of one hand; items are read as
            ``[id, x, y, ...]``.
        threshold (float): Normalized distance below which the landmarks
            count as joined.
        draw_intersection_point (bool): Draw the midpoint marker when joined.
        debug (bool): Print the normalized distance. Off by default so the
            method does not write to stdout on every frame.

    Returns:
        bool: True if the landmarks are joined; False otherwise, including
        when ``landmarks`` is empty or too short for ``p1``/``p2``.
    """
    if not landmarks or len(landmarks) <= max(p1, p2):
        return False

    normalized = self._normalize(landmarks, p1, p2)
    if debug:
        print(f"Normalized distance between landmarks {p1} and {p2}: {normalized:.4f}")

    is_joined = bool(normalized < threshold)
    if is_joined and draw_intersection_point:
        x1, y1 = landmarks[p1][1], landmarks[p1][2]
        x2, y2 = landmarks[p2][1], landmarks[p2][2]
        # OpenCV drawing calls need integer pixel coordinates.
        midpoint = (int((x1 + x2) // 2), int((y1 + y2) // 2))
        # The marker is only drawn in the joined state, always in green.
        cv2.circle(image, midpoint, 10, (0, 255, 0), cv2.FILLED)

    return is_joined
703
+
704
def is_fingers_joined_2(
    self,
    p1,
    p2,
    image,
    landmarks,
    threshold=0.18,
    draw_intersection_point=True,
    debug=False,
):
    """Report whether two landmarks are joined, scaled by a wrist-based palm size.

    The pixel distance between the two landmarks is divided by a palm size
    computed as the mean of the wrist -> index-MCP (0 -> 5) and
    wrist -> pinky-MCP (0 -> 17) distances. If that cannot be computed the
    method falls back to ``self._palm_scale``. The landmarks count as joined
    when the normalized distance is below ``threshold``.

    Args:
        p1 (int): Index of the first landmark.
        p2 (int): Index of the second landmark.
        image (numpy.ndarray): Image to annotate in place.
        landmarks (list): Landmarks of one hand; items are read as
            ``[id, x, y, ...]`` (a trailing z value is optional).
        threshold (float): Normalized distance below which the landmarks
            count as joined. It is used as given; no per-hand adaptation
            is applied.
        draw_intersection_point (bool): Draw a midpoint marker: filled red
            when joined, a thin yellow ring otherwise.
        debug (bool): Print the intermediate distances.

    Returns:
        bool: True if the landmarks are joined; False otherwise, including
        when ``landmarks`` is empty or too short for ``p1``/``p2``.
    """
    if not landmarks or len(landmarks) <= max(p1, p2):
        return False

    # Slice instead of unpacking a fixed number of values so that both
    # [id, x, y] and [id, x, y, z] landmarks are accepted.
    x1, y1 = landmarks[p1][1:3]
    x2, y2 = landmarks[p2][1:3]
    pixel_dist = self.euclidean_distance((x1, y1), (x2, y2))

    try:
        wrist = landmarks[0][1:3]
        index_mcp = landmarks[5][1:3]
        pinky_mcp = landmarks[17][1:3]
        palm_size = (
            self.euclidean_distance(wrist, index_mcp)
            + self.euclidean_distance(wrist, pinky_mcp)
        ) / 2
    except (IndexError, TypeError, ValueError):
        # Short or malformed landmark list: use the class-wide palm scale.
        palm_size = self._palm_scale(landmarks)

    # Guard the division below against a degenerate (zero-size) palm.
    palm_size = max(palm_size, 1e-6)
    normalized_dist = pixel_dist / palm_size
    is_joined = bool(normalized_dist < threshold)

    if debug:
        print(
            f"[JOIN DEBUG] pixel={pixel_dist:.2f}, "
            f"palm={palm_size:.2f}, "
            f"norm={normalized_dist:.4f}, "
            f"threshold={threshold}"
        )

    if draw_intersection_point:
        cx = int((x1 + x2) / 2)
        cy = int((y1 + y2) / 2)
        color = (0, 0, 255) if is_joined else (0, 255, 255)
        thickness = cv2.FILLED if is_joined else 1
        cv2.circle(image, (cx, cy), 8, color, thickness)

    return is_joined
797
+
798
def joined_fingers(self, image, landmarks, threshold=0.25, debug=False):
    """Flag every finger whose tip touches the tip of another finger.

    Every pair of fingertip landmarks from ``self.fingerTips`` is compared
    using the palm-normalized distance from ``self._normalize``. When a pair
    is closer than ``threshold`` both fingers of that pair are flagged.

    Args:
        image (numpy.ndarray): Source image; it is copied, not modified.
        landmarks (list): Landmarks of one hand (at least 21 entries).
        threshold (float): Normalized distance below which two fingertips
            count as joined.
        debug (bool): Print the normalized distance of every pair.

    Returns:
        tuple(list, numpy.ndarray): ``(joined_state, annotated)`` where
        ``joined_state`` holds one 0/1 flag per entry of ``self.fingerTips``
        (1 = joined with at least one other finger) and ``annotated`` is a
        copy of ``image``. Nothing is drawn on the copy by this method.
    """
    annotated = image.copy()
    joined_state = [0] * len(self.fingerTips)
    if not landmarks or len(landmarks) < 21:
        return joined_state, annotated

    # Pair up (finger index, tip landmark) tuples so a joined pair can be
    # written back to the right fingers. The position of the pair in the
    # combinations sequence is NOT a finger index (there are 10 pairs for
    # 5 fingers), which is what previously caused IndexError.
    finger_pairs = combinations(enumerate(self.fingerTips), 2)
    for (finger_a, tip_a), (finger_b, tip_b) in finger_pairs:
        normalized = self._normalize(landmarks, tip_a, tip_b)
        if debug:
            print(f"Finger {tip_a} vs {tip_b} -> {normalized:.4f}")
        if normalized < threshold:
            joined_state[finger_a] = 1
            joined_state[finger_b] = 1

    return joined_state, annotated
827
+
828
def count_fingers(self):
    """Count the fingers reported as raised by ``fingers_up()``.

    Returns:
        int: Sum of the per-finger flags returned by ``self.fingers_up()``.
    """
    finger_flags = self.fingers_up()
    return sum(finger_flags)
833
+
834
def is_fist(self):
    """Report whether the hand is a closed fist.

    Returns:
        bool: True only when ``self.fingers_up()`` yields exactly five flags
        and every one of them equals 0.
    """
    states = self.fingers_up()
    if len(states) != 5:
        return False
    return all(state == 0 for state in states)
840
+
841
def is_open_hand(self):
    """Report whether the hand is fully open.

    Returns:
        bool: True only when ``self.fingers_up()`` yields exactly five flags
        and every one of them equals 1.
    """
    states = self.fingers_up()
    if len(states) != 5:
        return False
    for state in states:
        if not state == 1:
            return False
    return True
847
+
848
def is_thumbs_up(self):
    """Report whether the hand shows a thumbs-up.

    Returns:
        bool: True only when ``self.fingers_up()`` yields exactly five
        flags, the first (thumb) equals 1 and all remaining flags equal 0.
    """
    states = self.fingers_up()
    if len(states) != 5:
        return False
    if not states[0] == 1:
        return False
    return all(state == 0 for state in states[1:])
856
+
857
def is_peace_sign(self):
    """Report whether the hand shows a peace / V sign.

    Returns:
        bool: True only when ``self.fingers_up()`` yields exactly five
        flags with index and middle equal to 1 and thumb, ring and little
        equal to 0.
    """
    states = self.fingers_up()
    if len(states) != 5:
        return False
    thumb, index, middle, ring, little = states
    return index == 1 and middle == 1 and thumb == 0 and ring == 0 and little == 0
870
+
871
+ # ─────────────────────────── NEW METHODS ───────────────────────────
872
+
873
def get_gesture_name(self, hand_landmarks):
    """Map the finger states of a hand to a gesture label.

    Gestures are tested in a fixed priority order (fist, open hand,
    thumbs-up, peace) and the first match wins.

    Args:
        hand_landmarks: landmarks_list from get_landmarks() — list of [id, x, y, z].
    Returns:
        str: 'Fist' | 'Open' | 'ThumbsUp' | 'Peace' | 'Unknown'. 'Unknown'
        is also returned when fewer than five finger flags are available.
    """
    states = self.fingers_up(hand_landmarks)
    if len(states) < 5:
        return "Unknown"

    if all(state == 0 for state in states):
        return "Fist"
    if all(state == 1 for state in states):
        return "Open"

    thumb_only = states[0] == 1 and all(state == 0 for state in states[1:])
    if thumb_only:
        return "ThumbsUp"

    index_and_middle_up = states[1] == 1 and states[2] == 1
    others_down = states[0] == 0 and states[3] == 0 and states[4] == 0
    if index_and_middle_up and others_down:
        return "Peace"

    return "Unknown"
900
+
901
def get_finger_count(self, hand_landmarks):
    """Count the raised fingers of the given hand.

    Args:
        hand_landmarks: landmarks_list from get_landmarks().
    Returns:
        int: Sum of the flags returned by ``self.fingers_up(hand_landmarks)``.
    """
    raised_flags = self.fingers_up(hand_landmarks)
    return sum(raised_flags)
910
+
911
def get_angle_between_landmarks(self, landmarks_list, a, b, c):
    """Measure the angle at landmark b between the rays b->a and b->c.

    Only the 2D (x, y) values stored at positions 1 and 2 of each landmark
    are used.

    Args:
        landmarks_list: List of [id, x, y, z] from get_landmarks().
        a, b, c: Landmark indices (e.g. 5, 6, 7 for index finger PIP joint).
    Returns:
        float: Angle in degrees (0–180). A zero-length ray yields 90 because
        the cosine collapses to 0.
    """
    point_a, point_b, point_c = (
        np.array([landmarks_list[idx][1], landmarks_list[idx][2]], dtype=float)
        for idx in (a, b, c)
    )
    ray_a = point_a - point_b
    ray_c = point_c - point_b

    # The small epsilon keeps the division defined for zero-length rays.
    denominator = np.linalg.norm(ray_a) * np.linalg.norm(ray_c) + 1e-6
    cosine = np.clip(np.dot(ray_a, ray_c) / denominator, -1.0, 1.0)
    return float(np.degrees(np.arccos(cosine)))
928
+
929
def get_hand_bbox(self, hand_data):
    """Return the bounding box stored in a hand data dict.

    Args:
        hand_data: Single dict from get_landmarks() list.
    Returns:
        tuple(int, int, int, int): (x, y, w, h), i.e. the value stored under
        the 'bounding_box' key. A missing key raises KeyError.
    """
    bounding_box = hand_data["bounding_box"]
    return bounding_box
938
+
939
def get_hand_center(self, hand_data):
    """Return the center point stored in a hand data dict.

    Args:
        hand_data: Single dict from get_landmarks() list.
    Returns:
        tuple(int, int): (cx, cy), i.e. the value stored under the
        'center_point' key. A missing key raises KeyError.
    """
    center_point = hand_data["center_point"]
    return center_point
948
+
949
def is_pointing(self, hand_landmarks):
    """Report whether only the index finger is raised.

    Args:
        hand_landmarks: landmarks_list from get_landmarks().
    Returns:
        bool: True only when ``self.fingers_up(hand_landmarks)`` yields
        exactly five flags with the index flag equal to 1 and all the
        others equal to 0.
    """
    states = self.fingers_up(hand_landmarks)
    if len(states) != 5:
        return False
    return states[1] == 1 and all(states[pos] == 0 for pos in (0, 2, 3, 4))
966
+
967
def get_wrist_position(self, hand_landmarks):
    """Return the (x, y) position of the wrist landmark (id 0).

    Args:
        hand_landmarks: landmarks_list from get_landmarks() — list of [id, x, y, z].
    Returns:
        tuple(int, int): Items 1 and 2 of the first landmark, i.e. the
        wrist's (x, y) pixel position.
    """
    wrist = hand_landmarks[0]
    return (wrist[1], wrist[2])
976
+
977
+ def get_fingertip_positions(self, hand_landmarks):
978
+ """Return pixel positions of all five fingertips.
979
+
980
+ Args:
981
+ hand_landmarks: landmarks_list from get_landmarks().
982
+ Returns:
983
+ dict: {'thumb': (x,y), 'index': (x,y), 'middle': (x,y), 'ring': (x,y), 'little': (x,y)}
984
+ """
985
+ names = ["thumb", "index", "middle", "ring", "little"]
986
+ return {
987
+ name: (hand_landmarks[tip][1], hand_landmarks[tip][2])
988
+ for name, tip in zip(names, self.fingerTips, strict=False)
989
+ }
990
+
991
def is_ok_sign(self, hand_landmarks, threshold=0.08):
    """Return True if the hand is making an OK sign.

    The OK sign requires the thumb tip (landmark 4) and index tip (landmark 8)
    to be close together while the middle, ring, and little fingers are
    raised. Closeness is measured with the palm-normalized distance from
    ``self._normalize``. The landmarks hold pixel coordinates, so comparing
    the raw pixel distance with a fractional threshold could practically
    never succeed.

    Args:
        hand_landmarks: landmarks_list from get_landmarks() — list of [id, x, y, z].
        threshold (float): Normalized thumb-index distance below which the
            two tips count as touching. Defaults to 0.08.
    Returns:
        bool: True for an OK sign; False otherwise, including when fewer
        than five finger flags or too few landmarks are available.
    """
    fingers = self.fingers_up(hand_landmarks)
    if len(fingers) < 5:
        return False
    # Landmark 8 is the highest index read below.
    if not hand_landmarks or len(hand_landmarks) <= 8:
        return False
    if not (fingers[2] == 1 and fingers[3] == 1 and fingers[4] == 1):
        return False

    return bool(self._normalize(hand_landmarks, 4, 8) < threshold)
1008
+
1009
def is_call_me(self, hand_landmarks):
    """Return True if the hand is making a 'call me' gesture.

    The call-me gesture has the thumb and little finger extended while the
    index, middle, and ring fingers are folded down.

    Args:
        hand_landmarks: landmarks_list from get_landmarks() — list of [id, x, y, z].
    Returns:
        bool: True for the gesture; False otherwise, including when fewer
        than five finger flags are available.
    """
    fingers = self.fingers_up(hand_landmarks)
    # The sibling gesture checks guard against a short result; without the
    # same guard the indexing below raises IndexError.
    if len(fingers) < 5:
        return False
    return (
        fingers[0] == 1
        and fingers[1] == 0
        and fingers[2] == 0
        and fingers[3] == 0
        and fingers[4] == 1
    )
1028
+
1029
def is_rock_sign(self, hand_landmarks):
    """Return True if the hand is making a rock/devil-horns sign.

    The rock sign has the index and little fingers extended while the thumb,
    middle, and ring fingers are folded down.

    Args:
        hand_landmarks: landmarks_list from get_landmarks() — list of [id, x, y, z].
    Returns:
        bool: True for the gesture; False otherwise, including when fewer
        than five finger flags are available.
    """
    fingers = self.fingers_up(hand_landmarks)
    # The sibling gesture checks guard against a short result; without the
    # same guard the indexing below raises IndexError.
    if len(fingers) < 5:
        return False
    return (
        fingers[0] == 0
        and fingers[1] == 1
        and fingers[2] == 0
        and fingers[3] == 0
        and fingers[4] == 1
    )
1048
+
1049
def recognize_number(self, hand_landmarks):
    """Return the number shown by the hand, i.e. its raised-finger count.

    This is a thin wrapper around ``get_finger_count``.

    Args:
        hand_landmarks: landmarks_list from get_landmarks() — list of [id, x, y, z].
    Returns:
        int: Number of fingers raised (0–5).
    """
    shown_number = self.get_finger_count(hand_landmarks)
    return shown_number
1060
+
1061
def get_hand_orientation(self, hand_landmarks):
    """Classify the direction of the wrist -> middle-MCP vector.

    The vector from the wrist (landmark 0) to the middle finger MCP
    (landmark 9) is reduced to its dominant axis. Ties between the axes
    resolve to the horizontal labels, and a zero vector yields 'palm_left'.

    Args:
        hand_landmarks: landmarks_list from get_landmarks() — list of [id, x, y, z].
    Returns:
        str: One of 'palm_up', 'palm_down', 'palm_left', 'palm_right'.
        'palm_up' corresponds to a negative y difference (presumably image
        coordinates with y growing downwards — confirm against get_landmarks).
    """
    wrist_xy = hand_landmarks[0][1:3]
    knuckle_xy = hand_landmarks[9][1:3]
    horizontal = knuckle_xy[0] - wrist_xy[0]
    vertical = knuckle_xy[1] - wrist_xy[1]

    mostly_horizontal = abs(horizontal) >= abs(vertical)
    if mostly_horizontal:
        label = "palm_right" if horizontal > 0 else "palm_left"
    else:
        label = "palm_up" if vertical < 0 else "palm_down"
    return label
1079
+
1080
def get_swipe_direction(self, prev_wrist, curr_wrist, threshold=20):
    """Classify the dominant direction of movement between two positions.

    The displacement from ``prev_wrist`` to ``curr_wrist`` is reduced to its
    larger axis. Movements smaller than ``threshold`` on both axes are
    ignored, and ties between the axes resolve to the horizontal labels.

    Args:
        prev_wrist (tuple): Previous wrist position as (x, y).
        curr_wrist (tuple): Current wrist position as (x, y).
        threshold (int): Minimum pixel displacement to register as a swipe.
    Returns:
        str: One of 'right', 'left', 'up', 'down', 'none'. A positive y
        displacement is reported as 'down'.
    """
    shift_x = curr_wrist[0] - prev_wrist[0]
    shift_y = curr_wrist[1] - prev_wrist[1]
    size_x = abs(shift_x)
    size_y = abs(shift_y)

    if max(size_x, size_y) < threshold:
        return "none"

    if size_x >= size_y:
        direction = "right" if shift_x > 0 else "left"
    else:
        direction = "down" if shift_y > 0 else "up"
    return direction
1101
+
1102
def get_all_finger_angles(self, hand_landmarks):
    """Compute one joint angle per finger.

    Each finger is described by three consecutive landmark indices and the
    angle at the middle one is obtained from get_angle_between_landmarks.

    Args:
        hand_landmarks: landmarks_list from get_landmarks() — list of [id, x, y, z].
    Returns:
        dict: Keys are finger names ('thumb', 'index', 'middle', 'ring', 'little'),
            values are angles in degrees (0–180).
    """
    # (finger name, first, vertex, last) landmark indices.
    triplets = (
        ("thumb", 1, 2, 3),
        ("index", 5, 6, 7),
        ("middle", 9, 10, 11),
        ("ring", 13, 14, 15),
        ("little", 17, 18, 19),
    )
    angles = {}
    for finger, first, vertex, last in triplets:
        angles[finger] = self.get_angle_between_landmarks(
            hand_landmarks, first, vertex, last
        )
    return angles
1124
+
1125
def draw_gesture_label(self, image, hand_data, label):
    """Render a gesture label just above a hand's bounding box.

    The input image is left untouched; the text is drawn on a copy.

    Args:
        image: BGR numpy array.
        hand_data (dict): Hand dict from get_landmarks() with key 'bounding_box' (x, y, w, h).
        label (str): Gesture label text to render.
    Returns:
        numpy.ndarray: Annotated copy of the input image (BGR).
    """
    annotated = image.copy()
    left, top, _width, _height = hand_data["bounding_box"]
    # Place the text 10 px above the box, but never higher than y = 10.
    anchor = (left, max(top - 10, 10))
    cv2.putText(annotated, label, anchor, cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 0, 0), 2)
    return annotated
1147
+
1148
def to_json(self, hand_data):
    """Convert a hand data dict into a structure made of plain lists.

    Despite the name, a dict is returned rather than a JSON string. Missing
    keys fall back to neutral defaults.

    Args:
        hand_data (dict): Hand dict from get_landmarks() containing 'hand_type',
            'center_point', 'bounding_box', and 'landmarks_list'.
    Returns:
        dict: JSON-serializable dict with keys 'hand_type', 'center_point',
            'bounding_box', and 'landmarks'.
    """
    hand_type = hand_data.get("hand_type", "Unknown")
    center = hand_data.get("center_point", (0, 0))
    bbox = hand_data.get("bounding_box", (0, 0, 0, 0))
    landmarks = hand_data.get("landmarks_list", [])

    serialized = {
        "hand_type": hand_type,
        "center_point": list(center),
        "bounding_box": list(bbox),
        "landmarks": [list(point) for point in landmarks],
    }
    return serialized
1164
+
1165
+
1166
def main():
    """Run the interactive webcam demo: detect hands, report gestures, show FPS.

    Frames are read from the module-level ``cap`` capture until a read fails
    or the Esc key is pressed. The capture and all OpenCV windows are
    released on exit, including when an exception interrupts the loop.
    """
    previous_time = 0
    hand_detector = HandDetector()
    try:
        while True:
            ret, frame_bgr = cap.read()
            if not ret:
                break

            # The detector is fed RGB, while OpenCV captures and displays BGR.
            frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)

            # Draw the detected landmarks, then refresh the detector's
            # landmark data from the annotated frame.
            annotated_image = hand_detector.draw_landmarks(frame_rgb)
            hand_detector.get_landmarks(annotated_image)

            # cvtColor returns a new array, so the result has to be kept;
            # discarding it displayed the frame with red and blue swapped.
            annotated_image = cv2.cvtColor(annotated_image, cv2.COLOR_RGB2BGR)

            taps = hand_detector.detect_finger_tapping()
            # Result unused; the call is kept in case fingers_up() updates
            # detector state — TODO confirm and drop if it does not.
            hand_detector.count_fingers()

            # Example prints (you can replace with your own logic)
            if any(taps):
                print(f"🔥 TAP DETECTED! Fingers: {taps} | Finger indices [4,8,12,16,20]")

            if hand_detector.is_fist():
                print("👊 Fist detected")
            if hand_detector.is_thumbs_up():
                print("👍 Thumbs up")
            if hand_detector.is_peace_sign():
                print("✌️ Peace sign")
            if hand_detector.is_open_hand():
                print("🖐️ Open hand")

            # FPS from the wall-clock time between consecutive frames.
            current_time = time.time()
            elapsed = current_time - previous_time
            fps = 1 / elapsed if elapsed > 0 else 0
            previous_time = current_time
            cv2.putText(
                annotated_image,
                f"FPS: {int(fps)}",
                (10, 30),
                cv2.FONT_HERSHEY_SIMPLEX,
                1,
                (0, 255, 0),
                2,
            )
            cv2.imshow(
                "Extended Hand Tracking (MediaPipe) - Tapping + Gestures",
                annotated_image,
            )
            # Esc (key code 27) ends the demo; waitKey(1) also lets OpenCV
            # process its window events.
            if cv2.waitKey(1) & 0xFF == 27:
                break
    finally:
        # Always free the camera and close the windows, even on errors.
        cap.release()
        cv2.destroyAllWindows()


# Start the demo only when this file is executed as a script.
if __name__ == "__main__":
    main()