fasthands 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,5 @@
1
+ __pycache__/
2
+ .venv*/
3
+ models/extracted/
4
+ output_*
5
+ dist/
@@ -0,0 +1,28 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Vimal Mollyn
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
22
+
23
+ ---
24
+
25
+ The bundled hand detection and hand landmark model weights are derived from
26
+ Google's MediaPipe hand_landmarker.task models, licensed under the Apache
27
+ License, Version 2.0 (https://www.apache.org/licenses/LICENSE-2.0).
28
+ Copyright Google LLC.
@@ -0,0 +1,78 @@
1
+ Metadata-Version: 2.4
2
+ Name: fasthands
3
+ Version: 0.1.0
4
+ Summary: Fastest MediaPipe-compatible hand tracker: hand landmarks on the Apple Neural Engine at 0.7 ms/frame
5
+ Project-URL: Repository, https://github.com/VimalMollyn/fasthands
6
+ Author-email: Vimal Mollyn <vmollyn@andrew.cmu.edu>
7
+ License-Expression: MIT
8
+ License-File: LICENSE
9
+ Keywords: coreml,hand-landmarks,hand-tracking,mediapipe,neural-engine
10
+ Classifier: Operating System :: MacOS
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Topic :: Scientific/Engineering :: Image Recognition
13
+ Requires-Python: >=3.10
14
+ Requires-Dist: coremltools>=8.0
15
+ Requires-Dist: numpy>=1.24
16
+ Requires-Dist: opencv-python>=4.8
17
+ Description-Content-Type: text/markdown
18
+
19
+ # fasthands
20
+
21
+ **The fastest MediaPipe-compatible hand tracker.** MediaPipe Hands' official
22
+ models running on the Apple Neural Engine via CoreML — **0.7 ms per tracked
23
+ frame** on Apple Silicon, ~5× faster than MediaPipe itself, with a faithful
24
+ port of the full HandLandmarker pipeline (SSD anchors, weighted NMS, ROI
25
+ tracking, landmark projection, deduplication).
26
+
27
+ macOS / Apple Silicon only.
28
+
29
+ ## Install
30
+
31
+ ```sh
32
+ pip install fasthands
33
+ ```
34
+
35
+ ## Use
36
+
37
+ ```python
38
+ import cv2
39
+ import fasthands
40
+
41
+ tracker = fasthands.load(num_hands=1)
42
+
43
+ image = cv2.cvtColor(cv2.imread("hand.jpg"), cv2.COLOR_BGR2RGB)
44
+ hands = tracker(image) # single image
45
+ # hands = tracker.detect_video(frame) # video: tracks between frames, ~0.7 ms
46
+
47
+ for hand in hands:
48
+ print(hand["handedness"], hand["score"])
49
+ print(hand["landmarks"]) # 21 x (x, y, z), normalized image coords
50
+ print(hand["world_landmarks"]) # 21 x (x, y, z), meters, hand-centered
51
+ ```
52
+
53
+ Or from the command line:
54
+
55
+ ```sh
56
+ fasthands photo.jpg --out annotated.jpg
57
+ fasthands-webcam --mirror # live demo with FPS overlay
58
+ ```
59
+
60
+ ## Speed (Apple M4, 540×720 frame, one hand)
61
+
62
+ | | tracking | detect + track |
63
+ |---|---|---|
64
+ | **fasthands (ANE)** | **0.7 ms** | **1.9 ms** |
65
+ | mediapipe (XNNPACK CPU) | 3.3 ms | 8.7 ms |
66
+
67
+ Landmarks agree with MediaPipe to ~1e-3 (Neural Engine fp16); the pipeline
68
+ logic itself is verified to MediaPipe's own float32 reproducibility floor.
69
+
70
+ ## How
71
+
72
+ The `hand_landmarker.task` models are converted to CoreML, and every MediaPipe
73
+ calculator in the pipeline (anchors, decode, weighted NMS, rect transforms,
74
+ rotated crops, projections, VIDEO-mode ROI tracking, dedup) is reimplemented
75
+ in numpy with float32 op-order fidelity. Model weights © Google, Apache 2.0.
76
+
77
+ Source, the PyTorch reference implementation, and the full verification
78
+ harness: https://github.com/VimalMollyn/fasthands
@@ -0,0 +1,60 @@
1
+ # fasthands
2
+
3
+ **The fastest MediaPipe-compatible hand tracker.** MediaPipe Hands' official
4
+ models running on the Apple Neural Engine via CoreML — **0.7 ms per tracked
5
+ frame** on Apple Silicon, ~5× faster than MediaPipe itself, with a faithful
6
+ port of the full HandLandmarker pipeline (SSD anchors, weighted NMS, ROI
7
+ tracking, landmark projection, deduplication).
8
+
9
+ macOS / Apple Silicon only.
10
+
11
+ ## Install
12
+
13
+ ```sh
14
+ pip install fasthands
15
+ ```
16
+
17
+ ## Use
18
+
19
+ ```python
20
+ import cv2
21
+ import fasthands
22
+
23
+ tracker = fasthands.load(num_hands=1)
24
+
25
+ image = cv2.cvtColor(cv2.imread("hand.jpg"), cv2.COLOR_BGR2RGB)
26
+ hands = tracker(image) # single image
27
+ # hands = tracker.detect_video(frame) # video: tracks between frames, ~0.7 ms
28
+
29
+ for hand in hands:
30
+ print(hand["handedness"], hand["score"])
31
+ print(hand["landmarks"]) # 21 x (x, y, z), normalized image coords
32
+ print(hand["world_landmarks"]) # 21 x (x, y, z), meters, hand-centered
33
+ ```
34
+
35
+ Or from the command line:
36
+
37
+ ```sh
38
+ fasthands photo.jpg --out annotated.jpg
39
+ fasthands-webcam --mirror # live demo with FPS overlay
40
+ ```
41
+
42
+ ## Speed (Apple M4, 540×720 frame, one hand)
43
+
44
+ | | tracking | detect + track |
45
+ |---|---|---|
46
+ | **fasthands (ANE)** | **0.7 ms** | **1.9 ms** |
47
+ | mediapipe (XNNPACK CPU) | 3.3 ms | 8.7 ms |
48
+
49
+ Landmarks agree with MediaPipe to ~1e-3 (Neural Engine fp16); the pipeline
50
+ logic itself is verified to MediaPipe's own float32 reproducibility floor.
51
+
52
+ ## How
53
+
54
+ The `hand_landmarker.task` models are converted to CoreML, and every MediaPipe
55
+ calculator in the pipeline (anchors, decode, weighted NMS, rect transforms,
56
+ rotated crops, projections, VIDEO-mode ROI tracking, dedup) is reimplemented
57
+ in numpy with float32 op-order fidelity. Model weights © Google, Apache 2.0.
58
+
59
+ Source, the PyTorch reference implementation, and the full verification
60
+ harness: https://github.com/VimalMollyn/fasthands
@@ -0,0 +1,46 @@
1
+ [project]
2
+ name = "fasthands"
3
+ version = "0.1.0"
4
+ description = "Fastest MediaPipe-compatible hand tracker: hand landmarks on the Apple Neural Engine at 0.7 ms/frame"
5
+ readme = "PYPI_README.md"
6
+ requires-python = ">=3.10"
7
+ license = "MIT"
8
+ authors = [{ name = "Vimal Mollyn", email = "vmollyn@andrew.cmu.edu" }]
9
+ keywords = ["hand-tracking", "mediapipe", "coreml", "neural-engine", "hand-landmarks"]
10
+ classifiers = [
11
+ "Operating System :: MacOS",
12
+ "Programming Language :: Python :: 3",
13
+ "Topic :: Scientific/Engineering :: Image Recognition",
14
+ ]
15
+ dependencies = [
16
+ "coremltools>=8.0",
17
+ "numpy>=1.24",
18
+ "opencv-python>=4.8",
19
+ ]
20
+
21
+ [project.urls]
22
+ Repository = "https://github.com/VimalMollyn/fasthands"
23
+
24
+ [project.scripts]
25
+ fasthands = "fasthands.cli:main"
26
+ fasthands-webcam = "fasthands.cli:webcam"
27
+
28
+ [build-system]
29
+ requires = ["hatchling"]
30
+ build-backend = "hatchling.build"
31
+
32
+ [tool.hatch.build.targets.wheel]
33
+ packages = ["src/fasthands"]
34
+
35
+ [tool.hatch.build.targets.sdist]
36
+ include = ["src", "PYPI_README.md", "LICENSE"]
37
+
38
+ [dependency-groups]
39
+ # research / verification tooling for this repo, not shipped with the package
40
+ dev = [
41
+ "mediapipe>=0.10.35",
42
+ "torch>=2.12.0",
43
+ "ai-edge-litert>=2.1.5",
44
+ "tflite>=2.18.0",
45
+ "pytest>=8",
46
+ ]
@@ -0,0 +1,17 @@
1
+ """fasthands — the fastest MediaPipe-compatible hand tracker.
2
+
3
+ MediaPipe Hands' models running on the Apple Neural Engine via CoreML,
4
+ with a faithful port of the HandLandmarker pipeline. ~0.7 ms per tracked
5
+ frame on Apple Silicon (~5x faster than MediaPipe itself).
6
+
7
+ import fasthands
8
+ tracker = fasthands.load(num_hands=1)
9
+ hands = tracker.detect_video(rgb_frame) # tracking (video) mode
10
+ hands = tracker(rgb_image) # single-image mode
11
+ """
12
+
13
+ from .coreml import load
14
+ from .pipeline import HAND_CONNECTIONS, HandLandmarker, draw
15
+
16
+ __version__ = "0.1.0"
17
+ __all__ = ["load", "HandLandmarker", "HAND_CONNECTIONS", "draw", "__version__"]
@@ -0,0 +1,131 @@
1
+ """Command-line entry points: `fasthands <image>` and `fasthands-webcam`."""
2
+
3
+ import argparse
4
+ import json
5
+ import threading
6
+ import time
7
+
8
+ import cv2
9
+
10
+ from . import load
11
+ from .pipeline import draw
12
+
13
+
14
def main():
    """Entry point for ``fasthands <image>``.

    Reads one image, runs single-image hand detection on it, prints every
    landmark to stdout and, on request, writes a JSON dump (``--json``) and
    an annotated copy of the image (``--out``).

    Raises:
        SystemExit: if the input image cannot be read, or the annotated
            image cannot be written.
    """
    parser = argparse.ArgumentParser(
        description="fasthands: hand landmarks on the Neural Engine")
    parser.add_argument("image", help="path to input image")
    parser.add_argument("--out", default=None, help="annotated output image path")
    parser.add_argument("--json", dest="json_path", default=None,
                        help="write landmarks as JSON")
    parser.add_argument("--num-hands", type=int, default=2)
    parser.add_argument("--compute-units", default="ALL",
                        choices=["ALL", "CPU_AND_NE", "CPU_AND_GPU", "CPU_ONLY"])
    args = parser.parse_args()

    image_bgr = cv2.imread(args.image)
    if image_bgr is None:
        raise SystemExit(f"could not read image: {args.image}")
    tracker = load(num_hands=args.num_hands, compute_units=args.compute_units)
    # The tracker is fed RGB; OpenCV decodes to BGR.
    hands = tracker(cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB))

    print(f"detected {len(hands)} hand(s)")
    for i, hand in enumerate(hands):
        print(f"\nhand {i}: {hand['handedness']} (score {hand['score']:.4f})")
        for j, (x, y, z) in enumerate(hand["landmarks"]):
            print(f" lm[{j:2d}] x={x:.6f} y={y:.6f} z={z:.6f}")

    if args.json_path:
        def as_points(rows):
            # Coerce to builtin floats: numpy scalars are not JSON serializable.
            return [{"x": float(x), "y": float(y), "z": float(z)}
                    for x, y, z in rows]

        dump = [{
            "handedness": h["handedness"],
            # Same coercion as the landmarks; a numpy score would otherwise
            # make json.dump raise TypeError.
            "score": float(h["score"]),
            "landmarks": as_points(h["landmarks"]),
            "world_landmarks": as_points(h["world_landmarks"]),
        } for h in hands]
        with open(args.json_path, "w", encoding="utf-8") as f:
            json.dump(dump, f, indent=2)
        print(f"\nlandmarks written to {args.json_path}")
    if args.out:
        # cv2.imwrite signals failure through its return value, not an exception.
        if not cv2.imwrite(args.out, draw(image_bgr, hands)):
            raise SystemExit(f"could not write image: {args.out}")
        print(f"annotated image written to {args.out}")
52
+
53
+
54
class _Camera:
    """Threaded capture: always serves the latest frame.

    A daemon thread keeps reading from the device and overwrites ``frame``,
    so ``read()`` never blocks on the camera and never returns a stale
    backlog of frames.
    """

    def __init__(self, index):
        """Open camera ``index`` and wait for the first frame (or a failure).

        Raises:
            SystemExit: if the camera cannot be opened.
        """
        self.cap = cv2.VideoCapture(index)
        if not self.cap.isOpened():
            self.cap.release()
            raise SystemExit(f"could not open camera {index}")
        self.frame = None
        self.ok = True
        self.lock = threading.Lock()
        # Keep the handle so release() can wait for the reader to stop.
        self._thread = threading.Thread(target=self._loop, daemon=True)
        self._thread.start()
        while self.ok and self.frame is None:
            time.sleep(0.01)

    def _loop(self):
        """Reader thread: grab frames until a read fails or release() is called."""
        while self.ok:
            ok, frame = self.cap.read()
            if not ok:
                self.ok = False
                break
            with self.lock:
                self.frame = frame

    def read(self):
        """Return ``(ok, frame)``; ``frame`` is a copy of the latest frame or None."""
        with self.lock:
            return self.ok, None if self.frame is None else self.frame.copy()

    def release(self):
        """Stop the reader thread, then release the capture device."""
        self.ok = False
        # Let an in-flight cap.read() finish before the device is released;
        # the timeout keeps a wedged driver from hanging shutdown.
        if self._thread is not threading.current_thread():
            self._thread.join(timeout=1.0)
        self.cap.release()
84
+
85
+
86
def webcam():
    """Entry point for ``fasthands-webcam``: live tracking demo with an overlay.

    Runs video-mode tracking on camera frames, draws the hands plus a
    smoothed FPS / inference-time readout, and exits on ``q`` or ``Esc``, or
    when the camera stops delivering frames. The camera and the window are
    released even if the loop is interrupted.
    """
    parser = argparse.ArgumentParser(description="fasthands live webcam demo")
    parser.add_argument("--camera", type=int, default=0)
    parser.add_argument("--mirror", action="store_true", help="selfie view")
    parser.add_argument("--num-hands", type=int, default=1)
    parser.add_argument("--compute-units", default="ALL",
                        choices=["ALL", "CPU_AND_NE", "CPU_AND_GPU", "CPU_ONLY"])
    args = parser.parse_args()

    tracker = load(num_hands=args.num_hands, compute_units=args.compute_units)
    cap = _Camera(args.camera)

    fps, infer_ms = 0.0, 0.0
    prev = time.perf_counter()
    try:
        while True:
            ok, frame = cap.read()
            if not ok:
                break
            if args.mirror:
                frame = cv2.flip(frame, 1)

            t0 = time.perf_counter()
            hands = tracker.detect_video(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            # Exponential moving average of the inference time in ms.
            infer_ms = 0.9 * infer_ms + 0.1 * (time.perf_counter() - t0) * 1000
            frame = draw(frame, hands)
            for hand in hands:
                # Label each hand next to its first landmark (pixel coords).
                x, y = hand["landmarks"][0][:2]
                cv2.putText(frame, f"{hand['handedness']} {hand['score']:.2f}",
                            (int(x * frame.shape[1]) - 30, int(y * frame.shape[0]) + 30),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 0), 2)

            now = time.perf_counter()
            # Guard the division in case the clock did not advance.
            dt = max(now - prev, 1e-9)
            # First frame seeds the average; later frames are smoothed.
            fps = 0.9 * fps + 0.1 / dt if fps else 1.0 / dt
            prev = now
            cv2.putText(frame, f"{fps:.1f} FPS {infer_ms:.1f} ms", (10, 30),
                        cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0, 255, 255), 2)
            cv2.imshow("fasthands", frame)
            if cv2.waitKey(1) & 0xFF in (ord("q"), 27):
                break
    finally:
        # Runs on normal exit, exceptions and Ctrl-C alike.
        cap.release()
        cv2.destroyAllWindows()
128
+
129
+
130
+ if __name__ == "__main__":
131
+ main()
@@ -0,0 +1,40 @@
1
+ """CoreML inference backend (Neural Engine capable) with bundled models."""
2
+
3
+ import sys
4
+ from pathlib import Path
5
+
6
+ import numpy as np
7
+
8
+ from .pipeline import HandLandmarker
9
+
10
+ MODELS_DIR = Path(__file__).parent / "models"
11
+
12
+
13
class CoreMLBackend:
    """Callable wrapper around a single CoreML model package.

    Calling the instance feeds the array to the model input named "image"
    and returns the outputs as a list, ordered as in the model spec.
    """

    def __init__(self, path, compute_units="ALL"):
        """Load the model at ``path`` on the requested compute units.

        ``compute_units`` is the name of a ``coremltools.ComputeUnit`` member.
        Raises RuntimeError when not running on macOS.
        """
        if sys.platform != "darwin":
            raise RuntimeError("fasthands requires macOS (CoreML)")
        # Imported lazily, only once the platform check has passed.
        import coremltools as ct

        model_path = str(path)
        units = ct.ComputeUnit[compute_units]
        self.model = ct.models.MLModel(model_path, compute_units=units)
        declared_outputs = self.model.get_spec().description.output
        self.output_names = [entry.name for entry in declared_outputs]

    def __call__(self, x: np.ndarray):
        """Run one prediction and return the outputs in spec order."""
        predictions = self.model.predict({"image": x})
        return [predictions[name] for name in self.output_names]
26
+
27
+
28
def load(num_hands: int = 2, compute_units: str = "ALL",
         models_dir=MODELS_DIR) -> HandLandmarker:
    """Build a HandLandmarker whose two models run on CoreML.

    num_hands: maximum number of hands the landmarker reports.
    compute_units: ALL (Neural Engine + GPU + CPU), CPU_AND_NE, CPU_AND_GPU,
    or CPU_ONLY.
    models_dir: directory holding ``hand_detector.mlpackage`` and
    ``hand_landmarks_detector.mlpackage``; defaults to the bundled models.
    """
    root = Path(models_dir)
    detector = CoreMLBackend(root / "hand_detector.mlpackage", compute_units)
    landmarker = CoreMLBackend(
        root / "hand_landmarks_detector.mlpackage", compute_units)
    return HandLandmarker(detector, landmarker, num_hands=num_hands)
@@ -0,0 +1,18 @@
1
+ {
2
+ "fileFormatVersion": "1.0.0",
3
+ "itemInfoEntries": {
4
+ "7CBC671B-4858-4DC4-80A9-AA56D588F465": {
5
+ "author": "com.apple.CoreML",
6
+ "description": "CoreML Model Specification",
7
+ "name": "model.mlmodel",
8
+ "path": "com.apple.CoreML/model.mlmodel"
9
+ },
10
+ "95186DCA-2FC9-41F8-B09F-1A3EF04C9006": {
11
+ "author": "com.apple.CoreML",
12
+ "description": "CoreML Model Weights",
13
+ "name": "weights",
14
+ "path": "com.apple.CoreML/weights"
15
+ }
16
+ },
17
+ "rootModelIdentifier": "7CBC671B-4858-4DC4-80A9-AA56D588F465"
18
+ }
@@ -0,0 +1,18 @@
1
+ {
2
+ "fileFormatVersion": "1.0.0",
3
+ "itemInfoEntries": {
4
+ "4B1E732F-140B-42D1-A2E5-4377545D8444": {
5
+ "author": "com.apple.CoreML",
6
+ "description": "CoreML Model Specification",
7
+ "name": "model.mlmodel",
8
+ "path": "com.apple.CoreML/model.mlmodel"
9
+ },
10
+ "DFF8F2DB-240E-491C-8B01-E606ED9B5740": {
11
+ "author": "com.apple.CoreML",
12
+ "description": "CoreML Model Weights",
13
+ "name": "weights",
14
+ "path": "com.apple.CoreML/weights"
15
+ }
16
+ },
17
+ "rootModelIdentifier": "4B1E732F-140B-42D1-A2E5-4377545D8444"
18
+ }
@@ -0,0 +1,457 @@
1
+ """MediaPipe HandLandmarker pipeline, faithfully ported (numpy + OpenCV only).
2
+
3
+ Mirrors MediaPipe's calculators in float32 operation order: SSD anchors,
4
+ detection decode, weighted NMS (in tensor space), detection projection,
5
+ rect transformation, rotated-rect cropping, landmark projection, VIDEO-mode
6
+ tracking (landmarks -> next-frame ROI) and hand deduplication.
7
+
8
+ Inference backends are injected: any callable taking a float32 NHWC array
9
+ [1, H, W, 3] in [0, 1] and returning the model's output arrays in order.
10
+ """
11
+
12
+ import math
13
+
14
+ import cv2
15
+ import numpy as np
16
+
17
# Shorthand used throughout the pipeline to force float32 arithmetic.
F = np.float32

# ----------------------------------------------------------------------------
# Constants from mediapipe/tasks/cc/vision/hand_detector/hand_detector_graph.cc
# and hand_landmarker/hand_landmarks_detector_graph.cc
# ----------------------------------------------------------------------------
DETECT_SIZE = 192      # side of the square palm-detector input tensor
LANDMARK_SIZE = 224    # side of the square landmark-model input tensor
NUM_KEYPOINTS = 7      # keypoints carried by each palm detection
MIN_DETECTION_CONFIDENCE = F(0.5)
MIN_HAND_PRESENCE_CONFIDENCE = F(0.5)
NMS_THRESHOLD = F(0.3)
SCORE_CLIPPING_THRESH = F(100.0)
RECT_SCALE = F(2.6)    # RectTransformationCalculator scale_x/scale_y
RECT_SHIFT_Y = F(-0.5)  # RectTransformationCalculator shift_y
LANDMARKS_NORMALIZE_Z = 0.4  # TensorsToLandmarksCalculator normalize_z
# NOTE: the tasks HandDetectorGraph sets rotation_vector_target_angle(90) on
# DetectionsToRectsCalculator -- that proto field is in RADIANS (the _degrees
# variant is a separate field), so the effective target angle really is
# 90 rad (= 2.0354 rad mod 2pi), not pi/2. We reproduce that behavior.
ROTATION_TARGET_ANGLE = 90.0

# Landmark index pairs to join when drawing a hand. Each inner tuple is a
# chain of landmark indices; consecutive members of a chain are connected.
HAND_CONNECTIONS = [
    edge
    for chain in (
        (0, 1, 2, 3, 4),
        (0, 5, 6, 7, 8),
        (5, 9, 10, 11, 12),
        (9, 13, 14, 15, 16),
        (13, 17, 18, 19, 20),
        (0, 17),
    )
    for edge in zip(chain, chain[1:])
]
47
+
48
+
49
+ # ----------------------------------------------------------------------------
50
+ # SSD anchors — mediapipe/calculators/tflite/ssd_anchors_calculator.cc with the
51
+ # hand-detector config (num_layers=4, strides 8,16,16,16, scales .1484375-.75)
52
+ # ----------------------------------------------------------------------------
53
def generate_anchors() -> np.ndarray:
    """Return the SSD anchor table as a float32 array of (cx, cy, w, h) rows.

    Consecutive layers with the same stride share one feature-map grid, and
    each such layer adds two anchors per cell. Every anchor has w = h = 1.0
    and sits at the cell center in normalized coordinates.
    """
    strides = [8, 16, 16, 16]
    layer_count = 4
    rows = []
    first = 0
    while first < layer_count:
        # Extend the run over all following layers with the same stride.
        stop = first
        while stop < layer_count and strides[stop] == strides[first]:
            stop += 1
        # Two anchors per layer in the run: the aspect-ratio-1.0 anchor and
        # the interpolated one (same center).
        per_cell = 2 * (stop - first)
        grid = math.ceil(DETECT_SIZE / strides[first])
        for row in range(grid):
            cy = F(row + 0.5) / F(grid)
            for col in range(grid):
                cx = F(col + 0.5) / F(grid)
                # fixed_anchor_size: w = h = 1.0
                rows.extend([(cx, cy, 1.0, 1.0)] * per_cell)
        first = stop
    return np.array(rows, dtype=np.float32)  # [2016, 4] cx, cy, w, h
72
+
73
+
74
+ # ----------------------------------------------------------------------------
75
+ # ImageToTensorCalculator (OpenCV converter): rotated sub-rect -> square crop.
76
+ # Identical cv2 calls (boxPoints + getPerspectiveTransform + warpPerspective
77
+ # on uint8, then *1/255f like cv::Mat::convertTo with a float scale).
78
+ # ----------------------------------------------------------------------------
79
def crop_rotated_rect(image_rgb, cx, cy, w, h, rotation_rad, dst_size, border):
    """Warp a rotated sub-rectangle of ``image_rgb`` into a square float crop.

    (cx, cy) and (w, h) are the center and size handed to ``cv2.boxPoints``;
    ``rotation_rad`` is converted to degrees for it. ``border`` is the OpenCV
    border mode used for samples outside the image. Returns a float32 array
    of shape (dst_size, dst_size, channels) scaled by 1/255.
    """
    # Radians -> degrees: float32 product, float64 division, rounded to float32.
    degrees = F(np.float64(F(rotation_rad) * F(180.0)) / math.pi)
    center = (float(cx), float(cy))
    extent = (float(w), float(h))
    corners = cv2.boxPoints((center, extent, float(degrees)))
    # Destination corners, listed in the order boxPoints emits the source ones.
    target = np.array(
        [[0, dst_size], [0, 0], [dst_size, 0], [dst_size, dst_size]],
        dtype=np.float32,
    )
    warp = cv2.getPerspectiveTransform(corners.astype(np.float32), target)
    patch = cv2.warpPerspective(
        image_rgb, warp, (dst_size, dst_size),
        flags=cv2.INTER_LINEAR, borderMode=border,
    )
    return patch.astype(np.float32) * F(1.0 / 255.0)
90
+
91
+
92
def normalize_radians(angle: float) -> float:
    """Wrap ``angle`` into [-pi, pi) by removing whole turns."""
    whole_turns = math.floor((angle + math.pi) / (2 * math.pi))
    return angle - 2 * math.pi * whole_turns
94
+
95
+
96
def compute_rotation(x0, y0, x1, y1) -> F:
    """Rotation of the vector (x0, y0) -> (x1, y1) against the target angle.

    Mirrors DetectionsToRectsCalculator::ComputeRotation in float32:
    rot = NormalizeRadians(90.f - atan2f(-(y1-y0), x1-x0)).
    """
    dy = F(y1) - F(y0)
    dx = F(x1) - F(x0)
    heading = F(math.atan2(-dy, dx))  # y is negated before atan2
    unwrapped = F(ROTATION_TARGET_ANGLE) - heading
    return F(normalize_radians(float(unwrapped)))
101
+
102
+
103
+ # ----------------------------------------------------------------------------
104
+ # TensorsToDetectionsCalculator (decode + sigmoid scores), in tensor space
105
+ # ----------------------------------------------------------------------------
106
def decode_detections(raw_boxes, raw_scores, anchors):
    """Decode raw detector outputs into detections in the 192x192 tensor space.

    raw_boxes [2016,18] and raw_scores [2016,1] are float32; ``anchors`` holds
    (cx, cy, w, h) rows. Returns a list of dicts
    {xmin, ymin, w, h, kp[7][2], score}, keeping only detections whose
    sigmoid score reaches MIN_DETECTION_CONFIDENCE.
    """
    clipped = np.clip(raw_scores.reshape(-1),
                      -SCORE_CLIPPING_THRESH, SCORE_CLIPPING_THRESH)
    # sigmoid in float64, rounded once to float32 (= correctly-rounded expf path)
    scores = (1.0 / (1.0 + np.exp(-clipped.astype(np.float64)))).astype(np.float32)

    size = F(DETECT_SIZE)
    anchor_cx, anchor_cy = anchors[:, 0], anchors[:, 1]
    anchor_w, anchor_h = anchors[:, 2], anchors[:, 3]

    center_x = raw_boxes[:, 0] / size * anchor_w + anchor_cx
    center_y = raw_boxes[:, 1] / size * anchor_h + anchor_cy
    half_w = raw_boxes[:, 2] / size * anchor_w / F(2.0)
    half_h = raw_boxes[:, 3] / size * anchor_h / F(2.0)

    # detection proto stores xmin/ymin/width/height, so the size is derived
    # from the corner difference rather than taken straight from the decode.
    left = center_x - half_w
    top = center_y - half_h
    width = (center_x + half_w) - left
    height = (center_y + half_h) - top

    # Keypoints are interleaved x, y pairs starting at column 4.
    kp_x = (raw_boxes[:, 4:4 + NUM_KEYPOINTS * 2:2] / size
            * anchors[:, 2:3] + anchors[:, 0:1])
    kp_y = (raw_boxes[:, 5:5 + NUM_KEYPOINTS * 2:2] / size
            * anchors[:, 3:4] + anchors[:, 1:2])

    keep = np.nonzero(scores >= MIN_DETECTION_CONFIDENCE)[0]
    return [
        {
            "xmin": left[i], "ymin": top[i], "w": width[i], "h": height[i],
            "kp": [(kp_x[i, k], kp_y[i, k]) for k in range(NUM_KEYPOINTS)],
            "score": scores[i],
        }
        for i in keep
    ]
133
+
134
+
135
+ # ----------------------------------------------------------------------------
136
+ # NonMaxSuppressionCalculator, algorithm=WEIGHTED, overlap=IoU, thresh=0.3 —
137
+ # float32 accumulation in the same order as the C++ implementation.
138
+ # ----------------------------------------------------------------------------
139
def iou(a, b):
    """Intersection-over-union of two detections given as xmin/ymin/w/h dicts.

    Returns float32 zero when the boxes do not overlap.
    """
    left = max(a["xmin"], b["xmin"])
    top = max(a["ymin"], b["ymin"])
    right = min(a["xmin"] + a["w"], b["xmin"] + b["w"])
    bottom = min(a["ymin"] + a["h"], b["ymin"] + b["h"])
    if right <= left or bottom <= top:
        return F(0.0)
    overlap = (right - left) * (bottom - top)
    combined = a["w"] * a["h"] + b["w"] * b["h"] - overlap
    return overlap / combined
148
+
149
+
150
def weighted_nms(dets):
    """Weighted non-max suppression over detection dicts.

    Takes the highest-scoring remaining detection, gathers every remaining
    detection whose IoU with it exceeds NMS_THRESHOLD, and replaces the group
    with its score-weighted average box and keypoints. The emitted detection
    keeps the top detection's score. Sums are accumulated in candidate order.

    dets: list of dicts with keys xmin, ymin, w, h, kp, score.
    Returns a new list of merged detection dicts; the inputs are not mutated.
    """
    remained = sorted(dets, key=lambda d: -d["score"])
    out = []
    while remained:
        top = remained[0]
        candidates, leftover = [], []
        for det in remained:
            if iou(det, top) > NMS_THRESHOLD:
                candidates.append(det)
            else:
                leftover.append(det)

        merged = dict(top)
        if candidates:
            w_xmin = w_ymin = w_xmax = w_ymax = F(0.0)
            kp_acc = [[F(0.0), F(0.0)] for _ in range(NUM_KEYPOINTS)]
            total = F(0.0)
            for c in candidates:
                total = total + c["score"]
                w_xmin = w_xmin + c["xmin"] * c["score"]
                w_ymin = w_ymin + c["ymin"] * c["score"]
                w_xmax = w_xmax + (c["xmin"] + c["w"]) * c["score"]
                w_ymax = w_ymax + (c["ymin"] + c["h"]) * c["score"]
                for k in range(NUM_KEYPOINTS):
                    kp_acc[k][0] = kp_acc[k][0] + c["kp"][k][0] * c["score"]
                    kp_acc[k][1] = kp_acc[k][1] + c["kp"][k][1] * c["score"]
            merged["xmin"] = w_xmin / total
            merged["ymin"] = w_ymin / total
            merged["w"] = (w_xmax / total) - merged["xmin"]
            merged["h"] = (w_ymax / total) - merged["ymin"]
            merged["kp"] = [(kp_acc[k][0] / total, kp_acc[k][1] / total)
                            for k in range(NUM_KEYPOINTS)]
        out.append(merged)

        # A box with non-positive width or height has IoU 0 even with itself,
        # so nothing is removed and the loop would never end. MediaPipe's
        # NonMaxSuppressionCalculator stops when the remaining set does not
        # shrink; do the same.
        if len(leftover) == len(remained):
            break
        remained = leftover
    return out
180
+
181
+
182
+ # ----------------------------------------------------------------------------
183
+ # DetectionProjectionCalculator: tensor space -> image space through the
184
+ # float32 matrix produced by GetRotatedSubRectToRectTransformMatrix.
185
+ # ----------------------------------------------------------------------------
186
def letterbox_projection(iw, ih):
    """Build the tensor-space -> image-space mapping for the letterboxed ROI.

    The ROI is the full image padded to a square of side max(iw, ih), with
    rotation 0. Returns ``project(x, y)``, which maps normalized tensor
    coordinates to normalized image coordinates as float32.
    """
    side = F(max(iw, ih))
    unit = F(1.0)  # cos(0); kept as a factor to preserve the float32 op order
    center_x, center_y = F(0.5) * F(iw), F(0.5) * F(ih)  # GetRoi: norm 0.5 * size
    inv_w, inv_h = unit / F(iw), unit / F(ih)

    # Non-trivial entries of the projection matrix.
    scale_x = side * unit * inv_w
    offset_x = (F(-0.5) * side * unit + center_x) * inv_w
    scale_y = side * unit * inv_h
    offset_y = (F(-0.5) * side * unit + center_y) * inv_h

    def project(x, y):
        px = F(F(x * scale_x) + offset_x)
        py = F(F(y * scale_y) + offset_y)
        return px, py

    return project
199
+
200
+
201
def project_detection(det, project):
    """Map a detection through ``project`` and return a new detection dict.

    The four box corners are projected and their axis-aligned bounds become
    the new box. Keypoints are projected one by one; the score is copied.
    """
    left, top = det["xmin"], det["ymin"]
    right, bottom = left + det["w"], top + det["h"]
    projected = [
        project(x, y)
        for x, y in ((left, top), (right, top), (right, bottom), (left, bottom))
    ]
    xs = [p[0] for p in projected]
    ys = [p[1] for p in projected]
    new_left, new_top = min(xs), min(ys)
    return {
        "xmin": new_left,
        "ymin": new_top,
        "w": max(xs) - new_left,
        "h": max(ys) - new_top,
        "kp": [project(x, y) for x, y in det["kp"]],
        "score": det["score"],
    }
216
+
217
+
218
+ # ----------------------------------------------------------------------------
219
+ # HandLandmarksToRectCalculator + RectTransformation(2.0, shift_y -0.1,
220
+ # square_long): next-frame ROI from the current landmarks (VIDEO mode).
221
+ # ----------------------------------------------------------------------------
222
# Landmark subset used to derive the tracking ROI.
PARTIAL_LANDMARK_IDS = [0, 1, 2, 3, 5, 6, 9, 10, 13, 14, 17, 18]
TRACK_RECT_SCALE = F(2.0)
TRACK_RECT_SHIFT_Y = F(-0.1)


def rect_from_landmarks(landmarks, iw, ih):
    """Derive the next-frame ROI (cx, cy, w, h, rotation) from landmarks.

    ``landmarks`` is indexed as a 2-D array of normalized coordinates whose
    first two columns are x and y; ``iw`` / ``ih`` are the image width and
    height in pixels. The returned center and size are normalized to the
    image, and the rotation is in radians.
    """
    pts = landmarks[PARTIAL_LANDMARK_IDS][:, :2]
    xs, ys = pts[:, 0], pts[:, 1]
    width_f, height_f = F(iw), F(ih)
    two = F(2.0)

    # rotation: from subset point 0 towards a blend of subset points 4, 8
    # and 6 (averaged pairwise, so point 6 carries half the weight),
    # measured against a pi/2 target
    from_x, from_y = pts[0, 0] * width_f, pts[0, 1] * height_f
    to_x = (pts[4, 0] + pts[8, 0]) / two
    to_y = (pts[4, 1] + pts[8, 1]) / two
    to_x = (to_x + pts[6, 0]) / two * width_f
    to_y = (to_y + pts[6, 1]) / two * height_f
    unwrapped = float(
        F(math.pi * 0.5) - F(math.atan2(-(to_y - from_y), to_x - from_x)))
    # Same wrap into [-pi, pi) that normalize_radians applies.
    rotation = F(unwrapped - 2 * math.pi
                 * math.floor((unwrapped + math.pi) / (2 * math.pi)))
    reverse = -rotation

    # bbox center, then bbox in the de-rotated frame
    mid_x = (xs.max() + xs.min()) / two
    mid_y = (ys.max() + ys.min()) / two
    off_x = (xs - mid_x) * width_f
    off_y = (ys - mid_y) * height_f
    rot_x = off_x * F(math.cos(reverse)) - off_y * F(math.sin(reverse))
    rot_y = off_x * F(math.sin(reverse)) + off_y * F(math.cos(reverse))
    box_cx = (rot_x.max() + rot_x.min()) / two
    box_cy = (rot_y.max() + rot_y.min()) / two
    cx = (box_cx * F(math.cos(rotation)) - box_cy * F(math.sin(rotation))
          + width_f * mid_x) / width_f
    cy = (box_cx * F(math.sin(rotation)) + box_cy * F(math.cos(rotation))
          + height_f * mid_y) / height_f
    w = (rot_x.max() - rot_x.min()) / width_f
    h = (rot_y.max() - rot_y.min()) / height_f

    # RectTransformationCalculator: shift, square_long, scale 2.0
    sin_r, cos_r = F(math.sin(rotation)), F(math.cos(rotation))
    shift_x = (-height_f * h * TRACK_RECT_SHIFT_Y * sin_r) / width_f
    shift_y = (height_f * h * TRACK_RECT_SHIFT_Y * cos_r) / height_f
    cx, cy = cx + shift_x, cy + shift_y
    longest = max(w * width_f, h * height_f)
    return (
        cx,
        cy,
        longest / width_f * TRACK_RECT_SCALE,
        longest / height_f * TRACK_RECT_SCALE,
        rotation,
    )
262
+
263
+
264
def deduplicate_hands(hands, iw, ih):
    """Drop hands that duplicate an earlier, already retained hand.

    HandLandmarksDeduplicationCalculator: a hand is suppressed when at least
    10 of its landmarks lie within 0.2 x baseline-palm-size (in pixels) of
    the matching landmarks of a retained hand and the two landmark bounding
    boxes overlap with IoU > 0.2. Input order is preserved.
    """
    image_size = (iw, ih)

    def palm_extent(points):
        # Longest side of the triangle formed by landmarks 0, 5 and 17, in pixels.
        pix = points[:, :2] * image_size
        p0, p5, p17 = pix[0], pix[5], pix[17]
        return max(np.linalg.norm(p0 - p5), np.linalg.norm(p5 - p17),
                   np.linalg.norm(p17 - p0))

    def box_overlap(a, b):
        # IoU of the axis-aligned x/y bounds of two landmark sets.
        a_left, a_top = a[:, 0].min(), a[:, 1].min()
        a_right, a_bottom = a[:, 0].max(), a[:, 1].max()
        b_left, b_top = b[:, 0].min(), b[:, 1].min()
        b_right, b_bottom = b[:, 0].max(), b[:, 1].max()
        left, top = max(a_left, b_left), max(a_top, b_top)
        right, bottom = min(a_right, b_right), min(a_bottom, b_bottom)
        if right <= left or bottom <= top:
            return 0.0
        shared = (right - left) * (bottom - top)
        area_a = (a_right - a_left) * (a_bottom - a_top)
        area_b = (b_right - b_left) * (b_bottom - b_top)
        return shared / (area_a + area_b - shared)

    def duplicates(candidate, retained):
        limit = max(palm_extent(candidate), palm_extent(retained)) * 0.2
        gaps = np.linalg.norm(
            (candidate[:, :2] - retained[:, :2]) * image_size, axis=1)
        return (gaps < limit).sum() >= 10 and box_overlap(candidate, retained) > 0.2

    unique = []
    for hand in hands:
        points = hand["landmarks"]
        if not any(duplicates(points, prior["landmarks"]) for prior in unique):
            unique.append(hand)
    return unique
296
+
297
+
298
+ def _rect_iou(a, b):
299
+ """Axis-aligned IoU of two (cx, cy, w, h, rot) rects, for association."""
300
+ ax0, ay0 = a[0] - a[2] / 2, a[1] - a[3] / 2
301
+ bx0, by0 = b[0] - b[2] / 2, b[1] - b[3] / 2
302
+ xa, ya = max(ax0, bx0), max(ay0, by0)
303
+ xb, yb = min(ax0 + a[2], bx0 + b[2]), min(ay0 + a[3], by0 + b[3])
304
+ if xb <= xa or yb <= ya:
305
+ return 0.0
306
+ inter = (xb - xa) * (yb - ya)
307
+ return float(inter / (a[2] * a[3] + b[2] * b[3] - inter))
308
+
309
+
310
+ # ----------------------------------------------------------------------------
311
+ # Pipeline
312
+ # ----------------------------------------------------------------------------
313
+ class HandLandmarker:
314
+ """The HandLandmarker task graph with injectable inference backends.
315
+
316
+ detector / landmarker: callables mapping a float32 NHWC array [1,H,W,3]
317
+ in [0,1] to the model's raw output arrays (in model output order).
318
+ """
319
+
320
def __init__(self, detector, landmarker, num_hands=2):
    """Store the two inference backends and prepare per-instance state.

    detector / landmarker: callables that run the palm-detection and the
    landmark model. num_hands: maximum number of hands processed per frame.
    """
    self.detector, self.landmarker = detector, landmarker
    self.anchors = generate_anchors()
    self.num_hands = num_hands
    # VIDEO mode: ROIs carried to the next frame
    self._tracked_rects = []
326
+
327
def __call__(self, image_rgb: np.ndarray):
    """IMAGE mode: run palm detection and landmarks on every call.

    At most ``num_hands`` detected ROIs are passed to the landmark stage;
    ROIs for which it returns None are dropped, and the rest are
    deduplicated before being returned.
    """
    height, width = image_rgb.shape[:2]
    rois = self._detect_rects(image_rgb)[: self.num_hands]
    results = (self._landmarks(image_rgb, *roi) for roi in rois)
    found = [hand for hand in results if hand is not None]
    return deduplicate_hands(found, width, height)
336
+
337
def detect_video(self, image_rgb: np.ndarray):
    """VIDEO mode, like MediaPipe's: track from the previous frame's ROIs.

    ROIs derived from the previous frame's landmarks are reused, and palm
    detection runs only when fewer than num_hands hands are being tracked
    (HandAssociationCalculator logic: tracked rects take precedence, new
    detections overlapping IoU>0.5 are dropped). The ROIs for the next
    frame are stored before returning.
    """
    rois = list(self._tracked_rects)
    if len(rois) < self.num_hands:
        for candidate in self._detect_rects(image_rgb):
            # Accept a detection only if it does not overlap an ROI already
            # in the list, tracked or just added.
            if all(_rect_iou(candidate, known) <= 0.5 for known in rois):
                rois.append(candidate)
        rois = rois[: self.num_hands]

    height, width = image_rgb.shape[:2]
    results = (self._landmarks(image_rgb, *roi) for roi in rois)
    found = [hand for hand in results if hand is not None]
    found = deduplicate_hands(found, width, height)
    self._tracked_rects = [
        rect_from_landmarks(hand["landmarks"], width, height) for hand in found
    ]
    return found
360
+
361
def reset(self):
    """Forget all tracked ROIs so the next video frame starts from detection."""
    self._tracked_rects = list()
363
+
364
def _detect_rects(self, image_rgb):
    """Palm detection -> transformed hand ROI rects (cx, cy, w, h, rot).

    Runs the detector on the letterboxed full image, decodes and merges the
    detections, projects them back to image space and turns each one into a
    rotated, shifted, squared and scaled ROI in normalized image coordinates.
    """
    ih, iw = image_rgb.shape[:2]
    width_f, height_f = F(iw), F(ih)
    zero, two = F(0.0), F(2.0)

    # --- palm detection on the letterboxed square ROI of the full image ---
    full_side = max(iw, ih)
    tensor = crop_rotated_rect(
        image_rgb, F(0.5) * width_f, F(0.5) * height_f,
        full_side, full_side, 0.0, DETECT_SIZE, cv2.BORDER_CONSTANT,
    )
    raw_boxes, raw_scores = self.detector(tensor[None])
    palms = weighted_nms(
        decode_detections(raw_boxes[0], raw_scores[0], self.anchors))

    # --- project detections to image space, convert to rects, transform ---
    to_image = letterbox_projection(iw, ih)
    rois = []
    for palm in palms:
        det = project_detection(palm, to_image)

        # DetectionsToRectsCalculator
        box_w, box_h = det["w"], det["h"]
        cx = det["xmin"] + box_w / two
        cy = det["ymin"] + box_h / two
        wrist, middle = det["kp"][0], det["kp"][2]  # wrist center, middle MCP
        rotation = compute_rotation(
            wrist[0] * width_f, wrist[1] * height_f,
            middle[0] * width_f, middle[1] * height_f,
        )

        # RectTransformationCalculator: scale 2.6, shift_y -0.5, square_long
        sin_r, cos_r = F(math.sin(rotation)), F(math.cos(rotation))
        if float(rotation) == 0.0:
            cx, cy = cx + box_w * zero, cy + box_h * RECT_SHIFT_Y
        else:
            # shift_x is 0; its terms are kept to preserve the op order.
            shift_x = (width_f * box_w * zero * cos_r
                       - height_f * box_h * RECT_SHIFT_Y * sin_r) / width_f
            shift_y = (width_f * box_w * zero * sin_r
                       + height_f * box_h * RECT_SHIFT_Y * cos_r) / height_f
            cx, cy = cx + shift_x, cy + shift_y

        longest = max(box_w * width_f, box_h * height_f)
        rois.append((
            cx,
            cy,
            longest / width_f * RECT_SCALE,
            longest / height_f * RECT_SCALE,
            rotation,
        ))
    return rois
401
+
402
+ def _landmarks(self, image_rgb, cx, cy, rect_w, rect_h, rotation):
403
+ ih, iw = image_rgb.shape[:2]
404
+ crop = crop_rotated_rect(
405
+ image_rgb, F(cx) * F(iw), F(cy) * F(ih), F(rect_w) * F(iw),
406
+ F(rect_h) * F(ih), rotation, LANDMARK_SIZE, cv2.BORDER_REPLICATE,
407
+ )
408
+ lm_raw, presence, handedness_raw, world_raw = self.landmarker(crop[None])
409
+
410
+ # ThresholdingCalculator: hand is present only if score > threshold
411
+ if not F(presence.reshape(-1)[0]) > MIN_HAND_PRESENCE_CONFIDENCE:
412
+ return None
413
+
414
+ # TensorsToClassificationCalculator binary_classification:
415
+ # label_items[0] = Right (score s), label_items[1] = Left (score 1-s)
416
+ s = F(handedness_raw.reshape(-1)[0])
417
+ label, score = ("Right", s) if s >= F(0.5) else ("Left", F(1.0) - s)
418
+
419
+ # TensorsToLandmarksCalculator: x,y /= 224; z /= 224 then /= 0.4
420
+ lm = np.asarray(lm_raw).reshape(21, 3)
421
+ size = F(LANDMARK_SIZE)
422
+ nz = F(LANDMARKS_NORMALIZE_Z)
423
+
424
+ # LandmarkProjectionCalculator (square-ROI NORM_RECT path), float32
425
+ sin_a, cos_a = F(math.sin(rotation)), F(math.cos(rotation))
426
+ landmarks = np.zeros((21, 3), dtype=np.float32)
427
+ world = np.zeros((21, 3), dtype=np.float32)
428
+ wl = np.asarray(world_raw).reshape(21, 3)
429
+ for i in range(21):
430
+ x = lm[i, 0] / size - F(0.5)
431
+ y = lm[i, 1] / size - F(0.5)
432
+ z = lm[i, 2] / size / nz
433
+ nx = cos_a * x - sin_a * y
434
+ ny = sin_a * x + cos_a * y
435
+ landmarks[i, 0] = nx * F(rect_w) + F(cx)
436
+ landmarks[i, 1] = ny * F(rect_h) + F(cy)
437
+ landmarks[i, 2] = z * F(rect_w)
438
+ # WorldLandmarkProjectionCalculator: rotate xy by rect angle
439
+ world[i, 0] = cos_a * wl[i, 0] - sin_a * wl[i, 1]
440
+ world[i, 1] = sin_a * wl[i, 0] + cos_a * wl[i, 1]
441
+ world[i, 2] = wl[i, 2]
442
+
443
+ return {"handedness": label, "score": float(score),
444
+ "landmarks": landmarks, "world_landmarks": world}
445
+
446
+
447
def draw(image_bgr, hands):
    """Return a copy of a BGR image with each hand's skeleton drawn on it.

    The input image is left untouched.  Every entry of ``hands`` must provide
    ``hand["landmarks"]`` as (x, y, z) triples; x and y are scaled by the
    image width and height to obtain pixel positions.
    """
    canvas = image_bgr.copy()
    height, width = canvas.shape[:2]
    for hand in hands:
        pixels = []
        for x, y, _ in hand["landmarks"]:
            pixels.append((int(x * width), int(y * height)))
        # Bones first (green), then joints (filled red) on top of them.
        for start, end in HAND_CONNECTIONS:
            cv2.line(canvas, pixels[start], pixels[end], (0, 255, 0), 2)
        for px, py in pixels:
            cv2.circle(canvas, (px, py), 4, (0, 0, 255), -1)
    return canvas