fasthands 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fasthands-0.1.0/.gitignore +5 -0
- fasthands-0.1.0/LICENSE +28 -0
- fasthands-0.1.0/PKG-INFO +78 -0
- fasthands-0.1.0/PYPI_README.md +60 -0
- fasthands-0.1.0/pyproject.toml +46 -0
- fasthands-0.1.0/src/fasthands/__init__.py +17 -0
- fasthands-0.1.0/src/fasthands/cli.py +131 -0
- fasthands-0.1.0/src/fasthands/coreml.py +40 -0
- fasthands-0.1.0/src/fasthands/models/hand_detector.mlpackage/Data/com.apple.CoreML/model.mlmodel +0 -0
- fasthands-0.1.0/src/fasthands/models/hand_detector.mlpackage/Data/com.apple.CoreML/weights/weight.bin +0 -0
- fasthands-0.1.0/src/fasthands/models/hand_detector.mlpackage/Manifest.json +18 -0
- fasthands-0.1.0/src/fasthands/models/hand_landmarks_detector.mlpackage/Data/com.apple.CoreML/model.mlmodel +0 -0
- fasthands-0.1.0/src/fasthands/models/hand_landmarks_detector.mlpackage/Data/com.apple.CoreML/weights/weight.bin +0 -0
- fasthands-0.1.0/src/fasthands/models/hand_landmarks_detector.mlpackage/Manifest.json +18 -0
- fasthands-0.1.0/src/fasthands/pipeline.py +457 -0
fasthands-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Vimal Mollyn
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
The bundled hand detection and hand landmark model weights are derived from
|
|
26
|
+
Google's MediaPipe hand_landmarker.task models, licensed under the Apache
|
|
27
|
+
License, Version 2.0 (https://www.apache.org/licenses/LICENSE-2.0).
|
|
28
|
+
Copyright Google LLC.
|
fasthands-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: fasthands
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Fastest MediaPipe-compatible hand tracker: hand landmarks on the Apple Neural Engine at 0.7 ms/frame
|
|
5
|
+
Project-URL: Repository, https://github.com/VimalMollyn/fasthands
|
|
6
|
+
Author-email: Vimal Mollyn <vmollyn@andrew.cmu.edu>
|
|
7
|
+
License-Expression: MIT
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Keywords: coreml,hand-landmarks,hand-tracking,mediapipe,neural-engine
|
|
10
|
+
Classifier: Operating System :: MacOS
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Topic :: Scientific/Engineering :: Image Recognition
|
|
13
|
+
Requires-Python: >=3.10
|
|
14
|
+
Requires-Dist: coremltools>=8.0
|
|
15
|
+
Requires-Dist: numpy>=1.24
|
|
16
|
+
Requires-Dist: opencv-python>=4.8
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
|
|
19
|
+
# fasthands
|
|
20
|
+
|
|
21
|
+
**The fastest MediaPipe-compatible hand tracker.** MediaPipe Hands' official
|
|
22
|
+
models running on the Apple Neural Engine via CoreML — **0.7 ms per tracked
|
|
23
|
+
frame** on Apple Silicon, ~5× faster than MediaPipe itself, with a faithful
|
|
24
|
+
port of the full HandLandmarker pipeline (SSD anchors, weighted NMS, ROI
|
|
25
|
+
tracking, landmark projection, deduplication).
|
|
26
|
+
|
|
27
|
+
macOS / Apple Silicon only.
|
|
28
|
+
|
|
29
|
+
## Install
|
|
30
|
+
|
|
31
|
+
```sh
|
|
32
|
+
pip install fasthands
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
## Use
|
|
36
|
+
|
|
37
|
+
```python
|
|
38
|
+
import cv2
|
|
39
|
+
import fasthands
|
|
40
|
+
|
|
41
|
+
tracker = fasthands.load(num_hands=1)
|
|
42
|
+
|
|
43
|
+
image = cv2.cvtColor(cv2.imread("hand.jpg"), cv2.COLOR_BGR2RGB)
|
|
44
|
+
hands = tracker(image) # single image
|
|
45
|
+
# hands = tracker.detect_video(frame) # video: tracks between frames, ~0.7 ms
|
|
46
|
+
|
|
47
|
+
for hand in hands:
|
|
48
|
+
print(hand["handedness"], hand["score"])
|
|
49
|
+
print(hand["landmarks"]) # 21 x (x, y, z), normalized image coords
|
|
50
|
+
print(hand["world_landmarks"]) # 21 x (x, y, z), meters, hand-centered
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
Or from the command line:
|
|
54
|
+
|
|
55
|
+
```sh
|
|
56
|
+
fasthands photo.jpg --out annotated.jpg
|
|
57
|
+
fasthands-webcam --mirror # live demo with FPS overlay
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## Speed (Apple M4, 540×720 frame, one hand)
|
|
61
|
+
|
|
62
|
+
| | tracking | detect + track |
|
|
63
|
+
|---|---|---|
|
|
64
|
+
| **fasthands (ANE)** | **0.7 ms** | **1.9 ms** |
|
|
65
|
+
| mediapipe (XNNPACK CPU) | 3.3 ms | 8.7 ms |
|
|
66
|
+
|
|
67
|
+
Landmarks agree with MediaPipe to ~1e-3 (Neural Engine fp16); the pipeline
|
|
68
|
+
logic itself is verified to MediaPipe's own float32 reproducibility floor.
|
|
69
|
+
|
|
70
|
+
## How
|
|
71
|
+
|
|
72
|
+
The `hand_landmarker.task` models are converted to CoreML, and every MediaPipe
|
|
73
|
+
calculator in the pipeline (anchors, decode, weighted NMS, rect transforms,
|
|
74
|
+
rotated crops, projections, VIDEO-mode ROI tracking, dedup) is reimplemented
|
|
75
|
+
in numpy with float32 op-order fidelity. Model weights © Google, Apache 2.0.
|
|
76
|
+
|
|
77
|
+
Source, the PyTorch reference implementation, and the full verification
|
|
78
|
+
harness: https://github.com/VimalMollyn/fasthands
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# fasthands
|
|
2
|
+
|
|
3
|
+
**The fastest MediaPipe-compatible hand tracker.** MediaPipe Hands' official
|
|
4
|
+
models running on the Apple Neural Engine via CoreML — **0.7 ms per tracked
|
|
5
|
+
frame** on Apple Silicon, ~5× faster than MediaPipe itself, with a faithful
|
|
6
|
+
port of the full HandLandmarker pipeline (SSD anchors, weighted NMS, ROI
|
|
7
|
+
tracking, landmark projection, deduplication).
|
|
8
|
+
|
|
9
|
+
macOS / Apple Silicon only.
|
|
10
|
+
|
|
11
|
+
## Install
|
|
12
|
+
|
|
13
|
+
```sh
|
|
14
|
+
pip install fasthands
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
## Use
|
|
18
|
+
|
|
19
|
+
```python
|
|
20
|
+
import cv2
|
|
21
|
+
import fasthands
|
|
22
|
+
|
|
23
|
+
tracker = fasthands.load(num_hands=1)
|
|
24
|
+
|
|
25
|
+
image = cv2.cvtColor(cv2.imread("hand.jpg"), cv2.COLOR_BGR2RGB)
|
|
26
|
+
hands = tracker(image) # single image
|
|
27
|
+
# hands = tracker.detect_video(frame) # video: tracks between frames, ~0.7 ms
|
|
28
|
+
|
|
29
|
+
for hand in hands:
|
|
30
|
+
print(hand["handedness"], hand["score"])
|
|
31
|
+
print(hand["landmarks"]) # 21 x (x, y, z), normalized image coords
|
|
32
|
+
print(hand["world_landmarks"]) # 21 x (x, y, z), meters, hand-centered
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
Or from the command line:
|
|
36
|
+
|
|
37
|
+
```sh
|
|
38
|
+
fasthands photo.jpg --out annotated.jpg
|
|
39
|
+
fasthands-webcam --mirror # live demo with FPS overlay
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## Speed (Apple M4, 540×720 frame, one hand)
|
|
43
|
+
|
|
44
|
+
| | tracking | detect + track |
|
|
45
|
+
|---|---|---|
|
|
46
|
+
| **fasthands (ANE)** | **0.7 ms** | **1.9 ms** |
|
|
47
|
+
| mediapipe (XNNPACK CPU) | 3.3 ms | 8.7 ms |
|
|
48
|
+
|
|
49
|
+
Landmarks agree with MediaPipe to ~1e-3 (Neural Engine fp16); the pipeline
|
|
50
|
+
logic itself is verified to MediaPipe's own float32 reproducibility floor.
|
|
51
|
+
|
|
52
|
+
## How
|
|
53
|
+
|
|
54
|
+
The `hand_landmarker.task` models are converted to CoreML, and every MediaPipe
|
|
55
|
+
calculator in the pipeline (anchors, decode, weighted NMS, rect transforms,
|
|
56
|
+
rotated crops, projections, VIDEO-mode ROI tracking, dedup) is reimplemented
|
|
57
|
+
in numpy with float32 op-order fidelity. Model weights © Google, Apache 2.0.
|
|
58
|
+
|
|
59
|
+
Source, the PyTorch reference implementation, and the full verification
|
|
60
|
+
harness: https://github.com/VimalMollyn/fasthands
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "fasthands"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Fastest MediaPipe-compatible hand tracker: hand landmarks on the Apple Neural Engine at 0.7 ms/frame"
|
|
5
|
+
readme = "PYPI_README.md"
|
|
6
|
+
requires-python = ">=3.10"
|
|
7
|
+
license = "MIT"
|
|
8
|
+
authors = [{ name = "Vimal Mollyn", email = "vmollyn@andrew.cmu.edu" }]
|
|
9
|
+
keywords = ["hand-tracking", "mediapipe", "coreml", "neural-engine", "hand-landmarks"]
|
|
10
|
+
classifiers = [
|
|
11
|
+
"Operating System :: MacOS",
|
|
12
|
+
"Programming Language :: Python :: 3",
|
|
13
|
+
"Topic :: Scientific/Engineering :: Image Recognition",
|
|
14
|
+
]
|
|
15
|
+
dependencies = [
|
|
16
|
+
"coremltools>=8.0",
|
|
17
|
+
"numpy>=1.24",
|
|
18
|
+
"opencv-python>=4.8",
|
|
19
|
+
]
|
|
20
|
+
|
|
21
|
+
[project.urls]
|
|
22
|
+
Repository = "https://github.com/VimalMollyn/fasthands"
|
|
23
|
+
|
|
24
|
+
[project.scripts]
|
|
25
|
+
fasthands = "fasthands.cli:main"
|
|
26
|
+
fasthands-webcam = "fasthands.cli:webcam"
|
|
27
|
+
|
|
28
|
+
[build-system]
|
|
29
|
+
requires = ["hatchling"]
|
|
30
|
+
build-backend = "hatchling.build"
|
|
31
|
+
|
|
32
|
+
[tool.hatch.build.targets.wheel]
|
|
33
|
+
packages = ["src/fasthands"]
|
|
34
|
+
|
|
35
|
+
[tool.hatch.build.targets.sdist]
|
|
36
|
+
include = ["src", "PYPI_README.md", "LICENSE"]
|
|
37
|
+
|
|
38
|
+
[dependency-groups]
|
|
39
|
+
# research / verification tooling for this repo, not shipped with the package
|
|
40
|
+
dev = [
|
|
41
|
+
"mediapipe>=0.10.35",
|
|
42
|
+
"torch>=2.12.0",
|
|
43
|
+
"ai-edge-litert>=2.1.5",
|
|
44
|
+
"tflite>=2.18.0",
|
|
45
|
+
"pytest>=8",
|
|
46
|
+
]
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""fasthands — the fastest MediaPipe-compatible hand tracker.
|
|
2
|
+
|
|
3
|
+
MediaPipe Hands' models running on the Apple Neural Engine via CoreML,
|
|
4
|
+
with a faithful port of the HandLandmarker pipeline. ~0.7 ms per tracked
|
|
5
|
+
frame on Apple Silicon (~5x faster than MediaPipe itself).
|
|
6
|
+
|
|
7
|
+
import fasthands
|
|
8
|
+
tracker = fasthands.load(num_hands=1)
|
|
9
|
+
hands = tracker.detect_video(rgb_frame) # tracking (video) mode
|
|
10
|
+
hands = tracker(rgb_image) # single-image mode
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from .coreml import load
|
|
14
|
+
from .pipeline import HAND_CONNECTIONS, HandLandmarker, draw
|
|
15
|
+
|
|
16
|
+
__version__ = "0.1.0"
|
|
17
|
+
__all__ = ["load", "HandLandmarker", "HAND_CONNECTIONS", "draw", "__version__"]
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
"""Command-line entry points: `fasthands <image>` and `fasthands-webcam`."""
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import json
|
|
5
|
+
import threading
|
|
6
|
+
import time
|
|
7
|
+
|
|
8
|
+
import cv2
|
|
9
|
+
|
|
10
|
+
from . import load
|
|
11
|
+
from .pipeline import draw
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def main():
    """Entry point for ``fasthands <image>``: track hands in a single image.

    Parses the command line, reads the image with OpenCV, runs the tracker in
    single-image mode and prints every landmark. With ``--json`` the landmarks
    are also written as JSON; with ``--out`` an annotated copy of the image is
    written.

    Raises:
        SystemExit: if the input image cannot be read, or the annotated image
            cannot be written.
    """
    parser = argparse.ArgumentParser(
        description="fasthands: hand landmarks on the Neural Engine")
    parser.add_argument("image", help="path to input image")
    parser.add_argument("--out", default=None, help="annotated output image path")
    parser.add_argument("--json", dest="json_path", default=None,
                        help="write landmarks as JSON")
    parser.add_argument("--num-hands", type=int, default=2)
    parser.add_argument("--compute-units", default="ALL",
                        choices=["ALL", "CPU_AND_NE", "CPU_AND_GPU", "CPU_ONLY"])
    args = parser.parse_args()

    image_bgr = cv2.imread(args.image)
    if image_bgr is None:
        raise SystemExit(f"could not read image: {args.image}")
    tracker = load(num_hands=args.num_hands, compute_units=args.compute_units)
    # OpenCV decodes to BGR; the tracker is fed RGB.
    hands = tracker(cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB))

    print(f"detected {len(hands)} hand(s)")
    for i, hand in enumerate(hands):
        print(f"\nhand {i}: {hand['handedness']} (score {hand['score']:.4f})")
        for j, (x, y, z) in enumerate(hand["landmarks"]):
            print(f"  lm[{j:2d}] x={x:.6f} y={y:.6f} z={z:.6f}")

    if args.json_path:
        # Everything numeric goes through float(): NumPy scalars such as
        # float32 are rejected by the json module, and the score was the one
        # value that used to be passed through unconverted.
        dump = [{
            "handedness": h["handedness"], "score": float(h["score"]),
            "landmarks": [{"x": float(x), "y": float(y), "z": float(z)}
                          for x, y, z in h["landmarks"]],
            "world_landmarks": [{"x": float(x), "y": float(y), "z": float(z)}
                                for x, y, z in h["world_landmarks"]],
        } for h in hands]
        with open(args.json_path, "w", encoding="utf-8") as f:
            json.dump(dump, f, indent=2)
        print(f"\nlandmarks written to {args.json_path}")
    if args.out:
        # cv2.imwrite reports failure through its return value, not an
        # exception, so check it before claiming success.
        if not cv2.imwrite(args.out, draw(image_bgr, hands)):
            raise SystemExit(f"could not write image: {args.out}")
        print(f"annotated image written to {args.out}")
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class _Camera:
    """Threaded capture: always serves the latest frame.

    A daemon thread keeps reading from the ``cv2.VideoCapture`` and overwrites
    ``self.frame``; ``read()`` hands out a copy of the most recent frame.
    """

    def __init__(self, index):
        """Open camera ``index`` and block until the first frame (or a failure).

        Raises:
            SystemExit: if the camera cannot be opened.
        """
        self.cap = cv2.VideoCapture(index)
        if not self.cap.isOpened():
            raise SystemExit(f"could not open camera {index}")
        self.frame = None
        self.ok = True
        self.lock = threading.Lock()
        # Keep the handle so release() can wait for the reader to stop.
        self._thread = threading.Thread(target=self._loop, daemon=True)
        self._thread.start()
        # Wait for the first frame; stops early if the first read fails.
        while self.ok and self.frame is None:
            time.sleep(0.01)

    def _loop(self):
        """Reader thread body: store every frame until a read fails or release()."""
        while self.ok:
            ok, frame = self.cap.read()
            if not ok:
                self.ok = False
                break
            with self.lock:
                self.frame = frame

    def read(self):
        """Return ``(ok, frame)``; ``frame`` is a copy of the latest frame or None."""
        with self.lock:
            return self.ok, None if self.frame is None else self.frame.copy()

    def release(self):
        """Stop the reader thread, then release the capture device."""
        self.ok = False
        # Let the reader leave cap.read() before the device is released; the
        # timeout keeps shutdown from hanging on a stalled camera.
        self._thread.join(timeout=1.0)
        self.cap.release()
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def webcam():
    """Entry point for ``fasthands-webcam``: live tracking demo with an overlay.

    Reads frames from the selected camera, runs the tracker in video mode,
    draws the hands plus a smoothed FPS / inference-time readout, and shows the
    result in a window until ``q`` or Esc is pressed or the camera stops
    delivering frames.
    """
    parser = argparse.ArgumentParser(description="fasthands live webcam demo")
    parser.add_argument("--camera", type=int, default=0)
    parser.add_argument("--mirror", action="store_true", help="selfie view")
    parser.add_argument("--num-hands", type=int, default=1)
    parser.add_argument("--compute-units", default="ALL",
                        choices=["ALL", "CPU_AND_NE", "CPU_AND_GPU", "CPU_ONLY"])
    args = parser.parse_args()

    tracker = load(num_hands=args.num_hands, compute_units=args.compute_units)
    cap = _Camera(args.camera)

    fps, infer_ms = 0.0, 0.0
    prev = time.perf_counter()
    # try/finally so the camera and the window are released even when the loop
    # is left through an exception (e.g. Ctrl-C).
    try:
        while True:
            ok, frame = cap.read()
            if not ok or frame is None:
                break
            if args.mirror:
                frame = cv2.flip(frame, 1)

            t0 = time.perf_counter()
            hands = tracker.detect_video(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            # Exponential moving average of the inference time in milliseconds.
            infer_ms = 0.9 * infer_ms + 0.1 * (time.perf_counter() - t0) * 1000
            frame = draw(frame, hands)
            for hand in hands:
                # Label each hand near landmark 0 (normalized -> pixel coords).
                x, y = hand["landmarks"][0][:2]
                cv2.putText(frame, f"{hand['handedness']} {hand['score']:.2f}",
                            (int(x * frame.shape[1]) - 30, int(y * frame.shape[0]) + 30),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 0), 2)

            now = time.perf_counter()
            # First iteration seeds the average; later ones smooth it.
            fps = (0.9 * fps + 0.1 / (now - prev)) if fps else 1.0 / (now - prev)
            prev = now
            cv2.putText(frame, f"{fps:.1f} FPS  {infer_ms:.1f} ms", (10, 30),
                        cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0, 255, 255), 2)
            cv2.imshow("fasthands", frame)
            if cv2.waitKey(1) & 0xFF in (ord("q"), 27):
                break
    finally:
        cap.release()
        cv2.destroyAllWindows()
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
if __name__ == "__main__":
|
|
131
|
+
main()
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""CoreML inference backend (Neural Engine capable) with bundled models."""
|
|
2
|
+
|
|
3
|
+
import sys
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
|
|
8
|
+
from .pipeline import HandLandmarker
|
|
9
|
+
|
|
10
|
+
MODELS_DIR = Path(__file__).parent / "models"
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class CoreMLBackend:
    """Callable wrapper around a CoreML model loaded through coremltools."""

    def __init__(self, path, compute_units="ALL"):
        """Load the model at ``path`` on the requested compute units.

        ``compute_units`` is the name of a ``coremltools.ComputeUnit`` member.

        Raises:
            RuntimeError: when not running on macOS.
        """
        if sys.platform != "darwin":
            raise RuntimeError("fasthands requires macOS (CoreML)")
        # Imported lazily, only after the platform check has passed.
        import coremltools as ct

        units = ct.ComputeUnit[compute_units]
        self.model = ct.models.MLModel(str(path), compute_units=units)
        declared_outputs = self.model.get_spec().description.output
        # Remember the output order declared by the model spec.
        self.output_names = [entry.name for entry in declared_outputs]

    def __call__(self, x: np.ndarray):
        """Run the model on ``x`` (fed as input "image"); outputs in spec order."""
        prediction = self.model.predict({"image": x})
        return [prediction[name] for name in self.output_names]
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def load(num_hands: int = 2, compute_units: str = "ALL",
         models_dir=MODELS_DIR) -> HandLandmarker:
    """Build a HandLandmarker backed by the two CoreML model packages.

    compute_units: ALL (Neural Engine + GPU + CPU), CPU_AND_NE, CPU_AND_GPU,
    or CPU_ONLY.
    models_dir: directory holding ``hand_detector.mlpackage`` and
    ``hand_landmarks_detector.mlpackage``.
    """
    root = Path(models_dir)
    detector_backend = CoreMLBackend(root / "hand_detector.mlpackage", compute_units)
    landmark_backend = CoreMLBackend(
        root / "hand_landmarks_detector.mlpackage", compute_units)
    return HandLandmarker(detector_backend, landmark_backend, num_hands=num_hands)
|
fasthands-0.1.0/src/fasthands/models/hand_detector.mlpackage/Data/com.apple.CoreML/model.mlmodel
ADDED
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
{
|
|
2
|
+
"fileFormatVersion": "1.0.0",
|
|
3
|
+
"itemInfoEntries": {
|
|
4
|
+
"7CBC671B-4858-4DC4-80A9-AA56D588F465": {
|
|
5
|
+
"author": "com.apple.CoreML",
|
|
6
|
+
"description": "CoreML Model Specification",
|
|
7
|
+
"name": "model.mlmodel",
|
|
8
|
+
"path": "com.apple.CoreML/model.mlmodel"
|
|
9
|
+
},
|
|
10
|
+
"95186DCA-2FC9-41F8-B09F-1A3EF04C9006": {
|
|
11
|
+
"author": "com.apple.CoreML",
|
|
12
|
+
"description": "CoreML Model Weights",
|
|
13
|
+
"name": "weights",
|
|
14
|
+
"path": "com.apple.CoreML/weights"
|
|
15
|
+
}
|
|
16
|
+
},
|
|
17
|
+
"rootModelIdentifier": "7CBC671B-4858-4DC4-80A9-AA56D588F465"
|
|
18
|
+
}
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
{
|
|
2
|
+
"fileFormatVersion": "1.0.0",
|
|
3
|
+
"itemInfoEntries": {
|
|
4
|
+
"4B1E732F-140B-42D1-A2E5-4377545D8444": {
|
|
5
|
+
"author": "com.apple.CoreML",
|
|
6
|
+
"description": "CoreML Model Specification",
|
|
7
|
+
"name": "model.mlmodel",
|
|
8
|
+
"path": "com.apple.CoreML/model.mlmodel"
|
|
9
|
+
},
|
|
10
|
+
"DFF8F2DB-240E-491C-8B01-E606ED9B5740": {
|
|
11
|
+
"author": "com.apple.CoreML",
|
|
12
|
+
"description": "CoreML Model Weights",
|
|
13
|
+
"name": "weights",
|
|
14
|
+
"path": "com.apple.CoreML/weights"
|
|
15
|
+
}
|
|
16
|
+
},
|
|
17
|
+
"rootModelIdentifier": "4B1E732F-140B-42D1-A2E5-4377545D8444"
|
|
18
|
+
}
|
|
@@ -0,0 +1,457 @@
|
|
|
1
|
+
"""MediaPipe HandLandmarker pipeline, faithfully ported (numpy + OpenCV only).
|
|
2
|
+
|
|
3
|
+
Mirrors MediaPipe's calculators in float32 operation order: SSD anchors,
|
|
4
|
+
detection decode, weighted NMS (in tensor space), detection projection,
|
|
5
|
+
rect transformation, rotated-rect cropping, landmark projection, VIDEO-mode
|
|
6
|
+
tracking (landmarks -> next-frame ROI) and hand deduplication.
|
|
7
|
+
|
|
8
|
+
Inference backends are injected: any callable taking a float32 NHWC array
|
|
9
|
+
[1, H, W, 3] in [0, 1] and returning the model's output arrays in order.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import math
|
|
13
|
+
|
|
14
|
+
import cv2
|
|
15
|
+
import numpy as np
|
|
16
|
+
|
|
17
|
+
F = np.float32
|
|
18
|
+
|
|
19
|
+
# ----------------------------------------------------------------------------
|
|
20
|
+
# Constants from mediapipe/tasks/cc/vision/hand_detector/hand_detector_graph.cc
|
|
21
|
+
# and hand_landmarker/hand_landmarks_detector_graph.cc
|
|
22
|
+
# ----------------------------------------------------------------------------
|
|
23
|
+
DETECT_SIZE = 192
|
|
24
|
+
LANDMARK_SIZE = 224
|
|
25
|
+
NUM_KEYPOINTS = 7
|
|
26
|
+
MIN_DETECTION_CONFIDENCE = F(0.5)
|
|
27
|
+
MIN_HAND_PRESENCE_CONFIDENCE = F(0.5)
|
|
28
|
+
NMS_THRESHOLD = F(0.3)
|
|
29
|
+
SCORE_CLIPPING_THRESH = F(100.0)
|
|
30
|
+
RECT_SCALE = F(2.6) # RectTransformationCalculator scale_x/scale_y
|
|
31
|
+
RECT_SHIFT_Y = F(-0.5) # RectTransformationCalculator shift_y
|
|
32
|
+
LANDMARKS_NORMALIZE_Z = 0.4 # TensorsToLandmarksCalculator normalize_z
|
|
33
|
+
# NOTE: the tasks HandDetectorGraph sets rotation_vector_target_angle(90) on
|
|
34
|
+
# DetectionsToRectsCalculator -- that proto field is in RADIANS (the _degrees
|
|
35
|
+
# variant is a separate field), so the effective target angle really is
|
|
36
|
+
# 90 rad (= 2.0354 rad mod 2pi), not pi/2. We reproduce that behavior.
|
|
37
|
+
ROTATION_TARGET_ANGLE = 90.0
|
|
38
|
+
|
|
39
|
+
HAND_CONNECTIONS = [
|
|
40
|
+
(0, 1), (1, 2), (2, 3), (3, 4),
|
|
41
|
+
(0, 5), (5, 6), (6, 7), (7, 8),
|
|
42
|
+
(5, 9), (9, 10), (10, 11), (11, 12),
|
|
43
|
+
(9, 13), (13, 14), (14, 15), (15, 16),
|
|
44
|
+
(13, 17), (17, 18), (18, 19), (19, 20),
|
|
45
|
+
(0, 17),
|
|
46
|
+
]
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
# ----------------------------------------------------------------------------
|
|
50
|
+
# SSD anchors — mediapipe/calculators/tflite/ssd_anchors_calculator.cc with the
|
|
51
|
+
# hand-detector config (num_layers=4, strides 8,16,16,16, scales .1484375-.75)
|
|
52
|
+
# ----------------------------------------------------------------------------
|
|
53
|
+
def generate_anchors() -> np.ndarray:
    """Build the SSD anchor table as a float32 array of (cx, cy, w, h) rows.

    Consecutive layers sharing a stride are merged into one feature map; each
    merged layer contributes two anchors per grid cell, all at the cell center
    and with fixed size w = h = 1.0.
    """
    layer_strides = (8, 16, 16, 16)
    layer_count = len(layer_strides)
    rows = []
    start = 0
    while start < layer_count:
        stride = layer_strides[start]
        # Extend the group over every following layer with the same stride.
        end = start
        while end < layer_count and layer_strides[end] == stride:
            end += 1
        # Two anchors (aspect-ratio 1.0 + interpolated) per merged layer.
        per_cell = 2 * (end - start)
        grid = math.ceil(DETECT_SIZE / stride)
        for gy in range(grid):
            center_y = F(gy + 0.5) / F(grid)
            for gx in range(grid):
                center_x = F(gx + 0.5) / F(grid)
                rows.extend([(center_x, center_y, 1.0, 1.0)] * per_cell)
        start = end
    return np.array(rows, dtype=np.float32)  # [2016, 4] cx, cy, w, h
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
# ----------------------------------------------------------------------------
|
|
75
|
+
# ImageToTensorCalculator (OpenCV converter): rotated sub-rect -> square crop.
|
|
76
|
+
# Identical cv2 calls (boxPoints + getPerspectiveTransform + warpPerspective
|
|
77
|
+
# on uint8, then *1/255f like cv::Mat::convertTo with a float scale).
|
|
78
|
+
# ----------------------------------------------------------------------------
|
|
79
|
+
def crop_rotated_rect(image_rgb, cx, cy, w, h, rotation_rad, dst_size, border):
    """Warp a rotated rectangle of ``image_rgb`` into a square float32 crop.

    Args:
        image_rgb: source image passed straight to ``cv2.warpPerspective``.
        cx, cy: rectangle center, in the units ``cv2.boxPoints`` expects
            (presumably pixels -- confirm against the caller).
        w, h: rectangle size, same units as the center.
        rotation_rad: rectangle rotation in radians.
        dst_size: side length of the square output.
        border: OpenCV border mode used for samples outside the image.

    Returns:
        The ``dst_size`` x ``dst_size`` crop as float32, scaled by 1/255.
    """
    # Radians -> degrees: float32 product, float64 division, rounded to float32.
    angle_deg = F(np.float64(F(rotation_rad) * F(180.0)) / math.pi)
    src = cv2.boxPoints(((float(cx), float(cy)), (float(w), float(h)), float(angle_deg)))
    # Destination corners listed in the order bottom-left, top-left, top-right,
    # bottom-right of the output square, paired one-to-one with the points
    # returned by boxPoints.
    dst = np.array(
        [[0, dst_size], [0, 0], [dst_size, 0], [dst_size, dst_size]], dtype=np.float32
    )
    m = cv2.getPerspectiveTransform(src.astype(np.float32), dst)
    crop = cv2.warpPerspective(
        image_rgb, m, (dst_size, dst_size), flags=cv2.INTER_LINEAR, borderMode=border
    )
    # Scale by a single float32 constant (1/255) after converting to float32.
    return crop.astype(np.float32) * F(1.0 / 255.0)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def normalize_radians(angle: float) -> float:
    """Wrap ``angle`` (radians) into the half-open interval [-pi, pi)."""
    full_turn = 2 * math.pi
    # Number of whole turns to remove so the result lands in [-pi, pi).
    turns = math.floor((angle + math.pi) / full_turn)
    return angle - full_turn * turns
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def compute_rotation(x0, y0, x1, y1) -> F:
    """Rotation of the segment (x0, y0) -> (x1, y1), computed in float32.

    Port of DetectionsToRectsCalculator::ComputeRotation:
    rot = NormalizeRadians(90.f - atan2f(-(y1-y0), x1-x0)).
    """
    delta_y = F(y1) - F(y0)
    delta_x = F(x1) - F(x0)
    # The y difference is negated before taking the angle.
    segment_angle = F(math.atan2(-delta_y, delta_x))
    raw = F(ROTATION_TARGET_ANGLE) - segment_angle
    return F(normalize_radians(float(raw)))
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
# ----------------------------------------------------------------------------
|
|
104
|
+
# TensorsToDetectionsCalculator (decode + sigmoid scores), in tensor space
|
|
105
|
+
# ----------------------------------------------------------------------------
|
|
106
|
+
def decode_detections(raw_boxes, raw_scores, anchors):
    """Decode raw detector outputs into a list of detection dicts.

    raw_boxes [2016,18], raw_scores [2016,1] float32 -> list of dicts
    {xmin, ymin, w, h, kp[7][2], score}. Raw box values are divided by
    DETECT_SIZE (the 192x192 tensor space) and offset by the anchor center.
    Only entries whose sigmoid score is >= MIN_DETECTION_CONFIDENCE are kept,
    in anchor order.
    """
    # Clip logits before the sigmoid.
    logits = np.clip(raw_scores.reshape(-1), -SCORE_CLIPPING_THRESH, SCORE_CLIPPING_THRESH)
    # sigmoid in float64, rounded once to float32 (= correctly-rounded expf path)
    scores = (1.0 / (1.0 + np.exp(-logits.astype(np.float64)))).astype(np.float32)
    scale = F(DETECT_SIZE)
    rb = raw_boxes
    # Columns 0..3 are center x, center y, width, height relative to the anchor.
    xc = rb[:, 0] / scale * anchors[:, 2] + anchors[:, 0]
    yc = rb[:, 1] / scale * anchors[:, 3] + anchors[:, 1]
    w = rb[:, 2] / scale * anchors[:, 2]
    h = rb[:, 3] / scale * anchors[:, 3]
    xmin = xc - w / F(2.0)
    ymin = yc - h / F(2.0)
    # detection proto stores xmin/ymin/width/height
    # (width/height are recomputed as max - min rather than reusing w/h)
    width = (xc + w / F(2.0)) - xmin
    height = (yc + h / F(2.0)) - ymin
    # Keypoints: interleaved x,y pairs starting at column 4.
    kx = rb[:, 4:4 + NUM_KEYPOINTS * 2:2] / scale * anchors[:, 2:3] + anchors[:, 0:1]
    ky = rb[:, 5:5 + NUM_KEYPOINTS * 2:2] / scale * anchors[:, 3:4] + anchors[:, 1:2]
    dets = []
    for i in np.nonzero(scores >= MIN_DETECTION_CONFIDENCE)[0]:
        dets.append({
            "xmin": xmin[i], "ymin": ymin[i], "w": width[i], "h": height[i],
            "kp": [(kx[i, k], ky[i, k]) for k in range(NUM_KEYPOINTS)],
            "score": scores[i],
        })
    return dets
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
# ----------------------------------------------------------------------------
|
|
136
|
+
# NonMaxSuppressionCalculator, algorithm=WEIGHTED, overlap=IoU, thresh=0.3 —
|
|
137
|
+
# float32 accumulation in the same order as the C++ implementation.
|
|
138
|
+
# ----------------------------------------------------------------------------
|
|
139
|
+
def iou(a, b):
    """Intersection-over-union of two boxes given as {xmin, ymin, w, h} dicts.

    Returns float32 zero when the boxes do not overlap with positive area.
    """
    left = max(a["xmin"], b["xmin"])
    top = max(a["ymin"], b["ymin"])
    right = min(a["xmin"] + a["w"], b["xmin"] + b["w"])
    bottom = min(a["ymin"] + a["h"], b["ymin"] + b["h"])
    if right <= left or bottom <= top:
        return F(0.0)
    overlap = (right - left) * (bottom - top)
    combined = a["w"] * a["h"] + b["w"] * b["h"] - overlap
    return overlap / combined
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def weighted_nms(dets):
    """Weighted non-max suppression over detection dicts.

    Detections are processed in descending score order. Every remaining
    detection whose IoU with the current top detection exceeds NMS_THRESHOLD is
    merged into it: box corners and keypoints become the score-weighted mean of
    the group, while the score (and any other key) stays that of the top
    detection.

    Args:
        dets: list of dicts with keys xmin, ymin, w, h, kp, score.

    Returns:
        List of merged detection dicts (new dicts; inputs are not mutated).
    """
    remained = sorted(dets, key=lambda d: -d["score"])
    out = []
    while remained:
        top = remained[0]
        sims = [iou(d, top) for d in remained]
        candidates = [d for d, s in zip(remained, sims) if s > NMS_THRESHOLD]
        rest = [d for d, s in zip(remained, sims) if not s > NMS_THRESHOLD]
        merged = dict(top)
        if candidates:
            # float32 accumulation, one candidate at a time, in list order.
            w_xmin = w_ymin = w_xmax = w_ymax = F(0.0)
            kp_acc = [[F(0.0), F(0.0)] for _ in range(NUM_KEYPOINTS)]
            total = F(0.0)
            for c in candidates:
                total = total + c["score"]
                w_xmin = w_xmin + c["xmin"] * c["score"]
                w_ymin = w_ymin + c["ymin"] * c["score"]
                w_xmax = w_xmax + (c["xmin"] + c["w"]) * c["score"]
                w_ymax = w_ymax + (c["ymin"] + c["h"]) * c["score"]
                for k in range(NUM_KEYPOINTS):
                    kp_acc[k][0] = kp_acc[k][0] + c["kp"][k][0] * c["score"]
                    kp_acc[k][1] = kp_acc[k][1] + c["kp"][k][1] * c["score"]
            merged["xmin"] = w_xmin / total
            merged["ymin"] = w_ymin / total
            merged["w"] = (w_xmax / total) - merged["xmin"]
            merged["h"] = (w_ymax / total) - merged["ymin"]
            merged["kp"] = [(kp_acc[k][0] / total, kp_acc[k][1] / total)
                            for k in range(NUM_KEYPOINTS)]
        out.append(merged)
        # Termination guard: a top box with non-positive area (or a NaN
        # similarity) does not overlap even itself, so nothing would be removed
        # and the loop would never end. Stop when the list did not shrink.
        if len(rest) == len(remained):
            break
        remained = rest
    return out
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
# ----------------------------------------------------------------------------
|
|
183
|
+
# DetectionProjectionCalculator: tensor space -> image space through the
|
|
184
|
+
# float32 matrix produced by GetRotatedSubRectToRectTransformMatrix.
|
|
185
|
+
# ----------------------------------------------------------------------------
|
|
186
|
+
def letterbox_projection(iw, ih):
    """Matrix for the full-image keep-aspect-ratio ROI (rotation 0).

    Builds the four non-trivial matrix entries in float32 for a centered
    square of side max(iw, ih), and returns a function ``project(x, y)`` that
    applies them: x -> x * m0 + m3, y -> y * m5 + m7. The results are divided
    by the image width/height (g = 1/iw, h = 1/ih).
    """
    side = F(max(iw, ih))
    e, f = F(0.5) * F(iw), F(0.5) * F(ih)  # GetRoi: norm 0.5 * size
    g, h = F(1.0) / F(iw), F(1.0) / F(ih)
    m0 = side * F(1.0) * g  # a*c*g, c=1, d=0
    m3 = (F(-0.5) * side * F(1.0) + e) * g
    m5 = side * F(1.0) * h
    m7 = (F(-0.5) * side * F(1.0) + f) * h

    def project(x, y):
        # Each product and each sum is rounded to float32 separately.
        return F(F(x * m0) + m3), F(F(y * m5) + m7)
    return project
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def project_detection(det, project):
    """Map a detection dict through ``project`` and return a new detection.

    The four box corners are projected and the result is their axis-aligned
    bounding box; keypoints are projected one by one; the score is copied.
    """
    left, top = det["xmin"], det["ymin"]
    right, bottom = det["xmin"] + det["w"], det["ymin"] + det["h"]
    xs, ys = [], []
    for corner_x, corner_y in ((left, top), (right, top), (right, bottom), (left, bottom)):
        projected = project(corner_x, corner_y)
        xs.append(projected[0])
        ys.append(projected[1])
    x_low = min(xs)
    y_low = min(ys)
    return {
        "xmin": x_low,
        "ymin": y_low,
        "w": max(xs) - x_low,
        "h": max(ys) - y_low,
        "kp": [project(kx, ky) for kx, ky in det["kp"]],
        "score": det["score"],
    }
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
# ----------------------------------------------------------------------------
|
|
219
|
+
# HandLandmarksToRectCalculator + RectTransformation(2.0, shift_y -0.1,
|
|
220
|
+
# square_long): next-frame ROI from the current landmarks (VIDEO mode).
|
|
221
|
+
# ----------------------------------------------------------------------------
|
|
222
|
+
PARTIAL_LANDMARK_IDS = [0, 1, 2, 3, 5, 6, 9, 10, 13, 14, 17, 18]
|
|
223
|
+
TRACK_RECT_SCALE = F(2.0)
|
|
224
|
+
TRACK_RECT_SHIFT_Y = F(-0.1)
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def rect_from_landmarks(landmarks, iw, ih):
    """Derive the next-frame hand ROI from the current frame's landmarks.

    Mirrors HandLandmarksToRectCalculator followed by the
    RectTransformationCalculator step (shift, square_long, scale), using the
    module constants ``TRACK_RECT_SHIFT_Y`` and ``TRACK_RECT_SCALE``.

    Args:
        landmarks: Array indexable as ``landmarks[ids][:, :2]``; the first two
            columns are read as normalized x and y.
        iw: Image width in pixels.
        ih: Image height in pixels.

    Returns:
        Tuple ``(cx, cy, w, h, rotation)``: normalized centre, normalized
        width and height of the squared and scaled rect, and the rotation in
        radians.
    """
    lm = landmarks[PARTIAL_LANDMARK_IDS][:, :2]
    width, height = F(iw), F(ih)
    two = F(2.0)

    # rotation: wrist -> mean of index/middle/ring MCPs, target pi/2
    x0, y0 = lm[0, 0] * width, lm[0, 1] * height
    x1 = (lm[4, 0] + lm[8, 0]) / two  # index, ring
    y1 = (lm[4, 1] + lm[8, 1]) / two
    x1 = (x1 + lm[6, 0]) / two * width  # middle
    y1 = (y1 + lm[6, 1]) / two * height
    rotation = F(normalize_radians(
        float(F(math.pi * 0.5) - F(math.atan2(-(y1 - y0), x1 - x0)))))
    rev = -rotation
    sin_a, cos_a = F(math.sin(rotation)), F(math.cos(rotation))
    sin_r, cos_r = F(math.sin(rev)), F(math.cos(rev))

    # bbox center, then bbox in the de-rotated frame
    xs, ys = lm[:, 0], lm[:, 1]
    cax = (xs.max() + xs.min()) / two
    cay = (ys.max() + ys.min()) / two
    ox = (xs - cax) * width
    oy = (ys - cay) * height
    px = ox * cos_r - oy * sin_r
    py = ox * sin_r + oy * cos_r
    px_min, px_max = px.min(), px.max()
    py_min, py_max = py.min(), py.max()
    pcx = (px_max + px_min) / two
    pcy = (py_max + py_min) / two
    # Rotate the de-rotated centre back and re-normalize to image size.
    cx = (pcx * cos_a - pcy * sin_a + width * cax) / width
    cy = (pcx * sin_a + pcy * cos_a + height * cay) / height
    w = (px_max - px_min) / width
    h = (py_max - py_min) / height

    # RectTransformationCalculator: shift, square_long, scale 2.0
    if float(rotation) == 0.0:
        # Unrotated rects take the direct path, as _detect_rects does, so the
        # shift is not rounded through an extra multiply/divide by the image
        # height.  shift_x is 0, so cx is left as is.
        cy = cy + h * TRACK_RECT_SHIFT_Y
    else:
        x_shift = (-height * h * TRACK_RECT_SHIFT_Y * sin_a) / width
        y_shift = (height * h * TRACK_RECT_SHIFT_Y * cos_a) / height
        cx, cy = cx + x_shift, cy + y_shift
    long_side = max(w * width, h * height)
    return (cx, cy, long_side / width * TRACK_RECT_SCALE,
            long_side / height * TRACK_RECT_SCALE, rotation)
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
def deduplicate_hands(hands, iw, ih):
    """Drop hands that duplicate an earlier entry of ``hands``.

    Follows HandLandmarksDeduplicationCalculator: a hand is suppressed when at
    least 10 of its landmarks lie within 0.2 x the baseline palm size of the
    matching landmarks of an already-retained hand AND the two landmark
    bounding boxes overlap with IoU > 0.2.  Earlier hands win.

    Args:
        hands: Iterable of dicts, each with a ``"landmarks"`` array whose first
            two columns are normalized x and y.
        iw: Image width in pixels.
        ih: Image height in pixels.

    Returns:
        List of the retained hand dicts, in their original order.
    """
    pixel_scale = (iw, ih)

    def palm_span(points):
        # Longest side of the triangle formed by landmark rows 0, 5 and 17,
        # measured in pixels.
        pix = points[:, :2] * pixel_scale
        return max(
            np.linalg.norm(pix[0] - pix[5]),
            np.linalg.norm(pix[5] - pix[17]),
            np.linalg.norm(pix[17] - pix[0]),
        )

    def box_overlap(p, q):
        # IoU of the axis-aligned boxes around the two landmark sets.
        p_x0, p_y0 = p[:, 0].min(), p[:, 1].min()
        p_x1, p_y1 = p[:, 0].max(), p[:, 1].max()
        q_x0, q_y0 = q[:, 0].min(), q[:, 1].min()
        q_x1, q_y1 = q[:, 0].max(), q[:, 1].max()
        left, top = max(p_x0, q_x0), max(p_y0, q_y0)
        right, bottom = min(p_x1, q_x1), min(p_y1, q_y1)
        if right <= left or bottom <= top:
            return 0.0
        shared = (right - left) * (bottom - top)
        area_p = (p_x1 - p_x0) * (p_y1 - p_y0)
        area_q = (q_x1 - q_x0) * (q_y1 - q_y0)
        return shared / (area_p + area_q - shared)

    def same_hand(points, other):
        limit = max(palm_span(points), palm_span(other)) * 0.2
        gaps = np.linalg.norm(
            (points[:, :2] - other[:, :2]) * pixel_scale, axis=1)
        return (gaps < limit).sum() >= 10 and box_overlap(points, other) > 0.2

    survivors = []
    for candidate in hands:
        points = candidate["landmarks"]
        # any() stops at the first retained hand that matches.
        if not any(same_hand(points, kept["landmarks"]) for kept in survivors):
            survivors.append(candidate)
    return survivors
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
def _rect_iou(a, b):
    """Return the IoU of two rects, treating both as axis-aligned.

    Each rect is a sequence ``(cx, cy, w, h, rot)``; only the first four
    entries are read, so the rotation is ignored.  Used for association.

    Returns:
        A Python float; ``0.0`` when the rects do not overlap or only touch.
    """
    a_w, a_h = a[2], a[3]
    b_w, b_h = b[2], b[3]
    a_left, a_top = a[0] - a_w / 2, a[1] - a_h / 2
    b_left, b_top = b[0] - b_w / 2, b[1] - b_h / 2

    left = max(a_left, b_left)
    top = max(a_top, b_top)
    right = min(a_left + a_w, b_left + b_w)
    bottom = min(a_top + a_h, b_top + b_h)
    if right <= left or bottom <= top:
        return 0.0

    shared = (right - left) * (bottom - top)
    union = a_w * a_h + b_w * b_h - shared
    return float(shared / union)
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
# ----------------------------------------------------------------------------
|
|
311
|
+
# Pipeline
|
|
312
|
+
# ----------------------------------------------------------------------------
|
|
313
|
+
class HandLandmarker:
    """Hand landmark task graph driven by two pluggable inference callables.

    detector / landmarker: each is called with a float32 NHWC array
    [1,H,W,3] in [0,1] and must return the model's raw output arrays (in
    model output order).
    """

    def __init__(self, detector, landmarker, num_hands=2):
        self.num_hands = num_hands
        self.detector = detector
        self.landmarker = landmarker
        self.anchors = generate_anchors()
        # VIDEO mode: ROIs carried over to the next frame.
        self._tracked_rects = []

    def __call__(self, image_rgb: np.ndarray):
        """IMAGE mode: run palm detection and landmarks on every call.

        Returns the de-duplicated list of hand dicts; at most ``num_hands``
        detections are passed on to the landmark model.
        """
        ih, iw = image_rgb.shape[:2]
        rois = self._detect_rects(image_rgb)[: self.num_hands]
        hands = []
        for roi in rois:
            found = self._landmarks(image_rgb, *roi)
            if found is None:
                continue
            hands.append(found)
        return deduplicate_hands(hands, iw, ih)

    def detect_video(self, image_rgb: np.ndarray):
        """VIDEO mode, like MediaPipe's.

        ROIs derived from the previous frame's landmarks are reused, and palm
        detection only runs while fewer than ``num_hands`` hands are tracked
        (HandAssociationCalculator logic: tracked rects take precedence, and
        a new detection overlapping an already-accepted rect with IoU > 0.5
        is dropped).  The ROIs for the next frame are stored before returning.
        """
        rois = list(self._tracked_rects)
        if len(rois) < self.num_hands:
            for fresh in self._detect_rects(image_rgb):
                # Accept a detection only if it clears every accepted rect,
                # including detections accepted earlier in this loop.
                for known in rois:
                    if not _rect_iou(fresh, known) <= 0.5:
                        break
                else:
                    rois.append(fresh)
            rois = rois[: self.num_hands]

        ih, iw = image_rgb.shape[:2]
        found = [self._landmarks(image_rgb, *roi) for roi in rois]
        hands = deduplicate_hands(
            [hand for hand in found if hand is not None], iw, ih)
        self._tracked_rects = [
            rect_from_landmarks(hand["landmarks"], iw, ih) for hand in hands
        ]
        return hands

    def reset(self):
        """Forget the tracked ROIs, so the next detect_video call runs palm detection."""
        self._tracked_rects = []

    def _detect_rects(self, image_rgb):
        """Palm detection -> transformed hand ROI rects (cx, cy, w, h, rot)."""
        ih, iw = image_rgb.shape[:2]
        width, height = F(iw), F(ih)

        # --- palm detection on the letterboxed square ROI of the full image ---
        side = max(iw, ih)
        crop = crop_rotated_rect(
            image_rgb, F(0.5) * width, F(0.5) * height,
            side, side, 0.0, DETECT_SIZE, cv2.BORDER_CONSTANT,
        )
        raw_boxes, raw_scores = self.detector(crop[None])
        dets = weighted_nms(
            decode_detections(raw_boxes[0], raw_scores[0], self.anchors))

        # --- project detections to image space, convert to rects, transform ---
        project = letterbox_projection(iw, ih)
        projected = [project_detection(d, project) for d in dets]

        rects = []
        for det in projected:
            # DetectionsToRectsCalculator
            w, h = det["w"], det["h"]
            cx = det["xmin"] + w / F(2.0)
            cy = det["ymin"] + h / F(2.0)
            wrist, middle = det["kp"][0], det["kp"][2]
            rotation = compute_rotation(
                wrist[0] * width, wrist[1] * height,    # wrist center
                middle[0] * width, middle[1] * height,  # middle MCP
            )

            # RectTransformationCalculator: scale 2.6, shift_y -0.5, square_long
            if float(rotation) == 0.0:
                cx, cy = cx + w * F(0.0), cy + h * RECT_SHIFT_Y
            else:
                sin_a, cos_a = F(math.sin(rotation)), F(math.cos(rotation))
                x_shift = (width * w * F(0.0) * cos_a - height * h * RECT_SHIFT_Y * sin_a) / width
                y_shift = (width * w * F(0.0) * sin_a + height * h * RECT_SHIFT_Y * cos_a) / height
                cx, cy = cx + x_shift, cy + y_shift

            long_side = max(w * width, h * height)
            rects.append((
                cx,
                cy,
                long_side / width * RECT_SCALE,
                long_side / height * RECT_SCALE,
                rotation,
            ))
        return rects

    def _landmarks(self, image_rgb, cx, cy, rect_w, rect_h, rotation):
        """Run the landmark model on one ROI and project its output to the image.

        Returns None when the presence score does not exceed
        MIN_HAND_PRESENCE_CONFIDENCE; otherwise a dict with ``handedness``,
        ``score``, ``landmarks`` (21x3 float32) and ``world_landmarks``
        (21x3 float32).
        """
        ih, iw = image_rgb.shape[:2]
        roi_cx, roi_cy = F(cx), F(cy)
        roi_w, roi_h = F(rect_w), F(rect_h)
        crop = crop_rotated_rect(
            image_rgb, roi_cx * F(iw), roi_cy * F(ih), roi_w * F(iw),
            roi_h * F(ih), rotation, LANDMARK_SIZE, cv2.BORDER_REPLICATE,
        )
        lm_raw, presence, handedness_raw, world_raw = self.landmarker(crop[None])

        # ThresholdingCalculator: hand is present only if score > threshold
        presence_score = F(presence.reshape(-1)[0])
        if not presence_score > MIN_HAND_PRESENCE_CONFIDENCE:
            return None

        # TensorsToClassificationCalculator binary_classification:
        #   label_items[0] = Right (score s), label_items[1] = Left (score 1-s)
        s = F(handedness_raw.reshape(-1)[0])
        if s >= F(0.5):
            label, score = "Right", s
        else:
            label, score = "Left", F(1.0) - s

        # TensorsToLandmarksCalculator: x,y /= 224; z /= 224 then /= 0.4
        lm = np.asarray(lm_raw).reshape(21, 3)
        size = F(LANDMARK_SIZE)
        nz = F(LANDMARKS_NORMALIZE_Z)

        # LandmarkProjectionCalculator (square-ROI NORM_RECT path), float32
        sin_a, cos_a = F(math.sin(rotation)), F(math.cos(rotation))
        landmarks = np.zeros((21, 3), dtype=np.float32)
        world = np.zeros((21, 3), dtype=np.float32)
        wl = np.asarray(world_raw).reshape(21, 3)
        half = F(0.5)
        for i, (raw, world_pt) in enumerate(zip(lm, wl)):
            # Centre the crop-space point on the ROI before rotating it.
            x = raw[0] / size - half
            y = raw[1] / size - half
            z = raw[2] / size / nz
            nx = cos_a * x - sin_a * y
            ny = sin_a * x + cos_a * y
            landmarks[i, 0] = nx * roi_w + roi_cx
            landmarks[i, 1] = ny * roi_h + roi_cy
            landmarks[i, 2] = z * roi_w
            # WorldLandmarkProjectionCalculator: rotate xy by rect angle
            world[i, 0] = cos_a * world_pt[0] - sin_a * world_pt[1]
            world[i, 1] = sin_a * world_pt[0] + cos_a * world_pt[1]
            world[i, 2] = world_pt[2]

        return {
            "handedness": label,
            "score": float(score),
            "landmarks": landmarks,
            "world_landmarks": world,
        }
|
|
445
|
+
|
|
446
|
+
|
|
447
|
+
def draw(image_bgr, hands):
    """Return a copy of a BGR image with each hand's skeleton drawn on it.

    The input image is not modified.  Landmarks are read as normalized
    ``(x, y, z)`` rows and scaled by the image size to integer pixels.
    Connections are drawn as lines of colour (0, 255, 0) and landmarks as
    filled circles of colour (0, 0, 255).
    """
    canvas = image_bgr.copy()
    height, width = canvas.shape[:2]
    bone_color, joint_color = (0, 255, 0), (0, 0, 255)
    for hand in hands:
        pixels = []
        for nx, ny, _ in hand["landmarks"]:
            pixels.append((int(nx * width), int(ny * height)))
        for start, end in HAND_CONNECTIONS:
            cv2.line(canvas, pixels[start], pixels[end], bone_color, 2)
        for joint in pixels:
            cv2.circle(canvas, joint, 4, joint_color, -1)
    return canvas
|