wilor-mlx 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wilor_mlx-0.1.0/LICENSE +21 -0
- wilor_mlx-0.1.0/PKG-INFO +162 -0
- wilor_mlx-0.1.0/README.md +128 -0
- wilor_mlx-0.1.0/pyproject.toml +49 -0
- wilor_mlx-0.1.0/setup.cfg +4 -0
- wilor_mlx-0.1.0/src/wilor_mlx/__init__.py +5 -0
- wilor_mlx-0.1.0/src/wilor_mlx/convert.py +482 -0
- wilor_mlx-0.1.0/src/wilor_mlx/kernels/__init__.py +0 -0
- wilor_mlx-0.1.0/src/wilor_mlx/mano.py +246 -0
- wilor_mlx-0.1.0/src/wilor_mlx/model.py +257 -0
- wilor_mlx-0.1.0/src/wilor_mlx/refinenet.py +255 -0
- wilor_mlx-0.1.0/src/wilor_mlx/vit.py +232 -0
- wilor_mlx-0.1.0/src/wilor_mlx.egg-info/PKG-INFO +162 -0
- wilor_mlx-0.1.0/src/wilor_mlx.egg-info/SOURCES.txt +15 -0
- wilor_mlx-0.1.0/src/wilor_mlx.egg-info/dependency_links.txt +1 -0
- wilor_mlx-0.1.0/src/wilor_mlx.egg-info/requires.txt +8 -0
- wilor_mlx-0.1.0/src/wilor_mlx.egg-info/top_level.txt +1 -0
wilor_mlx-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Noah Lyons
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
wilor_mlx-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: wilor-mlx
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: WiLoR hand pose estimation for Apple Silicon, rebuilt end-to-end in MLX
|
|
5
|
+
Author: Noah Lyons
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/lyonsno/wilor-mlx
|
|
8
|
+
Project-URL: Repository, https://github.com/lyonsno/wilor-mlx
|
|
9
|
+
Project-URL: Model Card, https://huggingface.co/BasinShapers/wilor-mlx
|
|
10
|
+
Keywords: mlx,apple-silicon,hand-pose,wilor,mano,3d,real-time
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: Operating System :: MacOS
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
22
|
+
Classifier: Topic :: Scientific/Engineering :: Image Recognition
|
|
23
|
+
Requires-Python: >=3.10
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
License-File: LICENSE
|
|
26
|
+
Requires-Dist: mlx>=0.22.0
|
|
27
|
+
Requires-Dist: numpy
|
|
28
|
+
Requires-Dist: scipy
|
|
29
|
+
Requires-Dist: huggingface_hub
|
|
30
|
+
Provides-Extra: dev
|
|
31
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
32
|
+
Requires-Dist: torch; extra == "dev"
|
|
33
|
+
Dynamic: license-file
|
|
34
|
+
|
|
35
|
+
# wilor-mlx
|
|
36
|
+
|
|
37
|
+
WiLoR hand pose estimation for Apple Silicon, rebuilt end-to-end in [MLX](https://github.com/ml-explore/mlx).
|
|
38
|
+
|
|
39
|
+
A from-scratch MLX port of [WiLoR-mini](https://github.com/warmshao/WiLoR-mini) (Zhan et al., "WiLoR: End-to-end 3D hand localization and reconstruction in-the-wild") — the pose/reconstruction model including ViT backbone, MANO hand model, and RefineNet refinement stage. It expects a cropped hand image from a separate detector. First run requires `torch` for a one-time MANO conversion; after that, inference runs purely on MLX.
|
|
40
|
+
|
|
41
|
+
## Performance
|
|
42
|
+
|
|
43
|
+
Tested on Apple M4 Max, single-image inference, float32.
|
|
44
|
+
|
|
45
|
+
### Live sidecar behavior (gesture UI prototype)
|
|
46
|
+
|
|
47
|
+
The strongest launch evidence is the route we actually use for interaction: camera frame → hand crop → WiLoR-mini pose/reconstruction sidecar → hand-pose event.
|
|
48
|
+
|
|
49
|
+
In a same-harness smoke test on a freshly rebooted M4 Max, run over recent 160x120 saved frames from a gesture UI prototype, MLX runs the pose/reconstruction model stage at about 37 ms median versus 49 ms for PyTorch MPS, and the full saved-frame route at about 49 ms versus 60 ms. That is roughly a 1.3x model-stage advantage and a 1.2x full-route advantage on the fair comparison denominator we trust most right now.
|
|
50
|
+
|
|
51
|
+
Larger derived-frame stress tests widen both backends; MLX remained faster in those runs, but we treat those numbers as route/runtime stress evidence rather than the headline model benchmark.
|
|
52
|
+
|
|
53
|
+
Older app-level telemetry is what pushed us toward MLX in the first place, but clean reruns narrowed the comparison denominator enough that we are not using the old tail story as a fresh universal PyTorch-vs-MLX headline. The current public claim is narrower and stronger: WiLoR-mini now has a native MLX runtime on Apple Silicon, with live sidecar latency low enough to build interaction on.
|
|
54
|
+
|
|
55
|
+
### Why so consistent?
|
|
56
|
+
|
|
57
|
+
MLX's lazy evaluation builds a graph that can be evaluated in fewer, fused submissions. That reduces dispatch and synchronization surface area in this short-context ViT workload, which is where our traces suggest the PyTorch MPS tail was coming from. The result is tight, flat latency that makes 3D hand pose viable as a real-time control primitive.
|
|
58
|
+
|
|
59
|
+
### Local benchmark harness
|
|
60
|
+
|
|
61
|
+
The repository includes a local benchmark harness for route checks and local reproduction:
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
python benchmarks/bench_wilor.py --backend mlx --weights weights/wilor-mlx.safetensors --mano-npz weights/mano.npz
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
We are not using the old app-tail telemetry as the headline claim right now. The strongest current evidence is the same-harness saved-frame route above: it measures the path that actually matters for using hand pose as a real-time input primitive while keeping the PyTorch MPS comparison on the same denominator. Lower-bandwidth M2 Pro/Tahoe validation also shows MLX ahead on archived hand-positive frames, but recent macOS/Metal changes moved both backends enough that we are treating exact M2 Pro numbers as rebaseline work rather than launch headline copy.
|
|
68
|
+
|
|
69
|
+
## Install
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
git clone https://github.com/lyonsno/wilor-mlx
|
|
73
|
+
cd wilor-mlx
|
|
74
|
+
pip install -e .
|
|
75
|
+
pip install torch # needed once for first-run MANO conversion, not used after
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
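Requires macOS with Apple Silicon and Python 3.10+. MLX and the other runtime dependencies install automatically; `torch` is not one of them, so install it separately (as shown above) for the one-time MANO conversion.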
|
|
79
|
+
|
|
80
|
+
## How it works
|
|
81
|
+
|
|
82
|
+
On the first call to `WiLoR.from_pretrained()`, wilor-mlx automatically:
|
|
83
|
+
|
|
84
|
+
1. Downloads model weights from [HuggingFace](https://huggingface.co/BasinShapers/wilor-mlx) (2.4 GB, cached locally)
|
|
85
|
+
2. Downloads MANO hand model data from the [WiLoR-mini](https://huggingface.co/warmshao/WiLoR-mini) checkpoint (requires `torch` for one-time conversion)
|
|
86
|
+
3. Caches converted MANO data at `~/.cache/wilor-mlx/mano.npz`
|
|
87
|
+
|
|
88
|
+
After the first run, everything loads from cache and **torch is never used again.**
|
|
89
|
+
|
|
90
|
+
The MANO hand model is licensed separately by the Max Planck Institute. We do not redistribute MANO data — it is downloaded from the original WiLoR-mini source and converted locally on your machine. See [mano.is.tue.mpg.de](https://mano.is.tue.mpg.de/) for MANO license terms.
|
|
91
|
+
|
|
92
|
+
Float32 and int4 weight variants are available on the [model card](https://huggingface.co/BasinShapers/wilor-mlx). Both run at the same speed on Apple Silicon — at these sequence lengths (210 tokens) the model is compute-bound, not memory-bandwidth-bound, so smaller weights don't accelerate inference. Int4 is purely a deployment convenience (2.4 GB → 490 MB).
|
|
93
|
+
|
|
94
|
+
If you prefer to supply your own MANO data (e.g. obtained directly from [MPI](https://mano.is.tue.mpg.de/)), pass `mano_path=...` to `from_pretrained()`.
|
|
95
|
+
|
|
96
|
+
## Quick start
|
|
97
|
+
|
|
98
|
+
```python
|
|
99
|
+
from wilor_mlx import WiLoR
|
|
100
|
+
import mlx.core as mx
|
|
101
|
+
import numpy as np
|
|
102
|
+
|
|
103
|
+
# Load model — everything downloads and caches automatically
|
|
104
|
+
model = WiLoR.from_pretrained()
|
|
105
|
+
|
|
106
|
+
# Prepare input: a 256x256 RGB hand crop as uint8
|
|
107
|
+
# WiLoR expects a tightly cropped hand image, typically from a hand detector
|
|
108
|
+
image = np.random.randint(0, 256, (1, 256, 256, 3), dtype=np.uint8) # replace with real image
|
|
109
|
+
image_mlx = mx.array(image)
|
|
110
|
+
|
|
111
|
+
# Run inference
|
|
112
|
+
result = model(image_mlx)
|
|
113
|
+
mx.eval(result)
|
|
114
|
+
|
|
115
|
+
# Outputs
|
|
116
|
+
keypoints_3d = np.array(result['pred_keypoints_3d']) # (1, 21, 3) — 21 hand keypoints in 3D
|
|
117
|
+
vertices = np.array(result['pred_vertices']) # (1, 778, 3) — MANO mesh vertices
|
|
118
|
+
camera = np.array(result['pred_cam']) # (1, 3) — weak-perspective camera [s, tx, ty]
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
### Input format
|
|
122
|
+
|
|
123
|
+
The model expects a **256×256 RGB crop of a hand**, as a `(B, 256, 256, 3)` uint8 MLX array in NHWC layout. The model handles normalization internally (ImageNet mean/std). In a typical pipeline, a hand detector (like YOLO) first finds the hand bounding box in a full frame, then the crop is resized to 256×256 and passed to WiLoR for 3D pose estimation.
|
|
124
|
+
|
|
125
|
+
### Output format
|
|
126
|
+
|
|
127
|
+
| Key | Shape | Description |
|
|
128
|
+
|---|---|---|
|
|
129
|
+
| `pred_keypoints_3d` | (B, 21, 3) | 3D hand joint locations (OpenPose ordering) |
|
|
130
|
+
| `pred_vertices` | (B, 778, 3) | MANO mesh vertex positions |
|
|
131
|
+
| `pred_cam` | (B, 3) | Weak-perspective camera `[scale, tx, ty]` |
|
|
132
|
+
| `global_orient` | (B, 1, 3) | Global wrist rotation (axis-angle) |
|
|
133
|
+
| `hand_pose` | (B, 15, 3) | Per-finger joint rotations (axis-angle) |
|
|
134
|
+
| `betas` | (B, 10) | MANO shape parameters |
|
|
135
|
+
|
|
136
|
+
## Numerical accuracy
|
|
137
|
+
|
|
138
|
+
Compared against PyTorch WiLoR-mini on identical inputs (float32):
|
|
139
|
+
|
|
140
|
+
| Output | Max abs diff | Notes |
|
|
141
|
+
|---|---|---|
|
|
142
|
+
| pred_vertices (778×3) | 0.006 | Sub-millimeter |
|
|
143
|
+
| pred_keypoints_3d (21×3) | 0.006 | Sub-millimeter |
|
|
144
|
+
| hand_pose (15×3) | 0.06 | Axis-angle is sensitive near rotation singularities |
|
|
145
|
+
| betas (10) | 0.10 | Accumulates through 32 transformer layers |
|
|
146
|
+
|
|
147
|
+
The geometric outputs that matter for hand tracking (vertices, keypoints) match within sub-millimeter precision.
|
|
148
|
+
|
|
149
|
+
## Architecture
|
|
150
|
+
|
|
151
|
+
The port includes:
|
|
152
|
+
|
|
153
|
+
- **ViT-H/16 backbone** — 1280 embed dim, 32 transformer layers, 16 heads. Processes 192 image patches + 18 learnable tokens (pose/shape/camera).
|
|
154
|
+
- **MANO hand model** — differentiable hand mesh with Linear Blend Skinning, Rodrigues rotations, and kinematic chain. 778 vertices, 16 joints.
|
|
155
|
+
- **RefineNet** — multi-scale deconvolution pyramid that samples ViT features at projected vertex locations via bilinear grid sampling, then refines the initial MANO parameter estimates.
|
|
156
|
+
- **Weight converter** — loads PyTorch `.ckpt` files, handles Conv2d NCHW→NHWC transposition, ConvTranspose2d weight layout, and BatchNorm parameter mapping.
|
|
157
|
+
|
|
158
|
+
## License
|
|
159
|
+
|
|
160
|
+
The wilor-mlx code and distributed weight files are MIT licensed. Our weights (on [HuggingFace](https://huggingface.co/BasinShapers/wilor-mlx)) contain only ViT backbone, RefineNet, and learned embedding parameters — no MANO data is bundled or rehosted.
|
|
161
|
+
|
|
162
|
+
The [MANO hand model](https://mano.is.tue.mpg.de/) is separately licensed by the Max Planck Institute. `WiLoR.from_pretrained()` fetches upstream [WiLoR-mini](https://huggingface.co/warmshao/WiLoR-mini) assets and converts MANO data locally on your machine. If you prefer to obtain MANO directly from MPI, pass `mano_path=...` to use your own copy.
|
|
wilor_mlx-0.1.0/README.md
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
# wilor-mlx
|
|
2
|
+
|
|
3
|
+
WiLoR hand pose estimation for Apple Silicon, rebuilt end-to-end in [MLX](https://github.com/ml-explore/mlx).
|
|
4
|
+
|
|
5
|
+
A from-scratch MLX port of [WiLoR-mini](https://github.com/warmshao/WiLoR-mini) (Zhan et al., "WiLoR: End-to-end 3D hand localization and reconstruction in-the-wild") — the pose/reconstruction model including ViT backbone, MANO hand model, and RefineNet refinement stage. It expects a cropped hand image from a separate detector. First run requires `torch` for a one-time MANO conversion; after that, inference runs purely on MLX.
|
|
6
|
+
|
|
7
|
+
## Performance
|
|
8
|
+
|
|
9
|
+
Tested on Apple M4 Max, single-image inference, float32.
|
|
10
|
+
|
|
11
|
+
### Live sidecar behavior (gesture UI prototype)
|
|
12
|
+
|
|
13
|
+
The strongest launch evidence is the route we actually use for interaction: camera frame → hand crop → WiLoR-mini pose/reconstruction sidecar → hand-pose event.
|
|
14
|
+
|
|
15
|
+
In a same-harness smoke test on a freshly rebooted M4 Max, run over recent 160x120 saved frames from a gesture UI prototype, MLX runs the pose/reconstruction model stage at about 37 ms median versus 49 ms for PyTorch MPS, and the full saved-frame route at about 49 ms versus 60 ms. That is roughly a 1.3x model-stage advantage and a 1.2x full-route advantage on the fair comparison denominator we trust most right now.
|
|
16
|
+
|
|
17
|
+
Larger derived-frame stress tests widen both backends; MLX remained faster in those runs, but we treat those numbers as route/runtime stress evidence rather than the headline model benchmark.
|
|
18
|
+
|
|
19
|
+
Older app-level telemetry is what pushed us toward MLX in the first place, but clean reruns narrowed the comparison denominator enough that we are not using the old tail story as a fresh universal PyTorch-vs-MLX headline. The current public claim is narrower and stronger: WiLoR-mini now has a native MLX runtime on Apple Silicon, with live sidecar latency low enough to build interaction on.
|
|
20
|
+
|
|
21
|
+
### Why so consistent?
|
|
22
|
+
|
|
23
|
+
MLX's lazy evaluation builds a graph that can be evaluated in fewer, fused submissions. That reduces dispatch and synchronization surface area in this short-context ViT workload, which is where our traces suggest the PyTorch MPS tail was coming from. The result is tight, flat latency that makes 3D hand pose viable as a real-time control primitive.
|
|
24
|
+
|
|
25
|
+
### Local benchmark harness
|
|
26
|
+
|
|
27
|
+
The repository includes a local benchmark harness for route checks and local reproduction:
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
python benchmarks/bench_wilor.py --backend mlx --weights weights/wilor-mlx.safetensors --mano-npz weights/mano.npz
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
We are not using the old app-tail telemetry as the headline claim right now. The strongest current evidence is the same-harness saved-frame route above: it measures the path that actually matters for using hand pose as a real-time input primitive while keeping the PyTorch MPS comparison on the same denominator. Lower-bandwidth M2 Pro/Tahoe validation also shows MLX ahead on archived hand-positive frames, but recent macOS/Metal changes moved both backends enough that we are treating exact M2 Pro numbers as rebaseline work rather than launch headline copy.
|
|
34
|
+
|
|
35
|
+
## Install
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
git clone https://github.com/lyonsno/wilor-mlx
|
|
39
|
+
cd wilor-mlx
|
|
40
|
+
pip install -e .
|
|
41
|
+
pip install torch # needed once for first-run MANO conversion, not used after
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
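Requires macOS with Apple Silicon and Python 3.10+. MLX and the other runtime dependencies install automatically; `torch` is not one of them, so install it separately (as shown above) for the one-time MANO conversion.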
|
|
45
|
+
|
|
46
|
+
## How it works
|
|
47
|
+
|
|
48
|
+
On the first call to `WiLoR.from_pretrained()`, wilor-mlx automatically:
|
|
49
|
+
|
|
50
|
+
1. Downloads model weights from [HuggingFace](https://huggingface.co/BasinShapers/wilor-mlx) (2.4 GB, cached locally)
|
|
51
|
+
2. Downloads MANO hand model data from the [WiLoR-mini](https://huggingface.co/warmshao/WiLoR-mini) checkpoint (requires `torch` for one-time conversion)
|
|
52
|
+
3. Caches converted MANO data at `~/.cache/wilor-mlx/mano.npz`
|
|
53
|
+
|
|
54
|
+
After the first run, everything loads from cache and **torch is never used again.**
|
|
55
|
+
|
|
56
|
+
The MANO hand model is licensed separately by the Max Planck Institute. We do not redistribute MANO data — it is downloaded from the original WiLoR-mini source and converted locally on your machine. See [mano.is.tue.mpg.de](https://mano.is.tue.mpg.de/) for MANO license terms.
|
|
57
|
+
|
|
58
|
+
Float32 and int4 weight variants are available on the [model card](https://huggingface.co/BasinShapers/wilor-mlx). Both run at the same speed on Apple Silicon — at these sequence lengths (210 tokens) the model is compute-bound, not memory-bandwidth-bound, so smaller weights don't accelerate inference. Int4 is purely a deployment convenience (2.4 GB → 490 MB).
|
|
59
|
+
|
|
60
|
+
If you prefer to supply your own MANO data (e.g. obtained directly from [MPI](https://mano.is.tue.mpg.de/)), pass `mano_path=...` to `from_pretrained()`.
|
|
61
|
+
|
|
62
|
+
## Quick start
|
|
63
|
+
|
|
64
|
+
```python
|
|
65
|
+
from wilor_mlx import WiLoR
|
|
66
|
+
import mlx.core as mx
|
|
67
|
+
import numpy as np
|
|
68
|
+
|
|
69
|
+
# Load model — everything downloads and caches automatically
|
|
70
|
+
model = WiLoR.from_pretrained()
|
|
71
|
+
|
|
72
|
+
# Prepare input: a 256x256 RGB hand crop as uint8
|
|
73
|
+
# WiLoR expects a tightly cropped hand image, typically from a hand detector
|
|
74
|
+
image = np.random.randint(0, 256, (1, 256, 256, 3), dtype=np.uint8) # replace with real image
|
|
75
|
+
image_mlx = mx.array(image)
|
|
76
|
+
|
|
77
|
+
# Run inference
|
|
78
|
+
result = model(image_mlx)
|
|
79
|
+
mx.eval(result)
|
|
80
|
+
|
|
81
|
+
# Outputs
|
|
82
|
+
keypoints_3d = np.array(result['pred_keypoints_3d']) # (1, 21, 3) — 21 hand keypoints in 3D
|
|
83
|
+
vertices = np.array(result['pred_vertices']) # (1, 778, 3) — MANO mesh vertices
|
|
84
|
+
camera = np.array(result['pred_cam']) # (1, 3) — weak-perspective camera [s, tx, ty]
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
### Input format
|
|
88
|
+
|
|
89
|
+
The model expects a **256×256 RGB crop of a hand**, as a `(B, 256, 256, 3)` uint8 MLX array in NHWC layout. The model handles normalization internally (ImageNet mean/std). In a typical pipeline, a hand detector (like YOLO) first finds the hand bounding box in a full frame, then the crop is resized to 256×256 and passed to WiLoR for 3D pose estimation.
|
|
90
|
+
|
|
91
|
+
### Output format
|
|
92
|
+
|
|
93
|
+
| Key | Shape | Description |
|
|
94
|
+
|---|---|---|
|
|
95
|
+
| `pred_keypoints_3d` | (B, 21, 3) | 3D hand joint locations (OpenPose ordering) |
|
|
96
|
+
| `pred_vertices` | (B, 778, 3) | MANO mesh vertex positions |
|
|
97
|
+
| `pred_cam` | (B, 3) | Weak-perspective camera `[scale, tx, ty]` |
|
|
98
|
+
| `global_orient` | (B, 1, 3) | Global wrist rotation (axis-angle) |
|
|
99
|
+
| `hand_pose` | (B, 15, 3) | Per-finger joint rotations (axis-angle) |
|
|
100
|
+
| `betas` | (B, 10) | MANO shape parameters |
|
|
101
|
+
|
|
102
|
+
## Numerical accuracy
|
|
103
|
+
|
|
104
|
+
Compared against PyTorch WiLoR-mini on identical inputs (float32):
|
|
105
|
+
|
|
106
|
+
| Output | Max abs diff | Notes |
|
|
107
|
+
|---|---|---|
|
|
108
|
+
| pred_vertices (778×3) | 0.006 | Sub-millimeter |
|
|
109
|
+
| pred_keypoints_3d (21×3) | 0.006 | Sub-millimeter |
|
|
110
|
+
| hand_pose (15×3) | 0.06 | Axis-angle is sensitive near rotation singularities |
|
|
111
|
+
| betas (10) | 0.10 | Accumulates through 32 transformer layers |
|
|
112
|
+
|
|
113
|
+
The geometric outputs that matter for hand tracking (vertices, keypoints) match within sub-millimeter precision.
|
|
114
|
+
|
|
115
|
+
## Architecture
|
|
116
|
+
|
|
117
|
+
The port includes:
|
|
118
|
+
|
|
119
|
+
- **ViT-H/16 backbone** — 1280 embed dim, 32 transformer layers, 16 heads. Processes 192 image patches + 18 learnable tokens (pose/shape/camera).
|
|
120
|
+
- **MANO hand model** — differentiable hand mesh with Linear Blend Skinning, Rodrigues rotations, and kinematic chain. 778 vertices, 16 joints.
|
|
121
|
+
- **RefineNet** — multi-scale deconvolution pyramid that samples ViT features at projected vertex locations via bilinear grid sampling, then refines the initial MANO parameter estimates.
|
|
122
|
+
- **Weight converter** — loads PyTorch `.ckpt` files, handles Conv2d NCHW→NHWC transposition, ConvTranspose2d weight layout, and BatchNorm parameter mapping.
|
|
123
|
+
|
|
124
|
+
## License
|
|
125
|
+
|
|
126
|
+
The wilor-mlx code and distributed weight files are MIT licensed. Our weights (on [HuggingFace](https://huggingface.co/BasinShapers/wilor-mlx)) contain only ViT backbone, RefineNet, and learned embedding parameters — no MANO data is bundled or rehosted.
|
|
127
|
+
|
|
128
|
+
The [MANO hand model](https://mano.is.tue.mpg.de/) is separately licensed by the Max Planck Institute. `WiLoR.from_pretrained()` fetches upstream [WiLoR-mini](https://huggingface.co/warmshao/WiLoR-mini) assets and converts MANO data locally on your machine. If you prefer to obtain MANO directly from MPI, pass `mano_path=...` to use your own copy.
|
|
wilor_mlx-0.1.0/pyproject.toml
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=77.0.3"]  # 77.0.3+ is required for the SPDX string form of project.license (PEP 639)
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "wilor-mlx"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "WiLoR hand pose estimation for Apple Silicon, rebuilt end-to-end in MLX"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = "MIT"
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "Noah Lyons" },
|
|
14
|
+
]
|
|
15
|
+
keywords = ["mlx", "apple-silicon", "hand-pose", "wilor", "mano", "3d", "real-time"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 4 - Beta",
|
|
18
|
+
"Intended Audience :: Developers",
|
|
19
|
+
"Intended Audience :: Science/Research",
|
|
20
|
+
"Operating System :: MacOS",
|
|
21
|
+
"Programming Language :: Python :: 3",
|
|
22
|
+
"Programming Language :: Python :: 3.10",
|
|
23
|
+
"Programming Language :: Python :: 3.11",
|
|
24
|
+
"Programming Language :: Python :: 3.12",
|
|
25
|
+
"Programming Language :: Python :: 3.13",
|
|
26
|
+
"Programming Language :: Python :: 3.14",
|
|
27
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
28
|
+
"Topic :: Scientific/Engineering :: Image Recognition",
|
|
29
|
+
]
|
|
30
|
+
dependencies = [
|
|
31
|
+
"mlx>=0.22.0",
|
|
32
|
+
"numpy",
|
|
33
|
+
"scipy",
|
|
34
|
+
"huggingface_hub",
|
|
35
|
+
]
|
|
36
|
+
|
|
37
|
+
[project.urls]
|
|
38
|
+
Homepage = "https://github.com/lyonsno/wilor-mlx"
|
|
39
|
+
Repository = "https://github.com/lyonsno/wilor-mlx"
|
|
40
|
+
"Model Card" = "https://huggingface.co/BasinShapers/wilor-mlx"
|
|
41
|
+
|
|
42
|
+
[project.optional-dependencies]
|
|
43
|
+
dev = [
|
|
44
|
+
"pytest>=7.0",
|
|
45
|
+
"torch",  # not a runtime dependency; the README has users install it separately for the one-time first-run MANO conversion
|
|
46
|
+
]
|
|
47
|
+
|
|
48
|
+
[tool.setuptools.packages.find]
|
|
49
|
+
where = ["src"]
|