@goodganglabs/lipsync-wasm-v2 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +565 -0
- package/lipsync-wasm-wrapper.d.ts +54 -0
- package/lipsync-wasm-wrapper.js +370 -0
- package/lipsync_wasm_v2.d.ts +242 -0
- package/lipsync_wasm_v2.js +918 -0
- package/lipsync_wasm_v2_bg.wasm +0 -0
- package/package.json +27 -0
package/README.md
ADDED
@@ -0,0 +1,565 @@
# @goodganglabs/lipsync-wasm-v2

WebAssembly-based real-time audio-to-blendshape lip sync engine.
Converts 16kHz PCM audio into 52-dimensional ARKit-compatible blendshape frames at 30fps using a student distillation model.

## Which Version?

| | V2 (this package) | V1 |
|---|---|---|
| **Dimensions** | 52-dim ARKit | 111-dim ARKit |
| **Model** | Student distillation | Phoneme classification |
| **Post-processing** | Built-in (crisp mouth, fade, blinks) | Manual |
| **Idle expression** | Not included | Built-in `IdleExpressionGenerator` |
| **VAD** | Not included | Built-in `VoiceActivityDetector` |
| **ONNX fallback** | None (ONNX required) | Heuristic fallback |
| **Recommendation** | Most use cases | Full expression control needed |

## Features

- 52-dim ARKit blendshape output (direct prediction, no intermediate phoneme step)
- Batch and real-time streaming processing
- Built-in post-processing: mouth articulation enhancement, fade in/out, automatic blink injection
- Built-in expression preset blending
- Embedded VRMA bone animation data
- Runs entirely in the browser via WebAssembly

## Requirements

- **onnxruntime-web** `>=1.17.0` (peer dependency, **required** — V2 has no heuristic fallback)

```html
<script src="https://cdn.jsdelivr.net/npm/onnxruntime-web@1.17.0/dist/ort.min.js"></script>
```

## Installation

```bash
npm install @goodganglabs/lipsync-wasm-v2
```

## Quick Start

### Batch Processing

```js
import { LipSyncWasmWrapper } from '@goodganglabs/lipsync-wasm-v2';

const lipsync = new LipSyncWasmWrapper();
await lipsync.init();

const result = await lipsync.processFile(audioFile); // audioFile: a File, e.g. from an <input type="file">
for (let i = 0; i < result.frame_count; i++) {
  const frame = lipsync.getFrame(result, i); // number[52]
  applyToAvatar(frame); // your own render function (see the sketch below)
}

lipsync.dispose();
```
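
`applyToAvatar` above is a placeholder for your own rendering code. As a minimal illustration (a sketch assuming nothing beyond the index table below, where index 24 is `jawOpen`):

```js
// Hypothetical stand-in for a real renderer hookup: log mouth openness per frame.
function applyToAvatar(frame) {
  console.log('jawOpen:', frame[24].toFixed(2));
}
```

For a real avatar, see the complete Three.js VRM example below.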

### Three.js VRM Complete Example

Full integration with a VRM avatar: init, load VRM, apply blendshapes, render loop.

```html
<script type="importmap">
{ "imports": {
  "three": "https://cdn.jsdelivr.net/npm/three@0.179.1/build/three.module.js",
  "three/addons/": "https://cdn.jsdelivr.net/npm/three@0.179.1/examples/jsm/",
  "@pixiv/three-vrm": "https://cdn.jsdelivr.net/npm/@pixiv/three-vrm@3.4.5/lib/three-vrm.module.min.js",
  "@pixiv/three-vrm-animation": "https://cdn.jsdelivr.net/npm/@pixiv/three-vrm-animation@3.4.5/lib/three-vrm-animation.module.min.js"
}}
</script>
<script src="https://cdn.jsdelivr.net/npm/onnxruntime-web@1.17.0/dist/ort.min.js"></script>

<canvas id="avatar-canvas" style="width:100%; height:500px;"></canvas>

<script type="module">
import * as THREE from 'three';
import { GLTFLoader } from 'three/addons/loaders/GLTFLoader.js';
import { OrbitControls } from 'three/addons/controls/OrbitControls.js';
import { VRMLoaderPlugin, VRMUtils } from '@pixiv/three-vrm';
import { LipSyncWasmWrapper } from '@goodganglabs/lipsync-wasm-v2';

// --- Index-to-name mapping (52-dim ARKit) ---
const SYSTEM_INDEX_TO_BLENDSHAPE = {
  0: ['browDownLeft'], 1: ['browDownRight'], 2: ['browInnerUp'],
  3: ['browOuterUpLeft'], 4: ['browOuterUpRight'],
  5: ['cheekPuff'], 6: ['cheekSquintLeft'], 7: ['cheekSquintRight'],
  8: ['eyeBlinkLeft'], 9: ['eyeBlinkRight'],
  10: ['eyeLookDownLeft'], 11: ['eyeLookDownRight'],
  12: ['eyeLookInLeft'], 13: ['eyeLookInRight'],
  14: ['eyeLookOutLeft'], 15: ['eyeLookOutRight'],
  16: ['eyeLookUpLeft'], 17: ['eyeLookUpRight'],
  18: ['eyeSquintLeft'], 19: ['eyeSquintRight'],
  20: ['eyeWideLeft'], 21: ['eyeWideRight'],
  22: ['jawForward'], 23: ['jawLeft'], 24: ['jawOpen'], 25: ['jawRight'],
  26: ['mouthClose'], 27: ['mouthDimpleLeft'], 28: ['mouthDimpleRight'],
  29: ['mouthFrownLeft'], 30: ['mouthFrownRight'], 31: ['mouthFunnel'],
  32: ['mouthLeft'], 33: ['mouthLowerDownLeft'], 34: ['mouthLowerDownRight'],
  35: ['mouthPressLeft'], 36: ['mouthPressRight'], 37: ['mouthPucker'],
  38: ['mouthRight'], 39: ['mouthRollLower'], 40: ['mouthRollUpper'],
  41: ['mouthShrugLower'], 42: ['mouthShrugUpper'],
  43: ['mouthSmileLeft'], 44: ['mouthSmileRight'],
  45: ['mouthStretchLeft'], 46: ['mouthStretchRight'],
  47: ['mouthUpperUpLeft'], 48: ['mouthUpperUpRight'],
  49: ['noseSneerLeft'], 50: ['noseSneerRight'],
  51: ['tongueOut']
};

// --- Apply blendshape frame to VRM ---
function applyBlendshapes(vrm, frame) {
  if (!vrm) return;

  // VRM 1.0 (expressionManager)
  if (vrm.expressionManager) {
    for (const [idx, names] of Object.entries(SYSTEM_INDEX_TO_BLENDSHAPE)) {
      const value = frame[idx] || 0;
      for (const name of names) {
        vrm.expressionManager.setValue(name, value);
      }
    }
    return;
  }

  // VRM 0.x (blendShapeProxy)
  if (vrm.blendShapeProxy) {
    for (const [idx, names] of Object.entries(SYSTEM_INDEX_TO_BLENDSHAPE)) {
      const value = frame[idx] || 0;
      for (const name of names) {
        vrm.blendShapeProxy.setValue(name, value);
      }
    }
    vrm.blendShapeProxy.update();
    return;
  }

  // Fallback: direct morph target manipulation
  vrm.scene.traverse((child) => {
    if (!child.isMesh || !child.morphTargetDictionary || !child.morphTargetInfluences) return;
    for (const [idx, names] of Object.entries(SYSTEM_INDEX_TO_BLENDSHAPE)) {
      const value = frame[idx] || 0;
      for (const name of names) {
        const morphIdx = child.morphTargetDictionary[name];
        if (morphIdx !== undefined) {
          child.morphTargetInfluences[morphIdx] = value;
        }
      }
    }
  });
}

// --- Setup ---
const canvas = document.getElementById('avatar-canvas');
const scene = new THREE.Scene();
scene.background = new THREE.Color(0x1a1a2e);

const camera = new THREE.PerspectiveCamera(30, canvas.clientWidth / canvas.clientHeight, 0.1, 100);
camera.position.set(0, 1.25, 0.5);

const renderer = new THREE.WebGLRenderer({ canvas, antialias: true });
renderer.setSize(canvas.clientWidth, canvas.clientHeight);
renderer.setPixelRatio(Math.min(window.devicePixelRatio, 2));

const controls = new OrbitControls(camera, canvas);
controls.target.set(0, 1.25, 0);
controls.enableDamping = true;

scene.add(new THREE.AmbientLight(0xffffff, 2.0));
const dirLight = new THREE.DirectionalLight(0xffffff, 1.1);
dirLight.position.set(1, 3, 2);
scene.add(dirLight);

// --- Load VRM ---
const loader = new GLTFLoader();
loader.register((parser) => new VRMLoaderPlugin(parser));

const gltf = await new Promise((resolve, reject) =>
  loader.load('your-avatar.vrm', resolve, undefined, reject)
);
const vrm = gltf.userData.vrm;
VRMUtils.removeUnnecessaryVertices(gltf.scene);
VRMUtils.removeUnnecessaryJoints(gltf.scene);
scene.add(vrm.scene);

// --- Init LipSync ---
const lipsync = new LipSyncWasmWrapper();
await lipsync.init();

// --- Process audio & animate ---
const result = await lipsync.processFile(audioFile); // audioFile: a File object, as in Quick Start
let frameIndex = 0;
const clock = new THREE.Clock();

function animate() {
  requestAnimationFrame(animate);
  const delta = clock.getDelta();
  controls.update();

  if (frameIndex < result.frame_count) {
    const frame = lipsync.getFrame(result, frameIndex);
    applyBlendshapes(vrm, frame);
    frameIndex++;
  }

  vrm.update(delta);
  renderer.render(scene, camera);
}
animate();
</script>
```

## ARKit Blendshape Index

Full 52-element index mapping:

| Index | Name | Index | Name |
|-------|------|-------|------|
| 0 | `browDownLeft` | 26 | `mouthClose` |
| 1 | `browDownRight` | 27 | `mouthDimpleLeft` |
| 2 | `browInnerUp` | 28 | `mouthDimpleRight` |
| 3 | `browOuterUpLeft` | 29 | `mouthFrownLeft` |
| 4 | `browOuterUpRight` | 30 | `mouthFrownRight` |
| 5 | `cheekPuff` | 31 | `mouthFunnel` |
| 6 | `cheekSquintLeft` | 32 | `mouthLeft` |
| 7 | `cheekSquintRight` | 33 | `mouthLowerDownLeft` |
| 8 | `eyeBlinkLeft` | 34 | `mouthLowerDownRight` |
| 9 | `eyeBlinkRight` | 35 | `mouthPressLeft` |
| 10 | `eyeLookDownLeft` | 36 | `mouthPressRight` |
| 11 | `eyeLookDownRight` | 37 | `mouthPucker` |
| 12 | `eyeLookInLeft` | 38 | `mouthRight` |
| 13 | `eyeLookInRight` | 39 | `mouthRollLower` |
| 14 | `eyeLookOutLeft` | 40 | `mouthRollUpper` |
| 15 | `eyeLookOutRight` | 41 | `mouthShrugLower` |
| 16 | `eyeLookUpLeft` | 42 | `mouthShrugUpper` |
| 17 | `eyeLookUpRight` | 43 | `mouthSmileLeft` |
| 18 | `eyeSquintLeft` | 44 | `mouthSmileRight` |
| 19 | `eyeSquintRight` | 45 | `mouthStretchLeft` |
| 20 | `eyeWideLeft` | 46 | `mouthStretchRight` |
| 21 | `eyeWideRight` | 47 | `mouthUpperUpLeft` |
| 22 | `jawForward` | 48 | `mouthUpperUpRight` |
| 23 | `jawLeft` | 49 | `noseSneerLeft` |
| 24 | `jawOpen` | 50 | `noseSneerRight` |
| 25 | `jawRight` | 51 | `tongueOut` |
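
If it helps to address values by name, the table flattens into an ordered array (a convenience sketch; `ARKIT_NAMES` and `frameToObject` are not part of the package API):

```js
// The 52 ARKit blendshape names in index order, per the table above.
const ARKIT_NAMES = [
  'browDownLeft', 'browDownRight', 'browInnerUp', 'browOuterUpLeft', 'browOuterUpRight',
  'cheekPuff', 'cheekSquintLeft', 'cheekSquintRight', 'eyeBlinkLeft', 'eyeBlinkRight',
  'eyeLookDownLeft', 'eyeLookDownRight', 'eyeLookInLeft', 'eyeLookInRight',
  'eyeLookOutLeft', 'eyeLookOutRight', 'eyeLookUpLeft', 'eyeLookUpRight',
  'eyeSquintLeft', 'eyeSquintRight', 'eyeWideLeft', 'eyeWideRight',
  'jawForward', 'jawLeft', 'jawOpen', 'jawRight', 'mouthClose',
  'mouthDimpleLeft', 'mouthDimpleRight', 'mouthFrownLeft', 'mouthFrownRight', 'mouthFunnel',
  'mouthLeft', 'mouthLowerDownLeft', 'mouthLowerDownRight', 'mouthPressLeft', 'mouthPressRight',
  'mouthPucker', 'mouthRight', 'mouthRollLower', 'mouthRollUpper',
  'mouthShrugLower', 'mouthShrugUpper', 'mouthSmileLeft', 'mouthSmileRight',
  'mouthStretchLeft', 'mouthStretchRight', 'mouthUpperUpLeft', 'mouthUpperUpRight',
  'noseSneerLeft', 'noseSneerRight', 'tongueOut',
];

// Turn a number[52] frame into a { name: value } object, e.g. for debugging.
function frameToObject(frame) {
  return Object.fromEntries(ARKIT_NAMES.map((name, i) => [name, frame[i]]));
}
```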

## VRMA Bone Animation

The package includes embedded VRMA bone animation data for idle and speaking poses. Use these with Three.js `AnimationMixer` for natural body motion during lip sync.

```js
import { GLTFLoader } from 'three/addons/loaders/GLTFLoader.js';
import { VRMAnimationLoaderPlugin, createVRMAnimationClip } from '@pixiv/three-vrm-animation';

// 1. Get embedded VRMA bytes from the wrapper
const vrmaData = lipsync.getVrmaBytes();

// 2. Load VRMA from bytes
async function loadVRMAFromBytes(bytes) {
  const blob = new Blob([bytes], { type: 'application/octet-stream' });
  const url = URL.createObjectURL(blob);
  const loader = new GLTFLoader();
  loader.register((parser) => new VRMAnimationLoaderPlugin(parser));
  const gltf = await new Promise((resolve, reject) =>
    loader.load(url, resolve, undefined, reject)
  );
  URL.revokeObjectURL(url);
  return gltf.userData.vrmAnimations[0];
}

const idleAnim = await loadVRMAFromBytes(vrmaData.idle);
const speakingAnim = await loadVRMAFromBytes(vrmaData.speaking);

// 3. Setup AnimationMixer with crossfade
const mixer = new THREE.AnimationMixer(vrm.scene);

const idleClip = createVRMAnimationClip(idleAnim, vrm);
const speakingClip = createVRMAnimationClip(speakingAnim, vrm);

const idleAction = mixer.clipAction(idleClip);
const speakingAction = mixer.clipAction(speakingClip);

idleAction.setLoop(THREE.LoopRepeat, Infinity); // loop forever
speakingAction.setLoop(THREE.LoopRepeat, Infinity);

idleAction.setEffectiveWeight(1);
idleAction.play();
speakingAction.setEffectiveWeight(0);
speakingAction.play();

// 4. Smoothstep crossfade between idle and speaking
let crossFadeProgress = 0;
let isSpeaking = false;

function updateBoneWeights(delta) {
  const target = isSpeaking ? 1 : 0;
  const speed = 1.0 / 0.4; // 0.4s transition duration
  if (target > crossFadeProgress) {
    crossFadeProgress = Math.min(crossFadeProgress + delta * speed, 1);
  } else {
    crossFadeProgress = Math.max(crossFadeProgress - delta * speed, 0);
  }
  // Smoothstep interpolation
  const t = crossFadeProgress;
  const w = t * t * (3 - 2 * t);
  speakingAction.setEffectiveWeight(w);
  idleAction.setEffectiveWeight(1 - w);
}

// In your render loop:
// updateBoneWeights(delta);
// mixer.update(delta);
```
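
How `isSpeaking` gets toggled is application-specific. One simple heuristic (an assumption, not something the package prescribes) is to flip it based on whether blendshape frames are queued, using the `streamQueue` from the streaming example below:

```js
// In your render loop, before updateBoneWeights(delta):
isSpeaking = streamQueue.length > 0;
```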

## Real-time Streaming

### Microphone Input with AudioWorklet

```js
// 1. Get microphone stream
const stream = await navigator.mediaDevices.getUserMedia({
  audio: { sampleRate: 16000, channelCount: 1, echoCancellation: true }
});
// The sampleRate constraint is best-effort; the AudioContext option below guarantees 16kHz.
const audioCtx = new AudioContext({ sampleRate: 16000 });
const source = audioCtx.createMediaStreamSource(stream);

// 2. AudioWorklet: batch 1600 samples (100ms @ 16kHz)
const workletCode = `
class MicProcessor extends AudioWorkletProcessor {
  constructor() {
    super();
    this.buffer = [];
    this.bufferLen = 0;
    this.TARGET = 1600; // 100ms @ 16kHz
  }
  process(inputs) {
    const input = inputs[0];
    if (input.length > 0 && input[0].length > 0) {
      this.buffer.push(new Float32Array(input[0]));
      this.bufferLen += input[0].length;
      if (this.bufferLen >= this.TARGET) {
        const merged = new Float32Array(this.bufferLen);
        let off = 0;
        for (const buf of this.buffer) { merged.set(buf, off); off += buf.length; }
        this.port.postMessage(merged);
        this.buffer = [];
        this.bufferLen = 0;
      }
    }
    return true;
  }
}
registerProcessor('mic-processor', MicProcessor);
`;
const blob = new Blob([workletCode], { type: 'application/javascript' });
const url = URL.createObjectURL(blob);
await audioCtx.audioWorklet.addModule(url);
URL.revokeObjectURL(url);

const workletNode = new AudioWorkletNode(audioCtx, 'mic-processor');
source.connect(workletNode);
workletNode.connect(audioCtx.destination); // keeps the node pulled; MicProcessor outputs silence

// 3. Frame queue + processing
const streamQueue = [];
let micProcessing = false;
const micBuffer = [];

workletNode.port.onmessage = (e) => {
  micBuffer.push(e.data);
  if (!micProcessing) processMicBuffer();
};

async function processMicBuffer() {
  if (micBuffer.length === 0) return;
  micProcessing = true;
  try {
    const chunks = micBuffer.splice(0);
    let totalLen = 0;
    for (const c of chunks) totalLen += c.length;
    const audio = new Float32Array(totalLen);
    let offset = 0;
    for (const c of chunks) { audio.set(c, offset); offset += c.length; }

    const result = await lipsync.processAudioChunk(audio);
    if (result && result.frame_count > 0) {
      for (let i = 0; i < result.frame_count; i++) {
        streamQueue.push(lipsync.getFrame(result, i));
      }
    }
  } finally {
    micProcessing = false;
    if (micBuffer.length > 0) processMicBuffer();
  }
}

// 4. Consume at 30fps in render loop
let streamTimeAccum = 0;
const frameInterval = 1.0 / 30.0;

function renderLoop() {
  requestAnimationFrame(renderLoop);
  const delta = clock.getDelta();

  streamTimeAccum += delta;
  while (streamTimeAccum >= frameInterval) {
    streamTimeAccum -= frameInterval;
    if (streamQueue.length > 0) {
      const frame = streamQueue.shift();
      applyBlendshapes(vrm, frame);
    }
  }

  vrm.update(delta);
  renderer.render(scene, camera);
}
renderLoop();
```
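
When capture ends, tear down the audio graph and clear engine state (a sketch; `reset()` is documented under API Reference as ending any active streaming session):

```js
// Stop the microphone, close the audio graph, and end the streaming session.
stream.getTracks().forEach((track) => track.stop());
await audioCtx.close();
lipsync.reset();
```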

### TTS Streaming Integration

When processing TTS audio chunks, yield to the main thread periodically to prevent render freezes:

```js
async function processTTSChunks(chunks) {
  for (let i = 0; i < chunks.length; i++) {
    const result = await lipsync.processAudioChunk(
      chunks[i],
      i === chunks.length - 1 // isLast on final chunk
    );
    if (result && result.frame_count > 0) {
      for (let j = 0; j < result.frame_count; j++) {
        streamQueue.push(lipsync.getFrame(result, j));
      }
    }
    // Yield to main thread every 3 chunks (~300ms) to keep rAF rendering smooth
    if ((i + 1) % 3 === 0) {
      await new Promise((resolve) => setTimeout(resolve, 0));
    }
  }
}
```
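
TTS services typically deliver 16-bit integer PCM, while `processAudioChunk` expects 16kHz mono `Float32Array` input. A conversion sketch (assuming the payload is already 16kHz mono PCM16):

```js
// Convert a 16-bit PCM payload (e.g. a TTS websocket message) to Float32Array in [-1, 1].
function pcm16ToFloat32(arrayBuffer) {
  const int16 = new Int16Array(arrayBuffer);
  const float32 = new Float32Array(int16.length);
  for (let i = 0; i < int16.length; i++) {
    float32[i] = int16[i] / 32768;
  }
  return float32;
}
```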

## API Reference

### Constructor

```ts
new LipSyncWasmWrapper(options?: { wasmPath?: string })
```

| Parameter | Type | Default | Description |
|-----------|------|---------|-------------|
| `wasmPath` | `string` | `'./lipsync_wasm_v2.js'` | Path to the WASM glue module |

### Properties

| Property | Type | Description |
|----------|------|-------------|
| `ready` | `boolean` | `true` after `init()` completes |
| `modelVersion` | `string` | `'v2'` |
| `blendshapeDim` | `number` | `52` |

### `init(options?): Promise<{ mode: string }>`

Initializes the WASM runtime, loads the ONNX model, and applies the expression preset.

| Option | Type | Default | Description |
|--------|------|---------|-------------|
| `licenseKey` | `string` | — | GoodGangLabs license key (e.g. `"ggl_xxx"`). Omit for the 30-day free trial. |
| `onProgress` | `(stage: string, percent: number) => void` | — | Progress callback. Stages: `'wasm'`, `'decrypt'`, `'onnx'` |
| `preset` | `boolean \| string` | `true` | `true` loads the built-in preset. Pass a URL string to load a custom preset JSON. `false` disables preset loading. |

Returns `{ mode: 'v2-onnx' }`. Throws if ONNX Runtime is not available.
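
For example (stage names per the table above):

```js
const { mode } = await lipsync.init({
  onProgress: (stage, percent) => console.log(`[lipsync] ${stage}: ${percent}%`),
  preset: true, // or a URL string pointing at a custom preset JSON
});
console.log(mode); // 'v2-onnx'
```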

### `processAudio(audio: Float32Array): Promise<ProcessResult>`

Processes a complete 16kHz mono PCM audio buffer.
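
A minimal call, using one second of 16kHz silence just to show the shapes involved:

```js
const pcm = new Float32Array(16000); // 1 second of 16kHz mono PCM
const result = await lipsync.processAudio(pcm);
console.log(result.frame_count); // ~30 frames for 1s of audio at 30fps
```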

### `processAudioBuffer(audioBuffer: AudioBuffer): Promise<ProcessResult>`

Processes a Web Audio API `AudioBuffer` (automatically resampled to 16kHz).
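
For example, decoding a fetched file with the Web Audio API (the URL is illustrative):

```js
const audioCtx = new AudioContext();
const response = await fetch('/audio/speech.wav'); // hypothetical asset
const audioBuffer = await audioCtx.decodeAudioData(await response.arrayBuffer());
const result = await lipsync.processAudioBuffer(audioBuffer); // resampled to 16kHz internally
```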

### `processFile(file: File): Promise<ProcessResult>`

Decodes and processes an audio `File` object.

### `processAudioChunk(chunk: Float32Array, isLast?: boolean): Promise<ProcessResult | null>`

Feeds an audio chunk for real-time streaming. Streaming sessions are managed internally — the first call starts a session, and passing `isLast = true` ends it. Returns `null` if the internal buffer has not accumulated enough data.

### `getFrame(result: ProcessResult, frameIndex: number): number[]`

Extracts a single blendshape frame from a `ProcessResult`. Returns `number[52]`.
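
Given the flat `ProcessResult` layout (see below), `getFrame(result, i)` is equivalent to slicing the flat array yourself:

```js
// Same 52 values as lipsync.getFrame(result, i):
const i = 0; // any frame index < result.frame_count
const frame = result.blendshapes.slice(i * 52, (i + 1) * 52);
```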

### `getVrmaBytes(): { idle: Uint8Array, speaking: Uint8Array }`

Returns embedded VRMA bone animation data for idle and speaking states.

### `reset(): void`

Resets internal state and ends any active streaming session.

### `dispose(): void`

Releases all WASM and ONNX resources.

### ProcessResult

```ts
{
  blendshapes: number[]; // Flat array: frame_count * 52 values
  frame_count: number;   // Number of output frames (30fps)
  fps: number;           // Output frame rate (30)
  mode: string;          // 'v2-onnx' | 'v2-streaming-onnx'
}
```

## Bundler Setup

### Vite

Works out of the box. No additional configuration needed.

### Webpack

Enable async WebAssembly support:

```js
// webpack.config.js
module.exports = {
  experiments: {
    asyncWebAssembly: true,
  },
};
```

### CDN (no bundler)

Use `<script type="module">` with an import map:

```html
<script type="importmap">
{ "imports": {
  "@goodganglabs/lipsync-wasm-v2": "https://your-cdn.com/lipsync-wasm-v2/lipsync-wasm-wrapper.js"
}}
</script>
<script src="https://cdn.jsdelivr.net/npm/onnxruntime-web@1.17.0/dist/ort.min.js"></script>
<script type="module">
import { LipSyncWasmWrapper } from '@goodganglabs/lipsync-wasm-v2';
// ... your code
</script>
```

When hosting the WASM files on a different path from the wrapper JS, use the `wasmPath` option:

```js
const lipsync = new LipSyncWasmWrapper({
  wasmPath: '/static/wasm/lipsync_wasm_v2.js'
});
```

## Deployment

`.wasm` files must be served with the `application/wasm` MIME type.
CORS headers are required for cross-origin usage.
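
For example, with a Node/Express static server (a sketch; Express is not part of this package):

```js
import express from 'express';

const app = express();
app.use(express.static('public', {
  setHeaders(res, filePath) {
    // .wasm needs the correct MIME type for WebAssembly streaming compilation.
    if (filePath.endsWith('.wasm')) res.setHeader('Content-Type', 'application/wasm');
    // Allow cross-origin loading; restrict the origin list in production.
    res.setHeader('Access-Control-Allow-Origin', '*');
  },
}));
app.listen(8080);
```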

## License

Proprietary — GoodGang Labs

package/lipsync-wasm-wrapper.d.ts
ADDED
@@ -0,0 +1,54 @@
/**
 * @goodganglabs/lipsync-wasm-v2
 * Audio-to-blendshape lip sync engine (52-dim ARKit, student model)
 */

export interface ProcessResult {
  blendshapes: number[];
  frame_count: number;
  fps: number;
  mode?: string;
}

export interface InitResult {
  mode: 'v2-onnx';
}

export interface VrmaBytes {
  idle: Uint8Array;
  speaking: Uint8Array;
}

export interface InitOptions {
  /** GoodGangLabs license key (e.g. "ggl_xxx"). Omit for 30-day free trial. */
  licenseKey?: string;
  onProgress?: (stage: string, percent: number) => void;
  preset?: boolean | string;
}

export interface ConstructorOptions {
  wasmPath?: string;
}

export class LipSyncWasmWrapper {
  constructor(options?: ConstructorOptions);

  readonly ready: boolean;
  readonly modelVersion: 'v2';
  readonly blendshapeDim: 52;
  readonly wasmModule: any;

  init(options?: InitOptions): Promise<InitResult>;

  processAudio(audio: Float32Array): Promise<ProcessResult>;
  processAudioBuffer(audioBuffer: AudioBuffer): Promise<ProcessResult>;
  processFile(file: File): Promise<ProcessResult>;

  processAudioChunk(audioChunk: Float32Array, isLast?: boolean): Promise<ProcessResult | null>;

  getFrame(result: ProcessResult, frameIndex: number): number[];

  getVrmaBytes(): VrmaBytes;
  reset(): void;
  dispose(): void;
}