@goodganglabs/lipsync-wasm-v1 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +606 -0
- package/lipsync-wasm-wrapper.d.ts +55 -0
- package/lipsync-wasm-wrapper.js +377 -0
- package/lipsync_wasm_v1.d.ts +302 -0
- package/lipsync_wasm_v1.js +1083 -0
- package/lipsync_wasm_v1_bg.wasm +0 -0
- package/package.json +27 -0
package/README.md
ADDED
|
@@ -0,0 +1,606 @@
|
|
|
1
|
+
# @goodganglabs/lipsync-wasm-v1
|
|
2
|
+
|
|
3
|
+
WebAssembly-based real-time audio-to-blendshape lip sync engine.
|
|
4
|
+
Converts 16kHz PCM audio into 111-dimensional ARKit-compatible blendshape frames at 30fps using a phoneme classification model.
|
|
5
|
+
|
|
6
|
+
## Which Version?
|
|
7
|
+
|
|
8
|
+
| | V1 (this package) | V2 |
|
|
9
|
+
|---|---|---|
|
|
10
|
+
| **Dimensions** | 111-dim ARKit | 52-dim ARKit |
|
|
11
|
+
| **Model** | Phoneme classification | Student distillation |
|
|
12
|
+
| **Idle expression** | Built-in `IdleExpressionGenerator` | Not included |
|
|
13
|
+
| **VAD** | Built-in `VoiceActivityDetector` | Not included |
|
|
14
|
+
| **ONNX fallback** | Heuristic fallback | None (ONNX required) |
|
|
15
|
+
| **Post-processing** | Manual | Built-in (crisp mouth, fade, blinks) |
|
|
16
|
+
| **Recommendation** | Full expression control needed | Most use cases |
|
|
17
|
+
|
|
18
|
+
## Features
|
|
19
|
+
|
|
20
|
+
- 111-dim ARKit blendshape output (phoneme-based model)
|
|
21
|
+
- Batch and real-time streaming processing
|
|
22
|
+
- Built-in expression preset blending
|
|
23
|
+
- Embedded VRMA bone animation data
|
|
24
|
+
- Built-in idle expression generator (eye blinks + micro expressions)
|
|
25
|
+
- Built-in voice activity detection (VAD) with auto-calibration
|
|
26
|
+
- ONNX Runtime inference with automatic heuristic fallback
|
|
27
|
+
- Runs entirely in the browser via WebAssembly
|
|
28
|
+
|
|
29
|
+
## Requirements
|
|
30
|
+
|
|
31
|
+
- **onnxruntime-web** `>=1.17.0` (peer dependency)
|
|
32
|
+
|
|
33
|
+
```html
|
|
34
|
+
<script src="https://cdn.jsdelivr.net/npm/onnxruntime-web@1.17.0/dist/ort.min.js"></script>
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Installation
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
npm install @goodganglabs/lipsync-wasm-v1
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
## Quick Start
|
|
44
|
+
|
|
45
|
+
### Batch Processing
|
|
46
|
+
|
|
47
|
+
```js
|
|
48
|
+
import { LipSyncWasmWrapper } from '@goodganglabs/lipsync-wasm-v1';
|
|
49
|
+
|
|
50
|
+
const lipsync = new LipSyncWasmWrapper();
|
|
51
|
+
await lipsync.init();
|
|
52
|
+
|
|
53
|
+
const result = await lipsync.processFile(audioFile);
|
|
54
|
+
for (let i = 0; i < result.frame_count; i++) {
|
|
55
|
+
const frame = lipsync.getFrame(result, i); // number[111]
|
|
56
|
+
applyToAvatar(frame);
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
lipsync.dispose();
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
### Three.js VRM Complete Example
|
|
63
|
+
|
|
64
|
+
Full integration with a VRM avatar: init, load VRM, apply blendshapes, render loop.
|
|
65
|
+
|
|
66
|
+
```html
|
|
67
|
+
<script type="importmap">
|
|
68
|
+
{ "imports": {
|
|
69
|
+
"three": "https://cdn.jsdelivr.net/npm/three@0.179.1/build/three.module.js",
|
|
70
|
+
"three/addons/": "https://cdn.jsdelivr.net/npm/three@0.179.1/examples/jsm/",
|
|
71
|
+
"@pixiv/three-vrm": "https://cdn.jsdelivr.net/npm/@pixiv/three-vrm@3.4.5/lib/three-vrm.module.min.js",
|
|
72
|
+
"@pixiv/three-vrm-animation": "https://cdn.jsdelivr.net/npm/@pixiv/three-vrm-animation@3.4.5/lib/three-vrm-animation.module.min.js"
|
|
73
|
+
}}
|
|
74
|
+
</script>
|
|
75
|
+
<script src="https://cdn.jsdelivr.net/npm/onnxruntime-web@1.17.0/dist/ort.min.js"></script>
|
|
76
|
+
|
|
77
|
+
<canvas id="avatar-canvas" style="width:100%; height:500px;"></canvas>
|
|
78
|
+
|
|
79
|
+
<script type="module">
|
|
80
|
+
import * as THREE from 'three';
|
|
81
|
+
import { GLTFLoader } from 'three/addons/loaders/GLTFLoader.js';
|
|
82
|
+
import { OrbitControls } from 'three/addons/controls/OrbitControls.js';
|
|
83
|
+
import { VRMLoaderPlugin, VRMUtils } from '@pixiv/three-vrm';
|
|
84
|
+
import { LipSyncWasmWrapper } from '@goodganglabs/lipsync-wasm-v1';
|
|
85
|
+
|
|
86
|
+
// --- Index-to-name mapping (first 52 of 111-dim ARKit) ---
|
|
87
|
+
// V1 outputs 111 dimensions. The first 52 match the standard ARKit set.
|
|
88
|
+
const SYSTEM_INDEX_TO_BLENDSHAPE = {
|
|
89
|
+
0: ['browDownLeft'], 1: ['browDownRight'], 2: ['browInnerUp'],
|
|
90
|
+
3: ['browOuterUpLeft'], 4: ['browOuterUpRight'],
|
|
91
|
+
5: ['cheekPuff'], 6: ['cheekSquintLeft'], 7: ['cheekSquintRight'],
|
|
92
|
+
8: ['eyeBlinkLeft'], 9: ['eyeBlinkRight'],
|
|
93
|
+
10: ['eyeLookDownLeft'], 11: ['eyeLookDownRight'],
|
|
94
|
+
12: ['eyeLookInLeft'], 13: ['eyeLookInRight'],
|
|
95
|
+
14: ['eyeLookOutLeft'], 15: ['eyeLookOutRight'],
|
|
96
|
+
16: ['eyeLookUpLeft'], 17: ['eyeLookUpRight'],
|
|
97
|
+
18: ['eyeSquintLeft'], 19: ['eyeSquintRight'],
|
|
98
|
+
20: ['eyeWideLeft'], 21: ['eyeWideRight'],
|
|
99
|
+
22: ['jawForward'], 23: ['jawLeft'], 24: ['jawOpen'], 25: ['jawRight'],
|
|
100
|
+
26: ['mouthClose'], 27: ['mouthDimpleLeft'], 28: ['mouthDimpleRight'],
|
|
101
|
+
29: ['mouthFrownLeft'], 30: ['mouthFrownRight'], 31: ['mouthFunnel'],
|
|
102
|
+
32: ['mouthLeft'], 33: ['mouthLowerDownLeft'], 34: ['mouthLowerDownRight'],
|
|
103
|
+
35: ['mouthPressLeft'], 36: ['mouthPressRight'], 37: ['mouthPucker'],
|
|
104
|
+
38: ['mouthRight'], 39: ['mouthRollLower'], 40: ['mouthRollUpper'],
|
|
105
|
+
41: ['mouthShrugLower'], 42: ['mouthShrugUpper'],
|
|
106
|
+
43: ['mouthSmileLeft'], 44: ['mouthSmileRight'],
|
|
107
|
+
45: ['mouthStretchLeft'], 46: ['mouthStretchRight'],
|
|
108
|
+
47: ['mouthUpperUpLeft'], 48: ['mouthUpperUpRight'],
|
|
109
|
+
49: ['noseSneerLeft'], 50: ['noseSneerRight'],
|
|
110
|
+
51: ['tongueOut']
|
|
111
|
+
};
|
|
112
|
+
|
|
113
|
+
// --- Apply blendshape frame to VRM ---
|
|
114
|
+
function applyBlendshapes(vrm, frame) {
|
|
115
|
+
if (!vrm) return;
|
|
116
|
+
|
|
117
|
+
// VRM 1.0 (expressionManager)
|
|
118
|
+
if (vrm.expressionManager) {
|
|
119
|
+
for (const [idx, names] of Object.entries(SYSTEM_INDEX_TO_BLENDSHAPE)) {
|
|
120
|
+
const value = frame[idx] || 0;
|
|
121
|
+
for (const name of names) {
|
|
122
|
+
vrm.expressionManager.setValue(name, value);
|
|
123
|
+
}
|
|
124
|
+
}
|
|
125
|
+
return;
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
// VRM 0.x (blendShapeProxy)
|
|
129
|
+
if (vrm.blendShapeProxy) {
|
|
130
|
+
for (const [idx, names] of Object.entries(SYSTEM_INDEX_TO_BLENDSHAPE)) {
|
|
131
|
+
const value = frame[idx] || 0;
|
|
132
|
+
for (const name of names) {
|
|
133
|
+
vrm.blendShapeProxy.setValue(name, value);
|
|
134
|
+
}
|
|
135
|
+
}
|
|
136
|
+
vrm.blendShapeProxy.update();
|
|
137
|
+
return;
|
|
138
|
+
}
|
|
139
|
+
|
|
140
|
+
// Fallback: direct morph target manipulation
|
|
141
|
+
vrm.scene.traverse((child) => {
|
|
142
|
+
if (!child.isMesh || !child.morphTargetDictionary || !child.morphTargetInfluences) return;
|
|
143
|
+
for (const [idx, names] of Object.entries(SYSTEM_INDEX_TO_BLENDSHAPE)) {
|
|
144
|
+
const value = frame[idx] || 0;
|
|
145
|
+
for (const name of names) {
|
|
146
|
+
const morphIdx = child.morphTargetDictionary[name];
|
|
147
|
+
if (morphIdx !== undefined) {
|
|
148
|
+
child.morphTargetInfluences[morphIdx] = value;
|
|
149
|
+
}
|
|
150
|
+
}
|
|
151
|
+
}
|
|
152
|
+
});
|
|
153
|
+
}
|
|
154
|
+
|
|
155
|
+
// --- Setup ---
|
|
156
|
+
const canvas = document.getElementById('avatar-canvas');
|
|
157
|
+
const scene = new THREE.Scene();
|
|
158
|
+
scene.background = new THREE.Color(0x1a1a2e);
|
|
159
|
+
|
|
160
|
+
const camera = new THREE.PerspectiveCamera(30, canvas.clientWidth / canvas.clientHeight, 0.1, 100);
|
|
161
|
+
camera.position.set(0, 1.25, 0.5);
|
|
162
|
+
|
|
163
|
+
const renderer = new THREE.WebGLRenderer({ canvas, antialias: true });
|
|
164
|
+
renderer.setSize(canvas.clientWidth, canvas.clientHeight);
|
|
165
|
+
renderer.setPixelRatio(Math.min(window.devicePixelRatio, 2));
|
|
166
|
+
|
|
167
|
+
const controls = new OrbitControls(camera, canvas);
|
|
168
|
+
controls.target.set(0, 1.25, 0);
|
|
169
|
+
controls.enableDamping = true;
|
|
170
|
+
|
|
171
|
+
scene.add(new THREE.AmbientLight(0xffffff, 2.0));
|
|
172
|
+
const dirLight = new THREE.DirectionalLight(0xffffff, 1.1);
|
|
173
|
+
dirLight.position.set(1, 3, 2);
|
|
174
|
+
scene.add(dirLight);
|
|
175
|
+
|
|
176
|
+
// --- Load VRM ---
|
|
177
|
+
const loader = new GLTFLoader();
|
|
178
|
+
loader.register((parser) => new VRMLoaderPlugin(parser));
|
|
179
|
+
|
|
180
|
+
const gltf = await new Promise((resolve, reject) =>
|
|
181
|
+
loader.load('your-avatar.vrm', resolve, undefined, reject)
|
|
182
|
+
);
|
|
183
|
+
const vrm = gltf.userData.vrm;
|
|
184
|
+
VRMUtils.removeUnnecessaryVertices(gltf.scene);
|
|
185
|
+
VRMUtils.removeUnnecessaryJoints(gltf.scene);
|
|
186
|
+
scene.add(vrm.scene);
|
|
187
|
+
|
|
188
|
+
// --- Init LipSync ---
|
|
189
|
+
const lipsync = new LipSyncWasmWrapper();
|
|
190
|
+
await lipsync.init();
|
|
191
|
+
|
|
192
|
+
// --- Process audio & animate ---
|
|
193
|
+
const result = await lipsync.processFile(audioFile);
|
|
194
|
+
let frameIndex = 0;
|
|
195
|
+
const clock = new THREE.Clock();
|
|
196
|
+
|
|
197
|
+
function animate() {
|
|
198
|
+
requestAnimationFrame(animate);
|
|
199
|
+
const delta = clock.getDelta();
|
|
200
|
+
controls.update();
|
|
201
|
+
|
|
202
|
+
if (frameIndex < result.frame_count) {
|
|
203
|
+
const frame = lipsync.getFrame(result, frameIndex);
|
|
204
|
+
applyBlendshapes(vrm, frame);
|
|
205
|
+
frameIndex++;
|
|
206
|
+
}
|
|
207
|
+
|
|
208
|
+
vrm.update(delta);
|
|
209
|
+
renderer.render(scene, camera);
|
|
210
|
+
}
|
|
211
|
+
animate();
|
|
212
|
+
</script>
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
## IdleExpressionGenerator
|
|
216
|
+
|
|
217
|
+
V1 includes a procedural idle expression generator that produces natural eye blinks (random interval 2.5-4.5s, 15% double-blink chance) and micro facial expressions (sinusoidal).
|
|
218
|
+
|
|
219
|
+
```js
|
|
220
|
+
const lipsync = new LipSyncWasmWrapper();
|
|
221
|
+
await lipsync.init();
|
|
222
|
+
|
|
223
|
+
// Access via the underlying WASM module
|
|
224
|
+
const idle = new lipsync.wasmModule.IdleExpressionGenerator();
|
|
225
|
+
let elapsedSeconds = 0;
|
|
226
|
+
|
|
227
|
+
function renderLoop() {
|
|
228
|
+
requestAnimationFrame(renderLoop);
|
|
229
|
+
const delta = clock.getDelta();
|
|
230
|
+
elapsedSeconds += delta;
|
|
231
|
+
|
|
232
|
+
// Generate procedural idle frame (number[111])
|
|
233
|
+
const frame = idle.get_frame(elapsedSeconds);
|
|
234
|
+
applyBlendshapes(vrm, frame);
|
|
235
|
+
|
|
236
|
+
vrm.update(delta);
|
|
237
|
+
renderer.render(scene, camera);
|
|
238
|
+
}
|
|
239
|
+
```
|
|
240
|
+
|
|
241
|
+
Use idle expressions when no audio is playing. Transition smoothly from lip sync to idle:
|
|
242
|
+
|
|
243
|
+
```js
|
|
244
|
+
// In render loop: lerp from last lip sync frame to idle
|
|
245
|
+
if (!isPlaying && prevFrame) {
|
|
246
|
+
const idleFrame = idle.get_frame(elapsedSeconds);
|
|
247
|
+
const blended = prevFrame.map((v, i) =>
|
|
248
|
+
v + 0.15 * ((idleFrame[i] || 0) - v) // alpha=0.15 for smooth transition
|
|
249
|
+
);
|
|
250
|
+
applyBlendshapes(vrm, blended);
|
|
251
|
+
prevFrame = blended;
|
|
252
|
+
}
|
|
253
|
+
```
|
|
254
|
+
|
|
255
|
+
## Voice Activity Detection (VAD)
|
|
256
|
+
|
|
257
|
+
V1 includes a built-in VAD that auto-calibrates from ambient noise. Use it to transition between idle and speaking bone animations.
|
|
258
|
+
|
|
259
|
+
```js
|
|
260
|
+
const lipsync = new LipSyncWasmWrapper();
|
|
261
|
+
await lipsync.init();
|
|
262
|
+
|
|
263
|
+
// --- Step 1: Calibrate from 1 second of ambient noise ---
|
|
264
|
+
const calibrationSamples = []; // collect RMS values
|
|
265
|
+
const calibrationStart = performance.now();
|
|
266
|
+
|
|
267
|
+
function collectCalibration(audioChunk) {
|
|
268
|
+
let sumSq = 0;
|
|
269
|
+
for (let i = 0; i < audioChunk.length; i++) {
|
|
270
|
+
sumSq += audioChunk[i] * audioChunk[i];
|
|
271
|
+
}
|
|
272
|
+
const rms = Math.sqrt(sumSq / audioChunk.length);
|
|
273
|
+
calibrationSamples.push(rms);
|
|
274
|
+
}
|
|
275
|
+
|
|
276
|
+
// After 1 second of collecting samples:
|
|
277
|
+
function finalizeCalibration() {
|
|
278
|
+
const mean = calibrationSamples.reduce((a, b) => a + b, 0) / calibrationSamples.length;
|
|
279
|
+
const variance = calibrationSamples.reduce((a, b) => a + (b - mean) ** 2, 0) / calibrationSamples.length;
|
|
280
|
+
const stdDev = Math.sqrt(variance);
|
|
281
|
+
|
|
282
|
+
const threshold = Math.max(mean + 2 * stdDev, 0.005);
|
|
283
|
+
const holdTime = 0.5; // seconds to hold "speaking" state after voice drops
|
|
284
|
+
|
|
285
|
+
// Create VAD via WASM module
|
|
286
|
+
const vad = new lipsync.wasmModule.VoiceActivityDetector(threshold, holdTime);
|
|
287
|
+
return vad;
|
|
288
|
+
}
|
|
289
|
+
|
|
290
|
+
// --- Step 2: Use VAD in streaming loop ---
|
|
291
|
+
let vad = null; // set after calibration
|
|
292
|
+
|
|
293
|
+
function processMicChunk(audio) {
|
|
294
|
+
if (vad) {
|
|
295
|
+
const isSpeaking = vad.feed_audio(audio);
|
|
296
|
+
// Use isSpeaking to transition bone animations:
|
|
297
|
+
// isSpeaking=true → crossfade to speaking pose
|
|
298
|
+
// isSpeaking=false → crossfade to idle pose
|
|
299
|
+
}
|
|
300
|
+
}
|
|
301
|
+
```
|
|
302
|
+
|
|
303
|
+
## VRMA Bone Animation
|
|
304
|
+
|
|
305
|
+
The package includes embedded VRMA bone animation data for idle and speaking poses. Use these with Three.js `AnimationMixer` for natural body motion during lip sync.
|
|
306
|
+
|
|
307
|
+
```js
|
|
308
|
+
import { GLTFLoader } from 'three/addons/loaders/GLTFLoader.js';
|
|
309
|
+
import { VRMAnimationLoaderPlugin, createVRMAnimationClip } from '@pixiv/three-vrm-animation';
|
|
310
|
+
|
|
311
|
+
// 1. Get embedded VRMA bytes from the wrapper
|
|
312
|
+
const vrmaData = lipsync.getVrmaBytes();
|
|
313
|
+
|
|
314
|
+
// 2. Load VRMA from bytes
|
|
315
|
+
async function loadVRMAFromBytes(bytes) {
|
|
316
|
+
const blob = new Blob([bytes], { type: 'application/octet-stream' });
|
|
317
|
+
const url = URL.createObjectURL(blob);
|
|
318
|
+
const loader = new GLTFLoader();
|
|
319
|
+
loader.register((parser) => new VRMAnimationLoaderPlugin(parser));
|
|
320
|
+
const gltf = await new Promise((resolve, reject) =>
|
|
321
|
+
loader.load(url, resolve, undefined, reject)
|
|
322
|
+
);
|
|
323
|
+
URL.revokeObjectURL(url);
|
|
324
|
+
return gltf.userData.vrmAnimations[0];
|
|
325
|
+
}
|
|
326
|
+
|
|
327
|
+
const idleAnim = await loadVRMAFromBytes(vrmaData.idle);
|
|
328
|
+
const speakingAnim = await loadVRMAFromBytes(vrmaData.speaking);
|
|
329
|
+
|
|
330
|
+
// 3. Setup AnimationMixer with crossfade
|
|
331
|
+
const mixer = new THREE.AnimationMixer(vrm.scene);
|
|
332
|
+
|
|
333
|
+
const idleClip = createVRMAnimationClip(idleAnim, vrm);
|
|
334
|
+
const speakingClip = createVRMAnimationClip(speakingAnim, vrm);
|
|
335
|
+
|
|
336
|
+
const idleAction = mixer.clipAction(idleClip);
|
|
337
|
+
const speakingAction = mixer.clipAction(speakingClip);
|
|
338
|
+
|
|
339
|
+
idleAction.setLoop(THREE.LoopRepeat);
|
|
340
|
+
speakingAction.setLoop(THREE.LoopRepeat);
|
|
341
|
+
|
|
342
|
+
idleAction.setEffectiveWeight(1);
|
|
343
|
+
idleAction.play();
|
|
344
|
+
speakingAction.setEffectiveWeight(0);
|
|
345
|
+
speakingAction.play();
|
|
346
|
+
|
|
347
|
+
// 4. Smoothstep crossfade between idle and speaking
|
|
348
|
+
let crossFadeProgress = 0;
|
|
349
|
+
let isSpeaking = false;
|
|
350
|
+
|
|
351
|
+
function updateBoneWeights(delta) {
|
|
352
|
+
const target = isSpeaking ? 1 : 0;
|
|
353
|
+
const speed = 1.0 / 0.4; // 0.4s transition duration
|
|
354
|
+
if (target > crossFadeProgress) {
|
|
355
|
+
crossFadeProgress = Math.min(crossFadeProgress + delta * speed, 1);
|
|
356
|
+
} else {
|
|
357
|
+
crossFadeProgress = Math.max(crossFadeProgress - delta * speed, 0);
|
|
358
|
+
}
|
|
359
|
+
// Smoothstep interpolation
|
|
360
|
+
const t = crossFadeProgress;
|
|
361
|
+
const w = t * t * (3 - 2 * t);
|
|
362
|
+
speakingAction.setEffectiveWeight(w);
|
|
363
|
+
idleAction.setEffectiveWeight(1 - w);
|
|
364
|
+
}
|
|
365
|
+
|
|
366
|
+
// In your render loop:
|
|
367
|
+
// updateBoneWeights(delta);
|
|
368
|
+
// mixer.update(delta);
|
|
369
|
+
```
|
|
370
|
+
|
|
371
|
+
## Real-time Streaming
|
|
372
|
+
|
|
373
|
+
### Microphone Input with AudioWorklet
|
|
374
|
+
|
|
375
|
+
```js
|
|
376
|
+
// 1. Get microphone stream
|
|
377
|
+
const stream = await navigator.mediaDevices.getUserMedia({
|
|
378
|
+
audio: { sampleRate: 16000, channelCount: 1, echoCancellation: true }
|
|
379
|
+
});
|
|
380
|
+
const audioCtx = new AudioContext({ sampleRate: 16000 });
|
|
381
|
+
const source = audioCtx.createMediaStreamSource(stream);
|
|
382
|
+
|
|
383
|
+
// 2. AudioWorklet: batch 1600 samples (100ms @ 16kHz)
|
|
384
|
+
const workletCode = `
|
|
385
|
+
class MicProcessor extends AudioWorkletProcessor {
|
|
386
|
+
constructor() {
|
|
387
|
+
super();
|
|
388
|
+
this.buffer = [];
|
|
389
|
+
this.bufferLen = 0;
|
|
390
|
+
this.TARGET = 1600; // 100ms @ 16kHz
|
|
391
|
+
}
|
|
392
|
+
process(inputs) {
|
|
393
|
+
const input = inputs[0];
|
|
394
|
+
if (input.length > 0 && input[0].length > 0) {
|
|
395
|
+
this.buffer.push(new Float32Array(input[0]));
|
|
396
|
+
this.bufferLen += input[0].length;
|
|
397
|
+
if (this.bufferLen >= this.TARGET) {
|
|
398
|
+
const merged = new Float32Array(this.bufferLen);
|
|
399
|
+
let off = 0;
|
|
400
|
+
for (const buf of this.buffer) { merged.set(buf, off); off += buf.length; }
|
|
401
|
+
this.port.postMessage(merged);
|
|
402
|
+
this.buffer = [];
|
|
403
|
+
this.bufferLen = 0;
|
|
404
|
+
}
|
|
405
|
+
}
|
|
406
|
+
return true;
|
|
407
|
+
}
|
|
408
|
+
}
|
|
409
|
+
registerProcessor('mic-processor', MicProcessor);
|
|
410
|
+
`;
|
|
411
|
+
const blob = new Blob([workletCode], { type: 'application/javascript' });
|
|
412
|
+
const url = URL.createObjectURL(blob);
|
|
413
|
+
await audioCtx.audioWorklet.addModule(url);
|
|
414
|
+
URL.revokeObjectURL(url);
|
|
415
|
+
|
|
416
|
+
const workletNode = new AudioWorkletNode(audioCtx, 'mic-processor');
|
|
417
|
+
source.connect(workletNode);
|
|
418
|
+
workletNode.connect(audioCtx.destination);
|
|
419
|
+
|
|
420
|
+
// 3. Frame queue + processing with VAD
|
|
421
|
+
const streamQueue = [];
|
|
422
|
+
let micProcessing = false;
|
|
423
|
+
const micBuffer = [];
|
|
424
|
+
|
|
425
|
+
workletNode.port.onmessage = (e) => {
|
|
426
|
+
micBuffer.push(e.data);
|
|
427
|
+
if (!micProcessing) processMicBuffer();
|
|
428
|
+
};
|
|
429
|
+
|
|
430
|
+
async function processMicBuffer() {
|
|
431
|
+
if (micBuffer.length === 0) return;
|
|
432
|
+
micProcessing = true;
|
|
433
|
+
try {
|
|
434
|
+
const chunks = micBuffer.splice(0);
|
|
435
|
+
let totalLen = 0;
|
|
436
|
+
for (const c of chunks) totalLen += c.length;
|
|
437
|
+
const audio = new Float32Array(totalLen);
|
|
438
|
+
let offset = 0;
|
|
439
|
+
for (const c of chunks) { audio.set(c, offset); offset += c.length; }
|
|
440
|
+
|
|
441
|
+
// VAD check (if calibrated)
|
|
442
|
+
if (vad) {
|
|
443
|
+
const speaking = vad.feed_audio(audio);
|
|
444
|
+
// Toggle bone animation transitions based on speaking state
|
|
445
|
+
}
|
|
446
|
+
|
|
447
|
+
const result = await lipsync.processAudioChunk(audio);
|
|
448
|
+
if (result && result.frame_count > 0) {
|
|
449
|
+
for (let i = 0; i < result.frame_count; i++) {
|
|
450
|
+
streamQueue.push(lipsync.getFrame(result, i));
|
|
451
|
+
}
|
|
452
|
+
}
|
|
453
|
+
} finally {
|
|
454
|
+
micProcessing = false;
|
|
455
|
+
if (micBuffer.length > 0) processMicBuffer();
|
|
456
|
+
}
|
|
457
|
+
}
|
|
458
|
+
|
|
459
|
+
// 4. Consume at 30fps in render loop
|
|
460
|
+
let streamTimeAccum = 0;
|
|
461
|
+
const frameInterval = 1.0 / 30.0;
|
|
462
|
+
|
|
463
|
+
function renderLoop() {
|
|
464
|
+
requestAnimationFrame(renderLoop);
|
|
465
|
+
const delta = clock.getDelta();
|
|
466
|
+
|
|
467
|
+
streamTimeAccum += delta;
|
|
468
|
+
while (streamTimeAccum >= frameInterval) {
|
|
469
|
+
streamTimeAccum -= frameInterval;
|
|
470
|
+
if (streamQueue.length > 0) {
|
|
471
|
+
const frame = streamQueue.shift();
|
|
472
|
+
applyBlendshapes(vrm, frame);
|
|
473
|
+
}
|
|
474
|
+
}
|
|
475
|
+
|
|
476
|
+
vrm.update(delta);
|
|
477
|
+
renderer.render(scene, camera);
|
|
478
|
+
}
|
|
479
|
+
```
|
|
480
|
+
|
|
481
|
+
## API Reference
|
|
482
|
+
|
|
483
|
+
### Constructor
|
|
484
|
+
|
|
485
|
+
```ts
|
|
486
|
+
new LipSyncWasmWrapper(options?: { wasmPath?: string })
|
|
487
|
+
```
|
|
488
|
+
|
|
489
|
+
| Parameter | Type | Default | Description |
|
|
490
|
+
|-----------|------|---------|-------------|
|
|
491
|
+
| `wasmPath` | `string` | `'./lipsync_wasm_v1.js'` | Path to the WASM glue module |
|
|
492
|
+
|
|
493
|
+
### Properties
|
|
494
|
+
|
|
495
|
+
| Property | Type | Description |
|
|
496
|
+
|----------|------|-------------|
|
|
497
|
+
| `ready` | `boolean` | `true` after `init()` completes |
|
|
498
|
+
| `modelVersion` | `string` | `'v1'` |
|
|
499
|
+
| `blendshapeDim` | `number` | `111` |
|
|
500
|
+
| `wasmModule` | `object` | Direct access to WASM exports (for `IdleExpressionGenerator`, `VoiceActivityDetector`) |
|
|
501
|
+
|
|
502
|
+
### `init(options?): Promise<{ mode: string }>`
|
|
503
|
+
|
|
504
|
+
Initializes the WASM runtime, loads the ONNX model, and applies the expression preset.
|
|
505
|
+
|
|
506
|
+
| Option | Type | Default | Description |
|
|
507
|
+
|--------|------|---------|-------------|
|
|
508
|
+
| `licenseKey` | `string` | — | GoodGangLabs license key (e.g. `"ggl_xxx"`). Omit for a 30-day free trial. |
| `onProgress` | `(stage: string, percent: number) => void` | — | Progress callback. Stages: `'wasm'`, `'decrypt'`, `'onnx'`, `'onnx-fallback'` |
|
|
509
|
+
| `preset` | `boolean \| string` | `true` | `true` loads the built-in preset. Pass a URL string to load a custom preset JSON. `false` disables preset loading. |
|
|
510
|
+
|
|
511
|
+
Returns `{ mode: 'onnx' }` or `{ mode: 'heuristic' }` if ONNX is unavailable.
|
|
512
|
+
|
|
513
|
+
### `processAudio(audio: Float32Array): Promise<ProcessResult>`
|
|
514
|
+
|
|
515
|
+
Processes a complete 16kHz mono PCM audio buffer.
|
|
516
|
+
|
|
517
|
+
### `processAudioBuffer(audioBuffer: AudioBuffer): Promise<ProcessResult>`
|
|
518
|
+
|
|
519
|
+
Processes a Web Audio API `AudioBuffer` (automatically resampled to 16kHz).
|
|
520
|
+
|
|
521
|
+
### `processFile(file: File): Promise<ProcessResult>`
|
|
522
|
+
|
|
523
|
+
Decodes and processes an audio `File` object.
|
|
524
|
+
|
|
525
|
+
### `processAudioChunk(chunk: Float32Array, isLast?: boolean): Promise<ProcessResult | null>`
|
|
526
|
+
|
|
527
|
+
Feeds an audio chunk for real-time streaming. Streaming sessions are managed internally — the first call starts a session, passing `isLast = true` ends it. Returns `null` if the internal buffer has not accumulated enough data.
|
|
528
|
+
|
|
529
|
+
### `getFrame(result: ProcessResult, frameIndex: number): number[]`
|
|
530
|
+
|
|
531
|
+
Extracts a single blendshape frame from a `ProcessResult`. Returns `number[111]`.
|
|
532
|
+
|
|
533
|
+
### `getVrmaBytes(): { idle: Uint8Array, speaking: Uint8Array }`
|
|
534
|
+
|
|
535
|
+
Returns embedded VRMA bone animation data for idle and speaking states.
|
|
536
|
+
|
|
537
|
+
### `reset(): void`
|
|
538
|
+
|
|
539
|
+
Resets internal state and ends any active streaming session.
|
|
540
|
+
|
|
541
|
+
### `dispose(): void`
|
|
542
|
+
|
|
543
|
+
Releases all WASM and ONNX resources.
|
|
544
|
+
|
|
545
|
+
### ProcessResult
|
|
546
|
+
|
|
547
|
+
```ts
|
|
548
|
+
{
|
|
549
|
+
blendshapes: number[]; // Flat array: frame_count * 111 values
|
|
550
|
+
frame_count: number; // Number of output frames (30fps)
  fps: number;         // Output frame rate (see also the fps field in the TypeScript ProcessResult)
|
|
551
|
+
mode: string; // 'onnx' | 'heuristic' | 'streaming-onnx'
|
|
552
|
+
}
|
|
553
|
+
```
|
|
554
|
+
|
|
555
|
+
## Bundler Setup
|
|
556
|
+
|
|
557
|
+
### Vite
|
|
558
|
+
|
|
559
|
+
Works out of the box. No additional configuration needed.
|
|
560
|
+
|
|
561
|
+
### Webpack
|
|
562
|
+
|
|
563
|
+
Enable async WebAssembly support:
|
|
564
|
+
|
|
565
|
+
```js
|
|
566
|
+
// webpack.config.js
|
|
567
|
+
module.exports = {
|
|
568
|
+
experiments: {
|
|
569
|
+
asyncWebAssembly: true,
|
|
570
|
+
},
|
|
571
|
+
};
|
|
572
|
+
```
|
|
573
|
+
|
|
574
|
+
### CDN (no bundler)
|
|
575
|
+
|
|
576
|
+
Use `<script type="module">` with an import map:
|
|
577
|
+
|
|
578
|
+
```html
|
|
579
|
+
<script type="importmap">
|
|
580
|
+
{ "imports": {
|
|
581
|
+
"@goodganglabs/lipsync-wasm-v1": "https://your-cdn.com/lipsync-wasm-v1/lipsync-wasm-wrapper.js"
|
|
582
|
+
}}
|
|
583
|
+
</script>
|
|
584
|
+
<script src="https://cdn.jsdelivr.net/npm/onnxruntime-web@1.17.0/dist/ort.min.js"></script>
|
|
585
|
+
<script type="module">
|
|
586
|
+
import { LipSyncWasmWrapper } from '@goodganglabs/lipsync-wasm-v1';
|
|
587
|
+
// ... your code
|
|
588
|
+
</script>
|
|
589
|
+
```
|
|
590
|
+
|
|
591
|
+
When hosting WASM files on a different path than the wrapper JS, use the `wasmPath` option:
|
|
592
|
+
|
|
593
|
+
```js
|
|
594
|
+
const lipsync = new LipSyncWasmWrapper({
|
|
595
|
+
wasmPath: '/static/wasm/lipsync_wasm_v1.js'
|
|
596
|
+
});
|
|
597
|
+
```
|
|
598
|
+
|
|
599
|
+
## Deployment
|
|
600
|
+
|
|
601
|
+
`.wasm` files must be served with the `application/wasm` MIME type.
|
|
602
|
+
CORS headers are required for cross-origin usage.
|
|
603
|
+
|
|
604
|
+
## License
|
|
605
|
+
|
|
606
|
+
Proprietary — GoodGang Labs
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @goodganglabs/lipsync-wasm-v1
|
|
3
|
+
* Audio-to-blendshape lip sync engine (111-dim ARKit, phoneme model)
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
/**
 * Result of a batch or streaming audio processing call.
 */
export interface ProcessResult {
  /** Flat blendshape weights: frame_count * 111 values, frame-major. */
  blendshapes: number[];
  /** Number of output frames produced. */
  frame_count: number;
  /** Output frame rate in frames per second (30 per the package README). */
  fps: number;
  /** Inference mode that produced the result: 'onnx' | 'heuristic' | 'streaming-onnx'. */
  mode?: string;
}
|
|
12
|
+
|
|
13
|
+
/**
 * Resolved value of init().
 */
export interface InitResult {
  /** 'onnx' when ONNX Runtime inference is active; 'heuristic' when the fallback path is used. */
  mode: 'onnx' | 'heuristic';
}
|
|
16
|
+
|
|
17
|
+
/**
 * Embedded VRMA bone animation payloads returned by getVrmaBytes().
 */
export interface VrmaBytes {
  /** VRMA bytes for the idle pose animation. */
  idle: Uint8Array;
  /** VRMA bytes for the speaking pose animation. */
  speaking: Uint8Array;
}
|
|
21
|
+
|
|
22
|
+
/**
 * Options accepted by init().
 */
export interface InitOptions {
  /** GoodGangLabs license key (e.g. "ggl_xxx"). Omit for 30-day free trial. */
  licenseKey?: string;
  /** Progress callback. Stages: 'wasm', 'decrypt', 'onnx', 'onnx-fallback'. */
  onProgress?: (stage: string, percent: number) => void;
  /**
   * true (default) loads the built-in preset; a string is treated as a URL to a
   * custom preset JSON; false disables preset loading.
   */
  preset?: boolean | string;
}
|
|
28
|
+
|
|
29
|
+
/**
 * Options accepted by the LipSyncWasmWrapper constructor.
 */
export interface ConstructorOptions {
  /** Path to the WASM glue module. Default: './lipsync_wasm_v1.js'. */
  wasmPath?: string;
}
|
|
32
|
+
|
|
33
|
+
/**
 * WebAssembly-based real-time audio-to-blendshape lip sync engine (V1).
 * Converts 16kHz mono PCM audio into 111-dim ARKit-compatible blendshape
 * frames at 30fps.
 */
export class LipSyncWasmWrapper {
  constructor(options?: ConstructorOptions);

  /** true after init() completes. */
  readonly ready: boolean;
  /** Model generation identifier. */
  readonly modelVersion: 'v1';
  /** Number of blendshape values per output frame. */
  readonly blendshapeDim: 111;
  /** Active inference mode — presumably null until init() resolves; confirm against the implementation. */
  readonly mode: 'onnx' | 'heuristic' | null;
  /** Direct access to raw WASM exports (e.g. IdleExpressionGenerator, VoiceActivityDetector). */
  readonly wasmModule: any;

  /**
   * Initializes the WASM runtime, loads the ONNX model, and applies the
   * expression preset. Resolves with the active inference mode.
   */
  init(options?: InitOptions): Promise<InitResult>;

  /** Processes a complete 16kHz mono PCM audio buffer. */
  processAudio(audio: Float32Array): Promise<ProcessResult>;
  /** Processes a Web Audio API AudioBuffer (automatically resampled to 16kHz). */
  processAudioBuffer(audioBuffer: AudioBuffer): Promise<ProcessResult>;
  /** Decodes and processes an audio File object. */
  processFile(file: File): Promise<ProcessResult>;

  /**
   * Feeds an audio chunk for real-time streaming. The first call starts a
   * session; pass isLast = true to end it. Returns null while the internal
   * buffer has not accumulated enough data.
   */
  processAudioChunk(audioChunk: Float32Array, isLast?: boolean): Promise<ProcessResult | null>;

  /** Extracts a single blendshape frame (number[111]) from a ProcessResult. */
  getFrame(result: ProcessResult, frameIndex: number): number[];

  /** Returns embedded VRMA bone animation data for idle and speaking states. */
  getVrmaBytes(): VrmaBytes;
  /** Resets internal state and ends any active streaming session. */
  reset(): void;
  /** Releases all WASM and ONNX resources. */
  dispose(): void;
}
|