@omote/core 0.1.3 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +462 -207
- package/dist/index.d.ts +462 -207
- package/dist/index.js +542 -186
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +534 -178
- package/dist/index.mjs.map +1 -1
- package/package.json +1 -1
package/dist/index.mjs
CHANGED
@@ -220,6 +220,19 @@ var AudioScheduler = class {
   async initialize() {
     console.log("[AudioScheduler] Ready for lazy initialization");
   }
+  /**
+   * Eagerly create and warm up the AudioContext
+   *
+   * Call this when a playback session starts (e.g., when AI response begins).
+   * The AudioContext needs time to initialize the audio hardware — on Windows
+   * this can take 50-100ms. By warming up early (before audio data arrives),
+   * the context is fully ready when schedule() is first called.
+   *
+   * Must be called after a user gesture (click/tap) for autoplay policy.
+   */
+  async warmup() {
+    await this.ensureContext();
+  }
   /**
    * Ensure AudioContext is created and ready
    * Called lazily on first schedule() - requires user gesture
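The new warmup() only calls ensureContext(), so a consumer can pre-create the AudioContext from the user gesture that starts a session, before any audio has arrived. A minimal sketch of that wiring (the AudioScheduler constructor options and the streaming helper are assumptions; only warmup() and schedule() are shown in the hunk above):

  // Hypothetical wiring: warm the context on the click that starts playback.
  const scheduler = new AudioScheduler({ sampleRate: 16000 }); // options assumed
  startButton.addEventListener("click", async () => {
    await scheduler.warmup();                              // AudioContext ready before audio arrives
    beginStreaming((chunk) => scheduler.schedule(chunk));  // chunk: Float32Array, as schedule() expects
  });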
@@ -250,7 +263,7 @@ var AudioScheduler = class {
     const ctx = await this.ensureContext();
     const channels = this.options.channels ?? 1;
     if (!this.isPlaying) {
-      this.nextPlayTime = ctx.currentTime;
+      this.nextPlayTime = ctx.currentTime + 0.05;
       this.isPlaying = true;
     }
     const audioBuffer = ctx.createBuffer(channels, audioData.length, ctx.sampleRate);
@@ -324,8 +337,19 @@ var AudioScheduler = class {
   }
   /**
    * Reset scheduler state for new playback session
+   * Stops any orphaned sources that weren't cleaned up by cancelAll()
    */
   reset() {
+    if (this.context) {
+      const now = this.context.currentTime;
+      for (const { source, gainNode } of this.scheduledSources) {
+        try {
+          gainNode.gain.setValueAtTime(0, now);
+          source.stop(now);
+        } catch {
+        }
+      }
+    }
     this.nextPlayTime = 0;
     this.isPlaying = false;
     this.scheduledSources = [];
@@ -453,7 +477,7 @@ var LAMPipeline = class {
     newBuffer.set(this.buffer, 0);
     newBuffer.set(samples, this.buffer.length);
     this.buffer = newBuffer;
-
+    while (this.buffer.length >= this.REQUIRED_SAMPLES) {
       await this.processBuffer(lam);
     }
   }
@@ -606,12 +630,20 @@ var LAMPipeline = class {
 };

 // src/audio/SyncedAudioPipeline.ts
+function pcm16ToFloat32(buffer) {
+  const byteLen = buffer.byteLength & ~1;
+  const int16 = byteLen === buffer.byteLength ? new Int16Array(buffer) : new Int16Array(buffer, 0, byteLen / 2);
+  const float32 = new Float32Array(int16.length);
+  for (let i = 0; i < int16.length; i++) {
+    float32[i] = int16[i] / 32768;
+  }
+  return float32;
+}
 var SyncedAudioPipeline = class extends EventEmitter {
   constructor(options) {
     super();
     this.options = options;
-    this.
-    this.bufferedChunks = [];
+    this.playbackStarted = false;
    this.monitorInterval = null;
    this.frameAnimationId = null;
    const sampleRate = options.sampleRate ?? 16e3;
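The new pcm16ToFloat32 helper masks the byte length down to an even count before viewing the buffer as Int16, so an odd trailing byte in a network chunk is dropped instead of throwing, and dividing by 32768 maps the signed 16-bit range [-32768, 32767] onto roughly [-1, 1). A small worked example with made-up bytes:

  // little-endian PCM16: 0x0000 -> 0.0, 0x4000 (16384) -> 0.5, 0x8000 (-32768) -> -1.0
  const bytes = new Uint8Array([0x00, 0x00, 0x00, 0x40, 0x00, 0x80, 0x7f]); // 7 bytes; the odd last byte is ignored
  const samples = pcm16ToFloat32(bytes.buffer);
  // samples ≈ Float32Array [0, 0.5, -1]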
@@ -622,11 +654,6 @@ var SyncedAudioPipeline = class extends EventEmitter {
     });
     this.lamPipeline = new LAMPipeline({
       sampleRate,
-      onInference: (frameCount) => {
-        if (this.waitingForFirstLAM) {
-          this.onFirstLAMComplete();
-        }
-      },
       onError: (error) => {
         this.emit("error", error);
       }
@@ -642,25 +669,24 @@ var SyncedAudioPipeline = class extends EventEmitter {
    * Start a new playback session
    *
    * Resets all state and prepares for incoming audio chunks.
-   *
+   * Audio will be scheduled immediately as chunks arrive (no buffering).
    */
   start() {
+    this.stopMonitoring();
     this.scheduler.reset();
     this.coalescer.reset();
     this.lamPipeline.reset();
-    this.
-    this.
+    this.playbackStarted = false;
+    this.scheduler.warmup();
     this.startFrameLoop();
     this.startMonitoring();
   }
   /**
    * Receive audio chunk from network
    *
-   *
-   *
-   *
-   * - Audio scheduling waits until first LAM completes
-   * - Then all buffered audio is scheduled together with LAM frames
+   * Audio-first design: schedules audio immediately, LAM runs in background.
+   * This prevents LAM inference (50-300ms) from blocking audio scheduling,
+   * which caused audible stuttering with continuous audio streams.
    *
    * @param chunk - Uint8Array containing Int16 PCM audio
    */
@@ -669,51 +695,15 @@ var SyncedAudioPipeline = class extends EventEmitter {
     if (!combined) {
       return;
     }
-    const
-    const
-
-
-
-    if (this.waitingForFirstLAM) {
-      this.bufferedChunks.push(combined);
-      const estimatedTime = this.scheduler.getCurrentTime();
-      await this.lamPipeline.push(float32, estimatedTime, this.options.lam);
-    } else {
-      const scheduleTime = await this.scheduler.schedule(float32);
-      await this.lamPipeline.push(float32, scheduleTime, this.options.lam);
+    const float32 = pcm16ToFloat32(combined);
+    const scheduleTime = await this.scheduler.schedule(float32);
+    if (!this.playbackStarted) {
+      this.playbackStarted = true;
+      this.emit("playback_start", scheduleTime);
     }
-
-
-
-   *
-   * This is the critical synchronization point:
-   * - LAM frames are now ready in the queue
-   * - Schedule all buffered audio chunks
-   * - Adjust LAM frame timestamps to match actual schedule time
-   * - Audio and LAM start playing together, perfectly synchronized
-   */
-  async onFirstLAMComplete() {
-    this.waitingForFirstLAM = false;
-    const beforeSchedule = this.scheduler.getCurrentTime();
-    let actualStartTime = beforeSchedule;
-    for (let i = 0; i < this.bufferedChunks.length; i++) {
-      const buffer = this.bufferedChunks[i];
-      const int16 = new Int16Array(buffer);
-      const float32 = new Float32Array(int16.length);
-      for (let j = 0; j < int16.length; j++) {
-        float32[j] = int16[j] / 32768;
-      }
-      const scheduleTime = await this.scheduler.schedule(float32);
-      if (i === 0) {
-        actualStartTime = scheduleTime;
-      }
-    }
-    const timeOffset = actualStartTime - beforeSchedule;
-    if (timeOffset !== 0) {
-      this.lamPipeline.adjustTimestamps(timeOffset);
-    }
-    this.bufferedChunks = [];
-    this.emit("playback_start", actualStartTime);
+    this.lamPipeline.push(float32, scheduleTime, this.options.lam).catch((err) => {
+      this.emit("error", err);
+    });
   }
   /**
    * End of audio stream
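With the buffering path removed, the pipeline now emits "playback_start" when the first chunk is scheduled and reports background LAM failures through the "error" event. A sketch of how a consumer might wire this up (the chunk-ingest method name is not visible in these hunks, so receiveChunk is a placeholder; the EventEmitter base is assumed to expose on(), and only sampleRate/lam are constructor options seen in the diff):

  const pipeline = new SyncedAudioPipeline({ sampleRate: 16000, lam }); // `lam` assumed to be a loaded LAM model
  pipeline.on("playback_start", (t) => console.log("first audio scheduled at", t));
  pipeline.on("error", (err) => console.error("background LAM inference failed", err));
  pipeline.on("playback_complete", () => console.log("session done"));
  pipeline.start(); // resets state, warms the scheduler, starts the frame loop and monitoring
  socket.onmessage = (e) => pipeline.receiveChunk(new Uint8Array(e.data)); // placeholder method name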
@@ -745,10 +735,9 @@ var SyncedAudioPipeline = class extends EventEmitter {
   async stop(fadeOutMs = 50) {
     this.stopMonitoring();
     await this.scheduler.cancelAll(fadeOutMs);
-    this.bufferedChunks = [];
     this.coalescer.reset();
     this.lamPipeline.reset();
-    this.
+    this.playbackStarted = false;
     this.emit("playback_complete", void 0);
   }
   /**
@@ -805,8 +794,7 @@ var SyncedAudioPipeline = class extends EventEmitter {
    */
   getState() {
     return {
-
-      bufferedChunks: this.bufferedChunks.length,
+      playbackStarted: this.playbackStarted,
       coalescerFill: this.coalescer.fillLevel,
       lamFill: this.lamPipeline.fillLevel,
       queuedFrames: this.lamPipeline.queuedFrameCount,
@@ -822,7 +810,6 @@
     this.scheduler.dispose();
     this.coalescer.reset();
     this.lamPipeline.reset();
-    this.bufferedChunks = [];
   }
 };

@@ -2049,7 +2036,7 @@ function hasWebGPUApi() {
   return "gpu" in navigator && navigator.gpu !== void 0;
 }
 function getRecommendedBackend() {
-  if (isIOS()) {
+  if (isSafari() || isIOS()) {
     return "wasm";
   }
   return "webgpu";
@@ -2093,6 +2080,14 @@ function shouldEnableWasmProxy() {
   }
   return true;
 }
+function isSafari() {
+  if (typeof navigator === "undefined") return false;
+  const ua = navigator.userAgent.toLowerCase();
+  return /safari/.test(ua) && !/chrome|crios|fxios|chromium|edg/.test(ua);
+}
+function shouldUseCpuLipSync() {
+  return isSafari();
+}
 function isSpeechRecognitionAvailable() {
   if (typeof window === "undefined") return false;
   return "SpeechRecognition" in window || "webkitSpeechRecognition" in window;
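isSafari() is a plain user-agent check that excludes Chromium- and Firefox-based browsers (which also carry "Safari" in their UA string), and shouldUseCpuLipSync() simply forwards to it; both are added to the export list at the end of this diff. A quick illustration of how the new checks combine with the backend pick above:

  // On desktop Safari: isSafari() === true, so getRecommendedBackend() now returns "wasm"
  // and shouldUseCpuLipSync() suggests the small CPU lip-sync model.
  if (shouldUseCpuLipSync()) {
    console.log("Safari detected: prefer the WASM backend and the CPU lip-sync model");
  }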
@@ -2239,8 +2234,7 @@ function isOnnxRuntimeLoaded() {
   return ortInstance !== null;
 }

-// src/inference/
-var logger2 = createLogger("Wav2Vec2");
+// src/inference/blendshapeUtils.ts
 var LAM_BLENDSHAPES = [
   "browDownLeft",
   "browDownRight",
@@ -2295,40 +2289,7 @@ var LAM_BLENDSHAPES = [
   "noseSneerRight",
   "tongueOut"
 ];
-var
-  "<pad>",
-  "<s>",
-  "</s>",
-  "<unk>",
-  "|",
-  "E",
-  "T",
-  "A",
-  "O",
-  "N",
-  "I",
-  "H",
-  "S",
-  "R",
-  "D",
-  "L",
-  "U",
-  "M",
-  "W",
-  "C",
-  "F",
-  "G",
-  "Y",
-  "P",
-  "B",
-  "V",
-  "K",
-  "'",
-  "X",
-  "J",
-  "Q",
-  "Z"
-];
+var ARKIT_BLENDSHAPES = LAM_BLENDSHAPES;
 var ARKIT_SYMMETRIC_PAIRS = [
   ["jawLeft", "jawRight"],
   ["mouthLeft", "mouthRight"],
@@ -2364,6 +2325,107 @@ function symmetrizeBlendshapes(frame) {
   }
   return result;
 }
+var WAV2ARKIT_BLENDSHAPES = [
+  "browDownLeft",
+  "browDownRight",
+  "browInnerUp",
+  "browOuterUpLeft",
+  "browOuterUpRight",
+  "cheekPuff",
+  "cheekSquintLeft",
+  "cheekSquintRight",
+  "eyeBlinkLeft",
+  "eyeBlinkRight",
+  "eyeLookDownLeft",
+  "eyeLookDownRight",
+  "eyeLookInLeft",
+  "eyeLookInRight",
+  "eyeLookOutLeft",
+  "eyeLookOutRight",
+  "eyeLookUpLeft",
+  "eyeLookUpRight",
+  "eyeSquintLeft",
+  "eyeSquintRight",
+  "eyeWideLeft",
+  "eyeWideRight",
+  "jawForward",
+  "jawLeft",
+  "jawOpen",
+  "mouthFrownLeft",
+  "mouthFrownRight",
+  "mouthFunnel",
+  "mouthLeft",
+  "mouthLowerDownLeft",
+  "mouthLowerDownRight",
+  "mouthPressLeft",
+  "mouthPressRight",
+  "mouthPucker",
+  "mouthRight",
+  "mouthRollLower",
+  "mouthRollUpper",
+  "mouthShrugLower",
+  "mouthShrugUpper",
+  "mouthSmileLeft",
+  "mouthSmileRight",
+  "mouthStretchLeft",
+  "mouthStretchRight",
+  "mouthUpperUpLeft",
+  "mouthUpperUpRight",
+  "noseSneerLeft",
+  "noseSneerRight",
+  "tongueOut",
+  "mouthClose",
+  "mouthDimpleLeft",
+  "mouthDimpleRight",
+  "jawRight"
+];
+var REMAP_WAV2ARKIT_TO_LAM = WAV2ARKIT_BLENDSHAPES.map(
+  (name) => LAM_BLENDSHAPES.indexOf(name)
+);
+function remapWav2ArkitToLam(frame) {
+  const result = new Float32Array(52);
+  for (let i = 0; i < 52; i++) {
+    result[REMAP_WAV2ARKIT_TO_LAM[i]] = frame[i];
+  }
+  return result;
+}
+
+// src/inference/Wav2Vec2Inference.ts
+var logger2 = createLogger("Wav2Vec2");
+var CTC_VOCAB = [
+  "<pad>",
+  "<s>",
+  "</s>",
+  "<unk>",
+  "|",
+  "E",
+  "T",
+  "A",
+  "O",
+  "N",
+  "I",
+  "H",
+  "S",
+  "R",
+  "D",
+  "L",
+  "U",
+  "M",
+  "W",
+  "C",
+  "F",
+  "G",
+  "Y",
+  "P",
+  "B",
+  "V",
+  "K",
+  "'",
+  "X",
+  "J",
+  "Q",
+  "Z"
+];
 var Wav2Vec2Inference = class {
   constructor(config) {
     this.session = null;
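WAV2ARKIT_BLENDSHAPES lists the 52 coefficients in the order the new CPU model emits them, and REMAP_WAV2ARKIT_TO_LAM precomputes, for each of those positions, the index of the same-named coefficient in LAM_BLENDSHAPES; remapWav2ArkitToLam then scatters a model-order frame into LAM order. A tiny illustration (values invented, index lookups done by name so no ordering is hard-coded):

  const raw = new Float32Array(52);
  raw[WAV2ARKIT_BLENDSHAPES.indexOf("jawOpen")] = 0.8;   // model-order frame
  const lamFrame = remapWav2ArkitToLam(raw);
  lamFrame[LAM_BLENDSHAPES.indexOf("jawOpen")];          // 0.8, same value, now in LAM ordering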
@@ -2602,6 +2664,7 @@ var Wav2Vec2Inference = class {
       blendshapes,
       asrLogits,
       text,
+      numFrames: numA2EFrames,
       numA2EFrames,
       numASRFrames,
       inferenceTimeMs
@@ -2968,8 +3031,293 @@ var WhisperInference = class _WhisperInference {
   }
 };

+// src/inference/Wav2ArkitCpuInference.ts
+var logger5 = createLogger("Wav2ArkitCpu");
+var Wav2ArkitCpuInference = class {
+  constructor(config) {
+    this.session = null;
+    this.ort = null;
+    this._backend = "wasm";
+    this.isLoading = false;
+    // Inference queue for handling concurrent calls
+    this.inferenceQueue = Promise.resolve();
+    this.config = config;
+  }
+  get backend() {
+    return this.session ? this._backend : null;
+  }
+  get isLoaded() {
+    return this.session !== null;
+  }
+  /**
+   * Load the ONNX model
+   */
+  async load() {
+    if (this.isLoading) {
+      throw new Error("Model is already loading");
+    }
+    if (this.session) {
+      throw new Error("Model already loaded. Call dispose() first.");
+    }
+    this.isLoading = true;
+    const startTime = performance.now();
+    const telemetry = getTelemetry();
+    const span = telemetry?.startSpan("Wav2ArkitCpu.load", {
+      "model.url": this.config.modelUrl,
+      "model.backend_requested": this.config.backend || "wasm"
+    });
+    try {
+      const preference = this.config.backend || "wasm";
+      logger5.info("Loading ONNX Runtime...", { preference });
+      const { ort, backend } = await getOnnxRuntimeForPreference(preference);
+      this.ort = ort;
+      this._backend = backend;
+      logger5.info("ONNX Runtime loaded", { backend: this._backend });
+      const cache = getModelCache();
+      const modelUrl = this.config.modelUrl;
+      const isCached = await cache.has(modelUrl);
+      let modelBuffer;
+      if (isCached) {
+        logger5.debug("Loading model from cache", { modelUrl });
+        modelBuffer = await cache.get(modelUrl);
+        if (!modelBuffer) {
+          logger5.warn("Cache corruption detected, clearing and retrying", { modelUrl });
+          await cache.delete(modelUrl);
+          modelBuffer = await fetchWithCache(modelUrl);
+        }
+      } else {
+        logger5.debug("Fetching and caching model", { modelUrl });
+        modelBuffer = await fetchWithCache(modelUrl);
+      }
+      if (!modelBuffer) {
+        throw new Error(`Failed to load model: ${modelUrl}`);
+      }
+      logger5.debug("Creating ONNX session", {
+        size: formatBytes(modelBuffer.byteLength),
+        backend: this._backend
+      });
+      const sessionOptions = getSessionOptions(this._backend);
+      const modelData = new Uint8Array(modelBuffer);
+      this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
+      const loadTimeMs = performance.now() - startTime;
+      logger5.info("Model loaded successfully", {
+        backend: this._backend,
+        loadTimeMs: Math.round(loadTimeMs),
+        inputs: this.session.inputNames,
+        outputs: this.session.outputNames
+      });
+      span?.setAttributes({
+        "model.backend": this._backend,
+        "model.load_time_ms": loadTimeMs,
+        "model.cached": isCached
+      });
+      span?.end();
+      telemetry?.recordHistogram("omote.model.load_time", loadTimeMs, {
+        model: "wav2arkit_cpu",
+        backend: this._backend
+      });
+      logger5.debug("Running warmup inference");
+      const warmupStart = performance.now();
+      const silentAudio = new Float32Array(16e3);
+      await this.infer(silentAudio);
+      const warmupTimeMs = performance.now() - warmupStart;
+      logger5.info("Warmup inference complete", {
+        warmupTimeMs: Math.round(warmupTimeMs),
+        backend: this._backend
+      });
+      telemetry?.recordHistogram("omote.model.warmup_time", warmupTimeMs, {
+        model: "wav2arkit_cpu",
+        backend: this._backend
+      });
+      return {
+        backend: this._backend,
+        loadTimeMs,
+        inputNames: [...this.session.inputNames],
+        outputNames: [...this.session.outputNames]
+      };
+    } catch (error) {
+      span?.endWithError(error instanceof Error ? error : new Error(String(error)));
+      telemetry?.incrementCounter("omote.errors.total", 1, {
+        model: "wav2arkit_cpu",
+        error_type: "load_failed"
+      });
+      throw error;
+    } finally {
+      this.isLoading = false;
+    }
+  }
+  /**
+   * Run inference on raw audio
+   *
+   * Accepts variable-length audio (not fixed to 16000 samples).
+   * Output frames = ceil(30 * numSamples / 16000).
+   *
+   * @param audioSamples - Float32Array of raw audio at 16kHz
+   * @param _identityIndex - Ignored (identity 11 is baked into the model)
+   */
+  async infer(audioSamples, _identityIndex) {
+    if (!this.session) {
+      throw new Error("Model not loaded. Call load() first.");
+    }
+    const audioCopy = new Float32Array(audioSamples);
+    const feeds = {
+      "audio_waveform": new this.ort.Tensor("float32", audioCopy, [1, audioCopy.length])
+    };
+    return this.queueInference(feeds, audioCopy.length);
+  }
+  /**
+   * Queue inference to serialize ONNX session calls
+   */
+  queueInference(feeds, inputSamples) {
+    return new Promise((resolve, reject) => {
+      this.inferenceQueue = this.inferenceQueue.then(async () => {
+        const telemetry = getTelemetry();
+        const span = telemetry?.startSpan("Wav2ArkitCpu.infer", {
+          "inference.backend": this._backend,
+          "inference.input_samples": inputSamples
+        });
+        try {
+          const startTime = performance.now();
+          const results = await this.session.run(feeds);
+          const inferenceTimeMs = performance.now() - startTime;
+          const blendshapeOutput = results["blendshapes"];
+          if (!blendshapeOutput) {
+            throw new Error("Missing blendshapes output from model");
+          }
+          const blendshapeData = blendshapeOutput.data;
+          const numFrames = blendshapeOutput.dims[1];
+          const numBlendshapes = blendshapeOutput.dims[2];
+          const blendshapes = [];
+          for (let f = 0; f < numFrames; f++) {
+            const rawFrame = blendshapeData.slice(f * numBlendshapes, (f + 1) * numBlendshapes);
+            const remapped = remapWav2ArkitToLam(rawFrame);
+            blendshapes.push(symmetrizeBlendshapes(remapped));
+          }
+          logger5.trace("Inference completed", {
+            inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
+            numFrames,
+            inputSamples
+          });
+          span?.setAttributes({
+            "inference.duration_ms": inferenceTimeMs,
+            "inference.frames": numFrames
+          });
+          span?.end();
+          telemetry?.recordHistogram("omote.inference.latency", inferenceTimeMs, {
+            model: "wav2arkit_cpu",
+            backend: this._backend
+          });
+          telemetry?.incrementCounter("omote.inference.total", 1, {
+            model: "wav2arkit_cpu",
+            backend: this._backend,
+            status: "success"
+          });
+          resolve({
+            blendshapes,
+            numFrames,
+            inferenceTimeMs
+          });
+        } catch (err) {
+          span?.endWithError(err instanceof Error ? err : new Error(String(err)));
+          telemetry?.incrementCounter("omote.inference.total", 1, {
+            model: "wav2arkit_cpu",
+            backend: this._backend,
+            status: "error"
+          });
+          reject(err);
+        }
+      });
+    });
+  }
+  /**
+   * Dispose of the model and free resources
+   */
+  async dispose() {
+    if (this.session) {
+      await this.session.release();
+      this.session = null;
+    }
+  }
+};
+
+// src/inference/createLipSync.ts
+var logger6 = createLogger("createLipSync");
+function createLipSync(config) {
+  const mode = config.mode ?? "auto";
+  const fallbackOnError = config.fallbackOnError ?? true;
+  let useCpu;
+  if (mode === "cpu") {
+    useCpu = true;
+    logger6.info("Forcing CPU lip sync model (wav2arkit_cpu)");
+  } else if (mode === "gpu") {
+    useCpu = false;
+    logger6.info("Forcing GPU lip sync model (Wav2Vec2)");
+  } else {
+    useCpu = isSafari();
+    logger6.info("Auto-detected lip sync model", {
+      useCpu,
+      isSafari: isSafari()
+    });
+  }
+  if (useCpu) {
+    logger6.info("Creating Wav2ArkitCpuInference (1.8MB, WASM)");
+    return new Wav2ArkitCpuInference({
+      modelUrl: config.cpuModelUrl
+    });
+  }
+  const gpuInstance = new Wav2Vec2Inference({
+    modelUrl: config.gpuModelUrl,
+    backend: config.gpuBackend ?? "auto",
+    numIdentityClasses: config.numIdentityClasses
+  });
+  if (fallbackOnError) {
+    logger6.info("Creating Wav2Vec2Inference with CPU fallback");
+    return new LipSyncWithFallback(gpuInstance, config);
+  }
+  logger6.info("Creating Wav2Vec2Inference (no fallback)");
+  return gpuInstance;
+}
+var LipSyncWithFallback = class {
+  constructor(gpuInstance, config) {
+    this.hasFallenBack = false;
+    this.implementation = gpuInstance;
+    this.config = config;
+  }
+  get backend() {
+    return this.implementation.backend;
+  }
+  get isLoaded() {
+    return this.implementation.isLoaded;
+  }
+  async load() {
+    try {
+      return await this.implementation.load();
+    } catch (error) {
+      logger6.warn("GPU model load failed, falling back to CPU model", {
+        error: error instanceof Error ? error.message : String(error)
+      });
+      try {
+        await this.implementation.dispose();
+      } catch {
+      }
+      this.implementation = new Wav2ArkitCpuInference({
+        modelUrl: this.config.cpuModelUrl
+      });
+      this.hasFallenBack = true;
+      logger6.info("Fallback to Wav2ArkitCpuInference successful");
+      return await this.implementation.load();
+    }
+  }
+  async infer(audioSamples, identityIndex) {
+    return this.implementation.infer(audioSamples, identityIndex);
+  }
+  async dispose() {
+    return this.implementation.dispose();
+  }
+};
+
 // src/inference/SileroVADInference.ts
-var
+var logger7 = createLogger("SileroVAD");
 var SileroVADInference = class {
   constructor(config) {
     this.session = null;
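Putting the new lip-sync pieces together: createLipSync picks Wav2ArkitCpuInference on Safari (or when mode is "cpu") and otherwise wraps Wav2Vec2Inference in LipSyncWithFallback. A usage sketch under the option names visible above; the model URLs are placeholders, and the load()/infer() result shapes shown are the ones returned by the CPU class:

  const lipSync = createLipSync({
    mode: "auto",                                              // "cpu" | "gpu" | "auto"
    cpuModelUrl: "https://example.com/wav2arkit_cpu.onnx",     // placeholder URL
    gpuModelUrl: "https://example.com/wav2vec2_arkit.onnx",    // placeholder URL
    gpuBackend: "auto",
    fallbackOnError: true                                      // GPU load failure falls back to the CPU model
  });
  const { backend, loadTimeMs } = await lipSync.load();
  const { blendshapes, numFrames } = await lipSync.infer(float32AudioAt16kHz);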
@@ -3041,23 +3389,23 @@ var SileroVADInference = class {
       "model.sample_rate": this.config.sampleRate
     });
     try {
-
+      logger7.info("Loading ONNX Runtime...", { preference: this.config.backend });
       const { ort, backend } = await getOnnxRuntimeForPreference(this.config.backend);
       this.ort = ort;
       this._backend = backend;
-
+      logger7.info("ONNX Runtime loaded", { backend: this._backend });
       const cache = getModelCache();
       const modelUrl = this.config.modelUrl;
       const isCached = await cache.has(modelUrl);
       let modelBuffer;
       if (isCached) {
-
+        logger7.debug("Loading model from cache", { modelUrl });
         modelBuffer = await cache.get(modelUrl);
       } else {
-
+        logger7.debug("Fetching and caching model", { modelUrl });
         modelBuffer = await fetchWithCache(modelUrl);
       }
-
+      logger7.debug("Creating ONNX session", {
         size: formatBytes(modelBuffer.byteLength),
         backend: this._backend
       });
@@ -3066,7 +3414,7 @@ var SileroVADInference = class {
       this.session = await ort.InferenceSession.create(modelData, sessionOptions);
       this.reset();
       const loadTimeMs = performance.now() - startTime;
-
+      logger7.info("Model loaded successfully", {
         backend: this._backend,
         loadTimeMs: Math.round(loadTimeMs),
         sampleRate: this.config.sampleRate,
@@ -3219,7 +3567,7 @@ var SileroVADInference = class {
           this.preSpeechBuffer.shift();
         }
       }
-
+      logger7.trace("Skipping VAD inference - audio too quiet", {
         rms: Math.round(rms * 1e4) / 1e4,
         threshold: MIN_ENERGY_THRESHOLD
       });
@@ -3273,7 +3621,7 @@ var SileroVADInference = class {
       if (isSpeech && !this.wasSpeaking) {
         preSpeechChunks = [...this.preSpeechBuffer];
         this.preSpeechBuffer = [];
-
+        logger7.debug("Speech started with pre-speech buffer", {
           preSpeechChunks: preSpeechChunks.length,
           durationMs: Math.round(preSpeechChunks.length * this.getChunkDurationMs())
         });
@@ -3286,7 +3634,7 @@ var SileroVADInference = class {
         this.preSpeechBuffer = [];
       }
       this.wasSpeaking = isSpeech;
-
+      logger7.trace("VAD inference completed", {
        probability: Math.round(probability * 1e3) / 1e3,
        isSpeech,
        inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100
@@ -3342,7 +3690,7 @@ var SileroVADInference = class {
 SileroVADInference.isWebGPUAvailable = isWebGPUAvailable;

 // src/inference/SileroVADWorker.ts
-var
+var logger8 = createLogger("SileroVADWorker");
 var WASM_CDN_PATH2 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
 var LOAD_TIMEOUT_MS = 1e4;
 var INFERENCE_TIMEOUT_MS = 1e3;
@@ -3605,7 +3953,7 @@ var SileroVADWorker = class {
       this.handleWorkerMessage(event.data);
     };
     worker.onerror = (error) => {
-
+      logger8.error("Worker error", { error: error.message });
       for (const [, resolver] of this.pendingResolvers) {
         resolver.reject(new Error(`Worker error: ${error.message}`));
       }
@@ -3681,9 +4029,9 @@ var SileroVADWorker = class {
       "model.sample_rate": this.config.sampleRate
     });
     try {
-
+      logger8.info("Creating VAD worker...");
       this.worker = this.createWorker();
-
+      logger8.info("Loading model in worker...", {
         modelUrl: this.config.modelUrl,
         sampleRate: this.config.sampleRate
       });
@@ -3699,7 +4047,7 @@ var SileroVADWorker = class {
       );
       this._isLoaded = true;
       const loadTimeMs = performance.now() - startTime;
-
+      logger8.info("VAD worker loaded successfully", {
        backend: "wasm",
        loadTimeMs: Math.round(loadTimeMs),
        workerLoadTimeMs: Math.round(result.loadTimeMs),
@@ -3806,7 +4154,7 @@ var SileroVADWorker = class {
       if (isSpeech && !this.wasSpeaking) {
         preSpeechChunks = [...this.preSpeechBuffer];
         this.preSpeechBuffer = [];
-
+        logger8.debug("Speech started with pre-speech buffer", {
           preSpeechChunks: preSpeechChunks.length,
           durationMs: Math.round(preSpeechChunks.length * this.getChunkDurationMs())
         });
@@ -3819,7 +4167,7 @@ var SileroVADWorker = class {
         this.preSpeechBuffer = [];
       }
       this.wasSpeaking = isSpeech;
-
+      logger8.trace("VAD worker inference completed", {
        probability: Math.round(result.probability * 1e3) / 1e3,
        isSpeech,
        inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
@@ -3887,18 +4235,18 @@ var SileroVADWorker = class {
 };

 // src/inference/createSileroVAD.ts
-var
+var logger9 = createLogger("createSileroVAD");
 function supportsVADWorker() {
   if (typeof Worker === "undefined") {
-
+    logger9.debug("Worker not supported: Worker constructor undefined");
     return false;
   }
   if (typeof URL === "undefined" || typeof URL.createObjectURL === "undefined") {
-
+    logger9.debug("Worker not supported: URL.createObjectURL unavailable");
     return false;
   }
   if (typeof Blob === "undefined") {
-
+    logger9.debug("Worker not supported: Blob constructor unavailable");
     return false;
   }
   return true;
@@ -3908,19 +4256,19 @@ function createSileroVAD(config) {
   let useWorker;
   if (config.useWorker !== void 0) {
     useWorker = config.useWorker;
-
+    logger9.debug("Worker preference explicitly set", { useWorker });
   } else {
     const workerSupported = supportsVADWorker();
     const onMobile = isMobile();
     useWorker = workerSupported && !onMobile;
-
+    logger9.debug("Auto-detected Worker preference", {
       useWorker,
       workerSupported,
       onMobile
     });
   }
   if (useWorker) {
-
+    logger9.info("Creating SileroVADWorker (off-main-thread)");
     const worker = new SileroVADWorker({
       modelUrl: config.modelUrl,
       sampleRate: config.sampleRate,
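Only the logging changed in the VAD factory, but the hunk shows its selection logic: an explicit config.useWorker wins, otherwise a worker is used whenever workers are supported and the device is not mobile. A minimal call under those option names (the model URL is a placeholder):

  const vad = createSileroVAD({
    modelUrl: "https://example.com/silero_vad.onnx", // placeholder URL
    sampleRate: 16000
    // useWorker: false would force the main-thread SileroVADInference
  });
  await vad.load();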
@@ -3932,7 +4280,7 @@ function createSileroVAD(config) {
     }
     return worker;
   }
-
+  logger9.info("Creating SileroVADInference (main thread)");
   return new SileroVADInference(config);
 }
 var VADWorkerWithFallback = class {
@@ -3958,7 +4306,7 @@ var VADWorkerWithFallback = class {
     try {
       return await this.implementation.load();
     } catch (error) {
-
+      logger9.warn("Worker load failed, falling back to main thread", {
        error: error instanceof Error ? error.message : String(error)
      });
      try {
@@ -3967,7 +4315,7 @@ var VADWorkerWithFallback = class {
       }
       this.implementation = new SileroVADInference(this.config);
       this.hasFallenBack = true;
-
+      logger9.info("Fallback to SileroVADInference successful");
       return await this.implementation.load();
     }
   }
@@ -3989,7 +4337,7 @@ var VADWorkerWithFallback = class {
 };

 // src/inference/Emotion2VecInference.ts
-var
+var logger10 = createLogger("Emotion2Vec");
 var EMOTION2VEC_LABELS = ["neutral", "happy", "angry", "sad"];
 var Emotion2VecInference = class {
   constructor(config) {
@@ -4031,28 +4379,28 @@ var Emotion2VecInference = class {
       "model.backend_requested": this.config.backend
     });
     try {
-
+      logger10.info("Loading ONNX Runtime...", { preference: this.config.backend });
       const { ort, backend } = await getOnnxRuntimeForPreference(this.config.backend);
       this.ort = ort;
       this._backend = backend;
-
-
+      logger10.info("ONNX Runtime loaded", { backend: this._backend });
+      logger10.info("Checking model cache...");
       const cache = getModelCache();
       const modelUrl = this.config.modelUrl;
       const isCached = await cache.has(modelUrl);
-
+      logger10.info("Cache check complete", { modelUrl, isCached });
       let modelBuffer;
       if (isCached) {
-
+        logger10.info("Loading model from cache...", { modelUrl });
         modelBuffer = await cache.get(modelUrl);
-
+        logger10.info("Model loaded from cache", { size: formatBytes(modelBuffer.byteLength) });
       } else {
-
+        logger10.info("Fetching model (not cached)...", { modelUrl });
         modelBuffer = await fetchWithCache(modelUrl);
-
+        logger10.info("Model fetched and cached", { size: formatBytes(modelBuffer.byteLength) });
       }
-
-
+      logger10.info("Creating ONNX session (this may take a while for large models)...");
+      logger10.debug("Creating ONNX session", {
        size: formatBytes(modelBuffer.byteLength),
        backend: this._backend
      });
@@ -4060,7 +4408,7 @@ var Emotion2VecInference = class {
       const modelData = new Uint8Array(modelBuffer);
       this.session = await ort.InferenceSession.create(modelData, sessionOptions);
       const loadTimeMs = performance.now() - startTime;
-
+      logger10.info("Model loaded successfully", {
        backend: this._backend,
        loadTimeMs: Math.round(loadTimeMs),
        sampleRate: this.config.sampleRate,
@@ -4172,7 +4520,7 @@ var Emotion2VecInference = class {
       });
     }
     const inferenceTimeMs = performance.now() - startTime;
-
+    logger10.debug("Emotion inference completed", {
      numFrames,
      dominant: dominant.emotion,
      confidence: Math.round(dominant.confidence * 100),
@@ -4249,7 +4597,7 @@ var Emotion2VecInference = class {
 Emotion2VecInference.isWebGPUAvailable = isWebGPUAvailable;

 // src/inference/SafariSpeechRecognition.ts
-var
+var logger11 = createLogger("SafariSpeech");
 var SafariSpeechRecognition = class _SafariSpeechRecognition {
   constructor(config = {}) {
     this.recognition = null;
@@ -4268,7 +4616,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
       interimResults: config.interimResults ?? true,
       maxAlternatives: config.maxAlternatives ?? 1
     };
-
+    logger11.debug("SafariSpeechRecognition created", {
      language: this.config.language,
      continuous: this.config.continuous
    });
@@ -4329,7 +4677,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
    */
   async start() {
     if (this.isListening) {
-
+      logger11.warn("Already listening");
       return;
     }
     if (!_SafariSpeechRecognition.isAvailable()) {
@@ -4359,7 +4707,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
       this.isListening = true;
       this.startTime = performance.now();
       this.accumulatedText = "";
-
+      logger11.info("Speech recognition started", {
        language: this.config.language
      });
      span?.end();
@@ -4374,7 +4722,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
    */
   async stop() {
     if (!this.isListening || !this.recognition) {
-
+      logger11.warn("Not currently listening");
       return {
         text: this.accumulatedText,
         language: this.config.language,
@@ -4403,7 +4751,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
     if (this.recognition && this.isListening) {
       this.recognition.abort();
       this.isListening = false;
-
+      logger11.info("Speech recognition aborted");
     }
   }
   /**
@@ -4434,7 +4782,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
     this.isListening = false;
     this.resultCallbacks = [];
     this.errorCallbacks = [];
-
+    logger11.debug("SafariSpeechRecognition disposed");
   }
   /**
    * Set up event handlers for the recognition instance
@@ -4462,7 +4810,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
           confidence: alternative.confidence
         };
         this.emitResult(speechResult);
-
+        logger11.trace("Speech result", {
          text: text.substring(0, 50),
          isFinal,
          confidence: alternative.confidence
@@ -4472,12 +4820,12 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
         span?.end();
       } catch (error) {
         span?.endWithError(error instanceof Error ? error : new Error(String(error)));
-
+        logger11.error("Error processing speech result", { error });
       }
     };
     this.recognition.onerror = (event) => {
       const error = new Error(`Speech recognition error: ${event.error} - ${event.message}`);
-
+      logger11.error("Speech recognition error", { error: event.error, message: event.message });
       this.emitError(error);
       if (this.stopRejecter) {
         this.stopRejecter(error);
@@ -4487,7 +4835,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
     };
     this.recognition.onend = () => {
       this.isListening = false;
-
+      logger11.info("Speech recognition ended", {
        totalText: this.accumulatedText.length,
        durationMs: performance.now() - this.startTime
      });
@@ -4504,13 +4852,13 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
       }
     };
     this.recognition.onstart = () => {
-
+      logger11.debug("Speech recognition started by browser");
     };
     this.recognition.onspeechstart = () => {
-
+      logger11.debug("Speech detected");
     };
     this.recognition.onspeechend = () => {
-
+      logger11.debug("Speech ended");
     };
   }
   /**
@@ -4521,7 +4869,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
       try {
         callback(result);
       } catch (error) {
-
+        logger11.error("Error in result callback", { error });
       }
     }
   }
@@ -4533,7 +4881,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
       try {
         callback(error);
       } catch (callbackError) {
-
+        logger11.error("Error in error callback", { error: callbackError });
       }
     }
   }
@@ -5956,12 +6304,12 @@ async function isHuggingFaceCDNReachable(testUrl = HF_CDN_TEST_URL) {
 }

 // src/utils/transformersCacheClear.ts
-var
+var logger12 = createLogger("TransformersCache");
 async function clearTransformersCache(options) {
   const verbose = options?.verbose ?? true;
   const additionalPatterns = options?.additionalPatterns ?? [];
   if (!("caches" in window)) {
-
+    logger12.warn("Cache API not available in this environment");
     return [];
   }
   try {
@@ -5979,18 +6327,18 @@ async function clearTransformersCache(options) {
       );
       if (shouldDelete) {
         if (verbose) {
-
+          logger12.info("Deleting cache", { cacheName });
         }
         const deleted = await caches.delete(cacheName);
         if (deleted) {
           deletedCaches.push(cacheName);
         } else if (verbose) {
-
+          logger12.warn("Failed to delete cache", { cacheName });
         }
       }
     }
     if (verbose) {
-
+      logger12.info("Cache clearing complete", {
        totalCaches: cacheNames.length,
        deletedCount: deletedCaches.length,
        deletedCaches
@@ -5998,35 +6346,35 @@ async function clearTransformersCache(options) {
     }
     return deletedCaches;
   } catch (error) {
-
+    logger12.error("Error clearing caches", { error });
     throw error;
   }
 }
 async function clearSpecificCache(cacheName) {
   if (!("caches" in window)) {
-
+    logger12.warn("Cache API not available in this environment");
     return false;
   }
   try {
     const deleted = await caches.delete(cacheName);
-
+    logger12.info("Cache deletion attempt", { cacheName, deleted });
     return deleted;
   } catch (error) {
-
+    logger12.error("Error deleting cache", { cacheName, error });
     return false;
   }
 }
 async function listCaches() {
   if (!("caches" in window)) {
-
+    logger12.warn("Cache API not available in this environment");
     return [];
   }
   try {
     const cacheNames = await caches.keys();
-
+    logger12.debug("Available caches", { cacheNames });
     return cacheNames;
   } catch (error) {
-
+    logger12.error("Error listing caches", { error });
     return [];
   }
 }
@@ -6068,7 +6416,7 @@ async function validateCachedResponse(cacheName, requestUrl) {
       reason: valid ? "Valid response" : `Invalid: status=${response.status}, contentType=${contentType}, isHtml=${isHtml || looksLikeHtml}`
     };
   } catch (error) {
-
+    logger12.error("Error validating cached response", { cacheName, requestUrl, error });
     return {
       exists: false,
       valid: false,
@@ -6105,7 +6453,7 @@ async function scanForInvalidCaches() {
         }
       }
     }
-
+    logger12.info("Cache scan complete", {
      totalCaches: cacheNames.length,
      scannedEntries,
      invalidCount: invalidEntries.length
@@ -6116,13 +6464,13 @@ async function scanForInvalidCaches() {
       invalidEntries
     };
   } catch (error) {
-
+    logger12.error("Error scanning caches", { error });
     throw error;
   }
 }
 async function nukeBrowserCaches(preventRecreation = false) {
   if (!("caches" in window)) {
-
+    logger12.warn("Cache API not available in this environment");
     return 0;
   }
   try {
@@ -6134,17 +6482,17 @@ async function nukeBrowserCaches(preventRecreation = false) {
       deletedCount++;
       }
     }
-
+    logger12.info("All browser caches cleared", {
      totalDeleted: deletedCount
    });
    if (preventRecreation) {
      const { env: env2 } = await import("./transformers.web-ALDLCPHT.mjs");
      env2.useBrowserCache = false;
-
+      logger12.warn("Browser cache creation disabled (env.useBrowserCache = false)");
    }
    return deletedCount;
  } catch (error) {
-
+    logger12.error("Error nuking caches", { error });
    throw error;
  }
 }
@@ -6670,6 +7018,7 @@ var EmphasisDetector = class {
   }
 };
 export {
+  ARKIT_BLENDSHAPES,
   AgentCoreAdapter,
   AnimationGraph,
   AudioChunkCoalescer,
@@ -6705,6 +7054,8 @@ export {
   SileroVADWorker,
   SyncedAudioPipeline,
   TenantManager,
+  WAV2ARKIT_BLENDSHAPES,
+  Wav2ArkitCpuInference,
   Wav2Vec2Inference,
   WhisperInference,
   blendEmotions,
@@ -6716,6 +7067,7 @@ export {
   configureLogging,
   configureTelemetry,
   createEmotionVector,
+  createLipSync,
   createLogger,
   createSessionWithFallback,
   createSileroVAD,
@@ -6740,6 +7092,7 @@ export {
   isIOSSafari,
   isMobile,
   isOnnxRuntimeLoaded,
+  isSafari,
   isSpeechRecognitionAvailable,
   isWebGPUAvailable,
   lerpEmotion,
@@ -6748,15 +7101,18 @@ export {
   nukeBrowserCaches,
   parseHuggingFaceUrl,
   preloadModels,
+  remapWav2ArkitToLam,
   resetLoggingConfig,
   resolveBackend,
   scanForInvalidCaches,
   setLogLevel,
   setLoggingEnabled,
   shouldEnableWasmProxy,
+  shouldUseCpuLipSync,
   shouldUseNativeASR,
   shouldUseServerLipSync,
   supportsVADWorker,
+  symmetrizeBlendshapes,
   validateCachedResponse
 };
 //# sourceMappingURL=index.mjs.map
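Taken together, the export hunks above add the new lip-sync surface to the bundle's public API. Assuming the package entry point matches this dist file, the additions are importable directly:

  import {
    createLipSync,
    Wav2ArkitCpuInference,
    isSafari,
    shouldUseCpuLipSync,
    WAV2ARKIT_BLENDSHAPES,
    remapWav2ArkitToLam,
    symmetrizeBlendshapes,
    ARKIT_BLENDSHAPES
  } from "@omote/core";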