@omote/core 0.5.7 → 0.6.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -13
- package/dist/index.d.mts +813 -86
- package/dist/index.d.ts +813 -86
- package/dist/index.js +1653 -563
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +1648 -558
- package/dist/index.mjs.map +1 -1
- package/package.json +3 -2
package/dist/index.mjs
CHANGED
|
@@ -762,6 +762,24 @@ var A2EProcessor = class {
|
|
|
762
762
|
}
|
|
763
763
|
};
|
|
764
764
|
|
|
765
|
+
// src/audio/audioUtils.ts
|
|
766
|
+
function pcm16ToFloat32(buffer) {
|
|
767
|
+
const byteLen = buffer.byteLength & ~1;
|
|
768
|
+
const int16 = byteLen === buffer.byteLength ? new Int16Array(buffer) : new Int16Array(buffer, 0, byteLen / 2);
|
|
769
|
+
const float32 = new Float32Array(int16.length);
|
|
770
|
+
for (let i = 0; i < int16.length; i++) {
|
|
771
|
+
float32[i] = int16[i] / 32768;
|
|
772
|
+
}
|
|
773
|
+
return float32;
|
|
774
|
+
}
|
|
775
|
+
function int16ToFloat32(int16) {
|
|
776
|
+
const float32 = new Float32Array(int16.length);
|
|
777
|
+
for (let i = 0; i < int16.length; i++) {
|
|
778
|
+
float32[i] = int16[i] / 32768;
|
|
779
|
+
}
|
|
780
|
+
return float32;
|
|
781
|
+
}
|
|
782
|
+
|
|
765
783
|
// src/telemetry/exporters/console.ts
|
|
766
784
|
var ConsoleExporter = class {
|
|
767
785
|
constructor(options = {}) {
|
|
@@ -2534,7 +2552,7 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
2534
2552
|
} else {
|
|
2535
2553
|
logger3.info("Fetching external model data", {
|
|
2536
2554
|
dataUrl,
|
|
2537
|
-
note: "This may be a large download
|
|
2555
|
+
note: "This may be a large download"
|
|
2538
2556
|
});
|
|
2539
2557
|
externalDataBuffer = await fetchWithCache(dataUrl);
|
|
2540
2558
|
}
|
|
@@ -2542,6 +2560,9 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
2542
2560
|
size: formatBytes(externalDataBuffer.byteLength)
|
|
2543
2561
|
});
|
|
2544
2562
|
} catch (err) {
|
|
2563
|
+
if (typeof this.config.externalDataUrl === "string") {
|
|
2564
|
+
throw new Error(`Failed to fetch external data: ${dataUrl} \u2014 ${err.message}`);
|
|
2565
|
+
}
|
|
2545
2566
|
logger3.debug("No external data file found (single-file model)", {
|
|
2546
2567
|
dataUrl,
|
|
2547
2568
|
error: err.message
|
|
@@ -2665,28 +2686,6 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
2665
2686
|
};
|
|
2666
2687
|
return this.queueInference(feeds);
|
|
2667
2688
|
}
|
|
2668
|
-
/**
|
|
2669
|
-
* Decode CTC logits to text using greedy decoding
|
|
2670
|
-
*/
|
|
2671
|
-
decodeCTC(logits) {
|
|
2672
|
-
const tokens = [];
|
|
2673
|
-
let prevToken = -1;
|
|
2674
|
-
for (const frame of logits) {
|
|
2675
|
-
let maxIdx = 0;
|
|
2676
|
-
let maxVal = frame[0];
|
|
2677
|
-
for (let i = 1; i < frame.length; i++) {
|
|
2678
|
-
if (frame[i] > maxVal) {
|
|
2679
|
-
maxVal = frame[i];
|
|
2680
|
-
maxIdx = i;
|
|
2681
|
-
}
|
|
2682
|
-
}
|
|
2683
|
-
if (maxIdx !== prevToken && maxIdx !== 0) {
|
|
2684
|
-
tokens.push(maxIdx);
|
|
2685
|
-
}
|
|
2686
|
-
prevToken = maxIdx;
|
|
2687
|
-
}
|
|
2688
|
-
return tokens.map((t) => CTC_VOCAB[t] === "|" ? " " : CTC_VOCAB[t]).join("");
|
|
2689
|
-
}
|
|
2690
2689
|
/**
|
|
2691
2690
|
* Queue inference to serialize ONNX session calls
|
|
2692
2691
|
*/
|
|
@@ -2714,37 +2713,25 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
2714
2713
|
})
|
|
2715
2714
|
]);
|
|
2716
2715
|
const inferenceTimeMs = performance.now() - startTime;
|
|
2717
|
-
const asrOutput = results["asr_logits"];
|
|
2718
2716
|
const blendshapeOutput = results["blendshapes"];
|
|
2719
|
-
if (!
|
|
2720
|
-
throw new Error("Missing
|
|
2717
|
+
if (!blendshapeOutput) {
|
|
2718
|
+
throw new Error("Missing blendshapes output from model");
|
|
2721
2719
|
}
|
|
2722
|
-
const asrData = asrOutput.data;
|
|
2723
2720
|
const blendshapeData = blendshapeOutput.data;
|
|
2724
|
-
const numASRFrames = asrOutput.dims[1];
|
|
2725
2721
|
const numA2EFrames = blendshapeOutput.dims[1];
|
|
2726
|
-
const asrVocabSize = asrOutput.dims[2];
|
|
2727
2722
|
const numBlendshapes = blendshapeOutput.dims[2];
|
|
2728
|
-
const asrLogits = [];
|
|
2729
2723
|
const blendshapes = [];
|
|
2730
|
-
for (let f = 0; f < numASRFrames; f++) {
|
|
2731
|
-
asrLogits.push(asrData.slice(f * asrVocabSize, (f + 1) * asrVocabSize));
|
|
2732
|
-
}
|
|
2733
2724
|
for (let f = 0; f < numA2EFrames; f++) {
|
|
2734
2725
|
const rawFrame = blendshapeData.slice(f * numBlendshapes, (f + 1) * numBlendshapes);
|
|
2735
2726
|
blendshapes.push(symmetrizeBlendshapes(rawFrame));
|
|
2736
2727
|
}
|
|
2737
|
-
const text = this.decodeCTC(asrLogits);
|
|
2738
2728
|
logger3.trace("Inference completed", {
|
|
2739
2729
|
inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
|
|
2740
|
-
numA2EFrames
|
|
2741
|
-
numASRFrames,
|
|
2742
|
-
textLength: text.length
|
|
2730
|
+
numA2EFrames
|
|
2743
2731
|
});
|
|
2744
2732
|
span?.setAttributes({
|
|
2745
2733
|
"inference.duration_ms": inferenceTimeMs,
|
|
2746
|
-
"inference.a2e_frames": numA2EFrames
|
|
2747
|
-
"inference.asr_frames": numASRFrames
|
|
2734
|
+
"inference.a2e_frames": numA2EFrames
|
|
2748
2735
|
});
|
|
2749
2736
|
span?.end();
|
|
2750
2737
|
telemetry?.recordHistogram("omote.inference.latency", inferenceTimeMs, {
|
|
@@ -2758,11 +2745,7 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
2758
2745
|
});
|
|
2759
2746
|
resolve({
|
|
2760
2747
|
blendshapes,
|
|
2761
|
-
asrLogits,
|
|
2762
|
-
text,
|
|
2763
2748
|
numFrames: numA2EFrames,
|
|
2764
|
-
numA2EFrames,
|
|
2765
|
-
numASRFrames,
|
|
2766
2749
|
inferenceTimeMs
|
|
2767
2750
|
});
|
|
2768
2751
|
} catch (err) {
|
|
@@ -2815,19 +2798,7 @@ _Wav2Vec2Inference.INFERENCE_TIMEOUT_MS = 5e3;
|
|
|
2815
2798
|
_Wav2Vec2Inference.isWebGPUAvailable = isWebGPUAvailable;
|
|
2816
2799
|
var Wav2Vec2Inference = _Wav2Vec2Inference;
|
|
2817
2800
|
|
|
2818
|
-
// src/audio/
|
|
2819
|
-
function pcm16ToFloat32(buffer) {
|
|
2820
|
-
const byteLen = buffer.byteLength & ~1;
|
|
2821
|
-
const int16 = byteLen === buffer.byteLength ? new Int16Array(buffer) : new Int16Array(buffer, 0, byteLen / 2);
|
|
2822
|
-
const float32 = new Float32Array(int16.length);
|
|
2823
|
-
for (let i = 0; i < int16.length; i++) {
|
|
2824
|
-
float32[i] = int16[i] / 32768;
|
|
2825
|
-
}
|
|
2826
|
-
return float32;
|
|
2827
|
-
}
|
|
2828
|
-
|
|
2829
|
-
// src/audio/FullFacePipeline.ts
|
|
2830
|
-
var logger4 = createLogger("FullFacePipeline");
|
|
2801
|
+
// src/audio/expressionProfile.ts
|
|
2831
2802
|
var BLENDSHAPE_TO_GROUP = /* @__PURE__ */ new Map();
|
|
2832
2803
|
for (const name of LAM_BLENDSHAPES) {
|
|
2833
2804
|
if (name.startsWith("eye")) {
|
|
@@ -2846,6 +2817,24 @@ for (const name of LAM_BLENDSHAPES) {
|
|
|
2846
2817
|
BLENDSHAPE_TO_GROUP.set(name, "tongue");
|
|
2847
2818
|
}
|
|
2848
2819
|
}
|
|
2820
|
+
function applyProfile(raw, profile) {
|
|
2821
|
+
const scaled = new Float32Array(52);
|
|
2822
|
+
for (let i = 0; i < 52; i++) {
|
|
2823
|
+
const name = LAM_BLENDSHAPES[i];
|
|
2824
|
+
let scaler;
|
|
2825
|
+
if (profile.overrides && profile.overrides[name] !== void 0) {
|
|
2826
|
+
scaler = profile.overrides[name];
|
|
2827
|
+
} else {
|
|
2828
|
+
const group = BLENDSHAPE_TO_GROUP.get(name);
|
|
2829
|
+
scaler = group ? profile[group] ?? 1 : 1;
|
|
2830
|
+
}
|
|
2831
|
+
scaled[i] = Math.min(1, Math.max(0, raw[i] * scaler));
|
|
2832
|
+
}
|
|
2833
|
+
return scaled;
|
|
2834
|
+
}
|
|
2835
|
+
|
|
2836
|
+
// src/audio/FullFacePipeline.ts
|
|
2837
|
+
var logger4 = createLogger("FullFacePipeline");
|
|
2849
2838
|
var FullFacePipeline = class extends EventEmitter {
|
|
2850
2839
|
constructor(options) {
|
|
2851
2840
|
super();
|
|
@@ -2910,25 +2899,10 @@ var FullFacePipeline = class extends EventEmitter {
|
|
|
2910
2899
|
/**
|
|
2911
2900
|
* Apply ExpressionProfile scaling to raw A2E blendshapes.
|
|
2912
2901
|
*
|
|
2913
|
-
*
|
|
2914
|
-
* 1. If an override exists for the blendshape name, use override as scaler
|
|
2915
|
-
* 2. Otherwise, use the group scaler (default 1.0)
|
|
2916
|
-
* 3. Clamp result to [0, 1]
|
|
2902
|
+
* Delegates to the standalone applyProfile() utility from expressionProfile.ts.
|
|
2917
2903
|
*/
|
|
2918
2904
|
applyProfile(raw) {
|
|
2919
|
-
|
|
2920
|
-
for (let i = 0; i < 52; i++) {
|
|
2921
|
-
const name = LAM_BLENDSHAPES[i];
|
|
2922
|
-
let scaler;
|
|
2923
|
-
if (this.profile.overrides && this.profile.overrides[name] !== void 0) {
|
|
2924
|
-
scaler = this.profile.overrides[name];
|
|
2925
|
-
} else {
|
|
2926
|
-
const group = BLENDSHAPE_TO_GROUP.get(name);
|
|
2927
|
-
scaler = group ? this.profile[group] ?? 1 : 1;
|
|
2928
|
-
}
|
|
2929
|
-
scaled[i] = Math.min(1, Math.max(0, raw[i] * scaler));
|
|
2930
|
-
}
|
|
2931
|
-
return scaled;
|
|
2905
|
+
return applyProfile(raw, this.profile);
|
|
2932
2906
|
}
|
|
2933
2907
|
/**
|
|
2934
2908
|
* Start a new playback session
|
|
@@ -3113,6 +3087,329 @@ var FullFacePipeline = class extends EventEmitter {
|
|
|
3113
3087
|
}
|
|
3114
3088
|
};
|
|
3115
3089
|
|
|
3090
|
+
// src/audio/PlaybackPipeline.ts
|
|
3091
|
+
var logger5 = createLogger("PlaybackPipeline");
|
|
3092
|
+
var PlaybackPipeline = class extends EventEmitter {
|
|
3093
|
+
constructor(config) {
|
|
3094
|
+
super();
|
|
3095
|
+
this.config = config;
|
|
3096
|
+
this._state = "idle";
|
|
3097
|
+
this.playbackStarted = false;
|
|
3098
|
+
this.monitorInterval = null;
|
|
3099
|
+
this.frameAnimationId = null;
|
|
3100
|
+
// Stale frame detection
|
|
3101
|
+
this.lastNewFrameTime = 0;
|
|
3102
|
+
this.lastKnownLamFrame = null;
|
|
3103
|
+
this.staleWarningEmitted = false;
|
|
3104
|
+
// Diagnostic counter
|
|
3105
|
+
this.frameLoopCount = 0;
|
|
3106
|
+
this.neutralTransitionFrame = null;
|
|
3107
|
+
this.neutralTransitionStart = 0;
|
|
3108
|
+
this.neutralAnimationId = null;
|
|
3109
|
+
// Current frame refs
|
|
3110
|
+
this._currentFrame = null;
|
|
3111
|
+
this._currentRawFrame = null;
|
|
3112
|
+
this.sampleRate = config.sampleRate ?? 16e3;
|
|
3113
|
+
this.profile = config.profile ?? {};
|
|
3114
|
+
this.staleThresholdMs = config.staleThresholdMs ?? 2e3;
|
|
3115
|
+
this.neutralTransitionEnabled = config.neutralTransitionEnabled ?? false;
|
|
3116
|
+
this.neutralTransitionMs = config.neutralTransitionMs ?? 250;
|
|
3117
|
+
const isCpuModel = config.lam.modelId === "wav2arkit_cpu";
|
|
3118
|
+
const chunkSize = config.chunkSize ?? config.lam.chunkSize ?? 16e3;
|
|
3119
|
+
const chunkAccumulationMs = chunkSize / this.sampleRate * 1e3;
|
|
3120
|
+
const inferenceEstimateMs = isCpuModel ? 300 : config.lam.backend === "wasm" ? 250 : 80;
|
|
3121
|
+
const marginMs = 100;
|
|
3122
|
+
const autoDelay = Math.ceil(chunkAccumulationMs + inferenceEstimateMs + marginMs);
|
|
3123
|
+
const audioDelayMs = config.audioDelayMs ?? autoDelay;
|
|
3124
|
+
logger5.info("PlaybackPipeline config", {
|
|
3125
|
+
chunkSize,
|
|
3126
|
+
audioDelayMs,
|
|
3127
|
+
autoDelay,
|
|
3128
|
+
backend: config.lam.backend,
|
|
3129
|
+
modelId: config.lam.modelId,
|
|
3130
|
+
neutralTransitionEnabled: this.neutralTransitionEnabled
|
|
3131
|
+
});
|
|
3132
|
+
this.scheduler = new AudioScheduler({
|
|
3133
|
+
sampleRate: this.sampleRate,
|
|
3134
|
+
initialLookaheadSec: audioDelayMs / 1e3
|
|
3135
|
+
});
|
|
3136
|
+
this.coalescer = new AudioChunkCoalescer({
|
|
3137
|
+
sampleRate: this.sampleRate,
|
|
3138
|
+
targetDurationMs: config.chunkTargetMs ?? 200
|
|
3139
|
+
});
|
|
3140
|
+
this.processor = new A2EProcessor({
|
|
3141
|
+
backend: config.lam,
|
|
3142
|
+
sampleRate: this.sampleRate,
|
|
3143
|
+
chunkSize,
|
|
3144
|
+
identityIndex: config.identityIndex,
|
|
3145
|
+
onError: (error) => {
|
|
3146
|
+
logger5.error("A2E inference error", { message: error.message, stack: error.stack });
|
|
3147
|
+
this.emit("error", error);
|
|
3148
|
+
}
|
|
3149
|
+
});
|
|
3150
|
+
}
|
|
3151
|
+
/** Current pipeline state */
|
|
3152
|
+
get state() {
|
|
3153
|
+
return this._state;
|
|
3154
|
+
}
|
|
3155
|
+
/** Current scaled blendshapes (updated in-place for perf) */
|
|
3156
|
+
get currentFrame() {
|
|
3157
|
+
return this._currentFrame;
|
|
3158
|
+
}
|
|
3159
|
+
/** Raw A2E blendshapes (before profile scaling) */
|
|
3160
|
+
get currentRawFrame() {
|
|
3161
|
+
return this._currentRawFrame;
|
|
3162
|
+
}
|
|
3163
|
+
// ---------------------------------------------------------------------------
|
|
3164
|
+
// Lifecycle
|
|
3165
|
+
// ---------------------------------------------------------------------------
|
|
3166
|
+
/** Initialize AudioContext (lazy, call after user gesture) */
|
|
3167
|
+
async initialize() {
|
|
3168
|
+
await this.scheduler.initialize();
|
|
3169
|
+
}
|
|
3170
|
+
/** Update ExpressionProfile at runtime */
|
|
3171
|
+
setProfile(profile) {
|
|
3172
|
+
this.profile = profile;
|
|
3173
|
+
}
|
|
3174
|
+
// ---------------------------------------------------------------------------
|
|
3175
|
+
// Async mode (streaming TTS)
|
|
3176
|
+
// ---------------------------------------------------------------------------
|
|
3177
|
+
/**
|
|
3178
|
+
* Start a new playback session.
|
|
3179
|
+
* Idempotent — calling during playback resets cleanly without emitting
|
|
3180
|
+
* spurious playback:complete.
|
|
3181
|
+
*/
|
|
3182
|
+
start() {
|
|
3183
|
+
this.stopInternal(false);
|
|
3184
|
+
this.scheduler.reset();
|
|
3185
|
+
this.coalescer.reset();
|
|
3186
|
+
this.processor.reset();
|
|
3187
|
+
this.playbackStarted = false;
|
|
3188
|
+
this.lastNewFrameTime = 0;
|
|
3189
|
+
this.lastKnownLamFrame = null;
|
|
3190
|
+
this.staleWarningEmitted = false;
|
|
3191
|
+
this.frameLoopCount = 0;
|
|
3192
|
+
this._currentFrame = null;
|
|
3193
|
+
this._currentRawFrame = null;
|
|
3194
|
+
this.cancelNeutralTransition();
|
|
3195
|
+
this.scheduler.warmup();
|
|
3196
|
+
this.startFrameLoop();
|
|
3197
|
+
this.startMonitoring();
|
|
3198
|
+
this.setState("playing");
|
|
3199
|
+
}
|
|
3200
|
+
/** Feed a streaming audio chunk (PCM16 Uint8Array) */
|
|
3201
|
+
async onAudioChunk(chunk) {
|
|
3202
|
+
const combined = this.coalescer.add(chunk);
|
|
3203
|
+
if (!combined) return;
|
|
3204
|
+
const float32 = pcm16ToFloat32(combined);
|
|
3205
|
+
const scheduleTime = await this.scheduler.schedule(float32);
|
|
3206
|
+
if (!this.playbackStarted) {
|
|
3207
|
+
this.playbackStarted = true;
|
|
3208
|
+
this.emit("playback:start", { time: scheduleTime });
|
|
3209
|
+
this.emit("playback_start", scheduleTime);
|
|
3210
|
+
}
|
|
3211
|
+
this.processor.pushAudio(float32, scheduleTime);
|
|
3212
|
+
}
|
|
3213
|
+
/** Signal end of audio stream (flushes remaining audio) */
|
|
3214
|
+
async end() {
|
|
3215
|
+
const remaining = this.coalescer.flush();
|
|
3216
|
+
if (remaining) {
|
|
3217
|
+
const chunk = new Uint8Array(remaining);
|
|
3218
|
+
await this.onAudioChunk(chunk);
|
|
3219
|
+
}
|
|
3220
|
+
await this.processor.flush();
|
|
3221
|
+
}
|
|
3222
|
+
// ---------------------------------------------------------------------------
|
|
3223
|
+
// Sync mode (full buffer)
|
|
3224
|
+
// ---------------------------------------------------------------------------
|
|
3225
|
+
/**
|
|
3226
|
+
* Feed a complete audio buffer. Chunks into 200ms pieces, schedules each
|
|
3227
|
+
* for playback, runs A2E inference, then waits for completion.
|
|
3228
|
+
*/
|
|
3229
|
+
async feedBuffer(audio) {
|
|
3230
|
+
const float32 = audio instanceof Float32Array ? audio : pcm16ToFloat32(audio);
|
|
3231
|
+
this.start();
|
|
3232
|
+
const chunkSamples = Math.floor(this.sampleRate * 0.2);
|
|
3233
|
+
for (let i = 0; i < float32.length; i += chunkSamples) {
|
|
3234
|
+
const chunk = float32.subarray(i, Math.min(i + chunkSamples, float32.length));
|
|
3235
|
+
const scheduleTime = await this.scheduler.schedule(chunk);
|
|
3236
|
+
this.processor.pushAudio(chunk, scheduleTime);
|
|
3237
|
+
if (!this.playbackStarted) {
|
|
3238
|
+
this.playbackStarted = true;
|
|
3239
|
+
this.emit("playback:start", { time: scheduleTime });
|
|
3240
|
+
this.emit("playback_start", scheduleTime);
|
|
3241
|
+
}
|
|
3242
|
+
}
|
|
3243
|
+
await this.processor.flush();
|
|
3244
|
+
return new Promise((resolve) => {
|
|
3245
|
+
const unsub = this.on("playback:complete", () => {
|
|
3246
|
+
unsub();
|
|
3247
|
+
resolve();
|
|
3248
|
+
});
|
|
3249
|
+
});
|
|
3250
|
+
}
|
|
3251
|
+
// ---------------------------------------------------------------------------
|
|
3252
|
+
// Control
|
|
3253
|
+
// ---------------------------------------------------------------------------
|
|
3254
|
+
/** Stop playback immediately with fade-out */
|
|
3255
|
+
async stop(fadeOutMs = 50) {
|
|
3256
|
+
this.setState("stopping");
|
|
3257
|
+
this.stopInternal(true);
|
|
3258
|
+
await this.scheduler.cancelAll(fadeOutMs);
|
|
3259
|
+
this.coalescer.reset();
|
|
3260
|
+
this.processor.reset();
|
|
3261
|
+
this.playbackStarted = false;
|
|
3262
|
+
this._currentFrame = null;
|
|
3263
|
+
this._currentRawFrame = null;
|
|
3264
|
+
this.emit("playback:stop", void 0);
|
|
3265
|
+
this.setState("idle");
|
|
3266
|
+
}
|
|
3267
|
+
/** Cleanup all resources */
|
|
3268
|
+
dispose() {
|
|
3269
|
+
this.stopInternal(true);
|
|
3270
|
+
this.cancelNeutralTransition();
|
|
3271
|
+
this.scheduler.dispose();
|
|
3272
|
+
this.coalescer.reset();
|
|
3273
|
+
this.processor.dispose();
|
|
3274
|
+
this._state = "idle";
|
|
3275
|
+
}
|
|
3276
|
+
/** Get pipeline debug state */
|
|
3277
|
+
getDebugState() {
|
|
3278
|
+
return {
|
|
3279
|
+
state: this._state,
|
|
3280
|
+
playbackStarted: this.playbackStarted,
|
|
3281
|
+
coalescerFill: this.coalescer.fillLevel,
|
|
3282
|
+
processorFill: this.processor.fillLevel,
|
|
3283
|
+
queuedFrames: this.processor.queuedFrameCount,
|
|
3284
|
+
currentTime: this.scheduler.getCurrentTime(),
|
|
3285
|
+
playbackEndTime: this.scheduler.getPlaybackEndTime()
|
|
3286
|
+
};
|
|
3287
|
+
}
|
|
3288
|
+
// ---------------------------------------------------------------------------
|
|
3289
|
+
// Internal: Frame loop
|
|
3290
|
+
// ---------------------------------------------------------------------------
|
|
3291
|
+
startFrameLoop() {
|
|
3292
|
+
const updateFrame = () => {
|
|
3293
|
+
this.frameLoopCount++;
|
|
3294
|
+
const currentTime = this.scheduler.getCurrentTime();
|
|
3295
|
+
const lamFrame = this.processor.getFrameForTime(currentTime);
|
|
3296
|
+
if (lamFrame && lamFrame !== this.lastKnownLamFrame) {
|
|
3297
|
+
this.lastNewFrameTime = performance.now();
|
|
3298
|
+
this.lastKnownLamFrame = lamFrame;
|
|
3299
|
+
this.staleWarningEmitted = false;
|
|
3300
|
+
}
|
|
3301
|
+
if (this.playbackStarted && this.lastNewFrameTime > 0 && performance.now() - this.lastNewFrameTime > this.staleThresholdMs) {
|
|
3302
|
+
if (!this.staleWarningEmitted) {
|
|
3303
|
+
this.staleWarningEmitted = true;
|
|
3304
|
+
logger5.warn("A2E stalled \u2014 no new inference frames", {
|
|
3305
|
+
staleDurationMs: Math.round(performance.now() - this.lastNewFrameTime),
|
|
3306
|
+
queuedFrames: this.processor.queuedFrameCount
|
|
3307
|
+
});
|
|
3308
|
+
}
|
|
3309
|
+
}
|
|
3310
|
+
if (lamFrame) {
|
|
3311
|
+
const scaled = applyProfile(lamFrame, this.profile);
|
|
3312
|
+
this._currentFrame = scaled;
|
|
3313
|
+
this._currentRawFrame = lamFrame;
|
|
3314
|
+
const fullFrame = {
|
|
3315
|
+
blendshapes: scaled,
|
|
3316
|
+
rawBlendshapes: lamFrame,
|
|
3317
|
+
timestamp: currentTime
|
|
3318
|
+
};
|
|
3319
|
+
this.emit("frame", fullFrame);
|
|
3320
|
+
this.emit("frame:raw", lamFrame);
|
|
3321
|
+
this.emit("full_frame_ready", fullFrame);
|
|
3322
|
+
this.emit("lam_frame_ready", lamFrame);
|
|
3323
|
+
}
|
|
3324
|
+
this.frameAnimationId = requestAnimationFrame(updateFrame);
|
|
3325
|
+
};
|
|
3326
|
+
this.frameAnimationId = requestAnimationFrame(updateFrame);
|
|
3327
|
+
}
|
|
3328
|
+
// ---------------------------------------------------------------------------
|
|
3329
|
+
// Internal: Playback monitoring
|
|
3330
|
+
// ---------------------------------------------------------------------------
|
|
3331
|
+
startMonitoring() {
|
|
3332
|
+
if (this.monitorInterval) {
|
|
3333
|
+
clearInterval(this.monitorInterval);
|
|
3334
|
+
}
|
|
3335
|
+
this.monitorInterval = setInterval(() => {
|
|
3336
|
+
if (this.scheduler.isComplete() && this.processor.queuedFrameCount === 0) {
|
|
3337
|
+
this.onPlaybackComplete();
|
|
3338
|
+
}
|
|
3339
|
+
}, 100);
|
|
3340
|
+
}
|
|
3341
|
+
onPlaybackComplete() {
|
|
3342
|
+
this.stopInternal(false);
|
|
3343
|
+
this.playbackStarted = false;
|
|
3344
|
+
this.emit("playback:complete", void 0);
|
|
3345
|
+
this.emit("playback_complete", void 0);
|
|
3346
|
+
if (this.neutralTransitionEnabled && this._currentFrame) {
|
|
3347
|
+
this.startNeutralTransition(this._currentFrame);
|
|
3348
|
+
} else {
|
|
3349
|
+
this.setState("idle");
|
|
3350
|
+
}
|
|
3351
|
+
}
|
|
3352
|
+
// ---------------------------------------------------------------------------
|
|
3353
|
+
// Internal: Neutral transition (opt-in)
|
|
3354
|
+
// ---------------------------------------------------------------------------
|
|
3355
|
+
startNeutralTransition(fromFrame) {
|
|
3356
|
+
this.neutralTransitionFrame = new Float32Array(fromFrame);
|
|
3357
|
+
this.neutralTransitionStart = performance.now();
|
|
3358
|
+
const animate = () => {
|
|
3359
|
+
const elapsed = performance.now() - this.neutralTransitionStart;
|
|
3360
|
+
const t = Math.min(1, elapsed / this.neutralTransitionMs);
|
|
3361
|
+
const eased = 1 - Math.pow(1 - t, 3);
|
|
3362
|
+
const blendshapes = new Float32Array(52);
|
|
3363
|
+
for (let i = 0; i < 52; i++) {
|
|
3364
|
+
blendshapes[i] = this.neutralTransitionFrame[i] * (1 - eased);
|
|
3365
|
+
}
|
|
3366
|
+
this._currentFrame = blendshapes;
|
|
3367
|
+
const frame = {
|
|
3368
|
+
blendshapes,
|
|
3369
|
+
rawBlendshapes: blendshapes,
|
|
3370
|
+
// raw = scaled during transition
|
|
3371
|
+
timestamp: performance.now() / 1e3
|
|
3372
|
+
};
|
|
3373
|
+
this.emit("frame", frame);
|
|
3374
|
+
this.emit("full_frame_ready", frame);
|
|
3375
|
+
if (t >= 1) {
|
|
3376
|
+
this.neutralTransitionFrame = null;
|
|
3377
|
+
this._currentFrame = null;
|
|
3378
|
+
this._currentRawFrame = null;
|
|
3379
|
+
this.setState("idle");
|
|
3380
|
+
return;
|
|
3381
|
+
}
|
|
3382
|
+
this.neutralAnimationId = requestAnimationFrame(animate);
|
|
3383
|
+
};
|
|
3384
|
+
this.neutralAnimationId = requestAnimationFrame(animate);
|
|
3385
|
+
}
|
|
3386
|
+
cancelNeutralTransition() {
|
|
3387
|
+
if (this.neutralAnimationId) {
|
|
3388
|
+
cancelAnimationFrame(this.neutralAnimationId);
|
|
3389
|
+
this.neutralAnimationId = null;
|
|
3390
|
+
}
|
|
3391
|
+
this.neutralTransitionFrame = null;
|
|
3392
|
+
}
|
|
3393
|
+
// ---------------------------------------------------------------------------
|
|
3394
|
+
// Internal: Helpers
|
|
3395
|
+
// ---------------------------------------------------------------------------
|
|
3396
|
+
stopInternal(emitEvents) {
|
|
3397
|
+
if (this.monitorInterval) {
|
|
3398
|
+
clearInterval(this.monitorInterval);
|
|
3399
|
+
this.monitorInterval = null;
|
|
3400
|
+
}
|
|
3401
|
+
if (this.frameAnimationId) {
|
|
3402
|
+
cancelAnimationFrame(this.frameAnimationId);
|
|
3403
|
+
this.frameAnimationId = null;
|
|
3404
|
+
}
|
|
3405
|
+
}
|
|
3406
|
+
setState(state) {
|
|
3407
|
+
if (this._state === state) return;
|
|
3408
|
+
this._state = state;
|
|
3409
|
+
this.emit("state", state);
|
|
3410
|
+
}
|
|
3411
|
+
};
|
|
3412
|
+
|
|
3116
3413
|
// src/audio/InterruptionHandler.ts
|
|
3117
3414
|
var InterruptionHandler = class extends EventEmitter {
|
|
3118
3415
|
constructor(config = {}) {
|
|
@@ -3500,7 +3797,7 @@ function ctcGreedyDecode(logits, seqLen, vocabSize, tokenMap) {
|
|
|
3500
3797
|
}
|
|
3501
3798
|
|
|
3502
3799
|
// src/inference/SenseVoiceInference.ts
|
|
3503
|
-
var
|
|
3800
|
+
var logger6 = createLogger("SenseVoice");
|
|
3504
3801
|
var _SenseVoiceInference = class _SenseVoiceInference {
|
|
3505
3802
|
constructor(config) {
|
|
3506
3803
|
this.session = null;
|
|
@@ -3553,26 +3850,26 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
3553
3850
|
"model.backend_requested": this.config.backend
|
|
3554
3851
|
});
|
|
3555
3852
|
try {
|
|
3556
|
-
|
|
3853
|
+
logger6.info("Loading ONNX Runtime...", { preference: this.config.backend });
|
|
3557
3854
|
const { ort, backend } = await getOnnxRuntimeForPreference(this.config.backend);
|
|
3558
3855
|
this.ort = ort;
|
|
3559
3856
|
this._backend = backend;
|
|
3560
|
-
|
|
3561
|
-
|
|
3857
|
+
logger6.info("ONNX Runtime loaded", { backend: this._backend });
|
|
3858
|
+
logger6.debug("Fetching tokens vocabulary", { tokensUrl: this.config.tokensUrl });
|
|
3562
3859
|
const tokensResponse = await fetch(this.config.tokensUrl);
|
|
3563
3860
|
if (!tokensResponse.ok) {
|
|
3564
3861
|
throw new Error(`Failed to fetch tokens.txt: ${tokensResponse.status} ${tokensResponse.statusText}`);
|
|
3565
3862
|
}
|
|
3566
3863
|
const tokensText = await tokensResponse.text();
|
|
3567
3864
|
this.tokenMap = parseTokensFile(tokensText);
|
|
3568
|
-
|
|
3865
|
+
logger6.debug("Tokens loaded", { vocabSize: this.tokenMap.size });
|
|
3569
3866
|
const sessionOptions = getSessionOptions(this._backend);
|
|
3570
3867
|
if (this._backend === "webgpu") {
|
|
3571
3868
|
sessionOptions.graphOptimizationLevel = "basic";
|
|
3572
3869
|
}
|
|
3573
3870
|
let isCached = false;
|
|
3574
3871
|
if (isIOS()) {
|
|
3575
|
-
|
|
3872
|
+
logger6.info("iOS: passing model URL directly to ORT (low-memory path)", {
|
|
3576
3873
|
modelUrl: this.config.modelUrl
|
|
3577
3874
|
});
|
|
3578
3875
|
this.session = await withTimeout(
|
|
@@ -3585,14 +3882,14 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
3585
3882
|
isCached = await cache.has(this.config.modelUrl);
|
|
3586
3883
|
let modelBuffer;
|
|
3587
3884
|
if (isCached) {
|
|
3588
|
-
|
|
3885
|
+
logger6.debug("Loading model from cache", { modelUrl: this.config.modelUrl });
|
|
3589
3886
|
modelBuffer = await cache.get(this.config.modelUrl);
|
|
3590
3887
|
onProgress?.(modelBuffer.byteLength, modelBuffer.byteLength);
|
|
3591
3888
|
} else {
|
|
3592
|
-
|
|
3889
|
+
logger6.debug("Fetching and caching model", { modelUrl: this.config.modelUrl });
|
|
3593
3890
|
modelBuffer = await fetchWithCache(this.config.modelUrl, onProgress);
|
|
3594
3891
|
}
|
|
3595
|
-
|
|
3892
|
+
logger6.debug("Creating ONNX session", {
|
|
3596
3893
|
size: formatBytes(modelBuffer.byteLength),
|
|
3597
3894
|
backend: this._backend
|
|
3598
3895
|
});
|
|
@@ -3605,15 +3902,15 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
3605
3902
|
const cmvn = parseCMVNFromMetadata(metadata.neg_mean, metadata.inv_stddev);
|
|
3606
3903
|
this.negMean = cmvn.negMean;
|
|
3607
3904
|
this.invStddev = cmvn.invStddev;
|
|
3608
|
-
|
|
3905
|
+
logger6.debug("CMVN loaded from model metadata", { dim: this.negMean.length });
|
|
3609
3906
|
} else {
|
|
3610
|
-
|
|
3907
|
+
logger6.warn("CMVN not found in model metadata \u2014 features will not be normalized");
|
|
3611
3908
|
}
|
|
3612
3909
|
} catch (cmvnErr) {
|
|
3613
|
-
|
|
3910
|
+
logger6.warn("Failed to read CMVN from model metadata", { error: cmvnErr });
|
|
3614
3911
|
}
|
|
3615
3912
|
const loadTimeMs = performance.now() - startTime;
|
|
3616
|
-
|
|
3913
|
+
logger6.info("SenseVoice model loaded", {
|
|
3617
3914
|
backend: this._backend,
|
|
3618
3915
|
loadTimeMs: Math.round(loadTimeMs),
|
|
3619
3916
|
vocabSize: this.tokenMap.size,
|
|
@@ -3724,7 +4021,7 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
3724
4021
|
const vocabSize = logitsDims[2];
|
|
3725
4022
|
const decoded = ctcGreedyDecode(logitsData, seqLen, vocabSize, this.tokenMap);
|
|
3726
4023
|
const inferenceTimeMs = performance.now() - startTime;
|
|
3727
|
-
|
|
4024
|
+
logger6.trace("Transcription complete", {
|
|
3728
4025
|
text: decoded.text.substring(0, 50),
|
|
3729
4026
|
language: decoded.language,
|
|
3730
4027
|
emotion: decoded.emotion,
|
|
@@ -3762,7 +4059,7 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
3762
4059
|
const errMsg = err instanceof Error ? err.message : String(err);
|
|
3763
4060
|
if (errMsg.includes("timed out")) {
|
|
3764
4061
|
this.poisoned = true;
|
|
3765
|
-
|
|
4062
|
+
logger6.error("CRITICAL: Inference session timed out \u2014 SenseVoice is dead. Page reload required.", {
|
|
3766
4063
|
backend: this._backend,
|
|
3767
4064
|
timeoutMs: _SenseVoiceInference.INFERENCE_TIMEOUT_MS
|
|
3768
4065
|
});
|
|
@@ -3770,7 +4067,7 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
3770
4067
|
const oomError = new Error(
|
|
3771
4068
|
`SenseVoice inference failed with raw C++ exception pointer (0x${err.toString(16)}). This is likely an OOM crash in WASM. Try reloading the page.`
|
|
3772
4069
|
);
|
|
3773
|
-
|
|
4070
|
+
logger6.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
|
|
3774
4071
|
pointer: `0x${err.toString(16)}`,
|
|
3775
4072
|
backend: this._backend
|
|
3776
4073
|
});
|
|
@@ -3783,7 +4080,7 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
3783
4080
|
reject(oomError);
|
|
3784
4081
|
return;
|
|
3785
4082
|
} else {
|
|
3786
|
-
|
|
4083
|
+
logger6.error("Inference failed", { error: errMsg, backend: this._backend });
|
|
3787
4084
|
}
|
|
3788
4085
|
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
3789
4086
|
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
@@ -3812,7 +4109,7 @@ _SenseVoiceInference.INFERENCE_TIMEOUT_MS = 1e4;
|
|
|
3812
4109
|
var SenseVoiceInference = _SenseVoiceInference;
|
|
3813
4110
|
|
|
3814
4111
|
// src/inference/SenseVoiceWorker.ts
|
|
3815
|
-
var
|
|
4112
|
+
var logger7 = createLogger("SenseVoiceWorker");
|
|
3816
4113
|
var WASM_CDN_PATH2 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
|
|
3817
4114
|
var LOAD_TIMEOUT_MS = 3e5;
|
|
3818
4115
|
var INFERENCE_TIMEOUT_MS = 1e4;
|
|
@@ -4551,7 +4848,7 @@ var SenseVoiceWorker = class {
|
|
|
4551
4848
|
this.handleWorkerMessage(event.data);
|
|
4552
4849
|
};
|
|
4553
4850
|
worker.onerror = (error) => {
|
|
4554
|
-
|
|
4851
|
+
logger7.error("Worker error", { error: error.message });
|
|
4555
4852
|
for (const [, resolver] of this.pendingResolvers) {
|
|
4556
4853
|
resolver.reject(new Error(`Worker error: ${error.message}`));
|
|
4557
4854
|
}
|
|
@@ -4631,9 +4928,9 @@ var SenseVoiceWorker = class {
|
|
|
4631
4928
|
"model.language": this.config.language
|
|
4632
4929
|
});
|
|
4633
4930
|
try {
|
|
4634
|
-
|
|
4931
|
+
logger7.info("Creating SenseVoice worker...");
|
|
4635
4932
|
this.worker = this.createWorker();
|
|
4636
|
-
|
|
4933
|
+
logger7.info("Loading model in worker...", {
|
|
4637
4934
|
modelUrl: this.config.modelUrl,
|
|
4638
4935
|
tokensUrl: this.config.tokensUrl,
|
|
4639
4936
|
language: this.config.language,
|
|
@@ -4655,7 +4952,7 @@ var SenseVoiceWorker = class {
|
|
|
4655
4952
|
this._isLoaded = true;
|
|
4656
4953
|
const loadTimeMs = performance.now() - startTime;
|
|
4657
4954
|
onProgress?.(1, 1);
|
|
4658
|
-
|
|
4955
|
+
logger7.info("SenseVoice worker loaded successfully", {
|
|
4659
4956
|
backend: "wasm",
|
|
4660
4957
|
loadTimeMs: Math.round(loadTimeMs),
|
|
4661
4958
|
workerLoadTimeMs: Math.round(result.loadTimeMs),
|
|
@@ -4734,7 +5031,7 @@ var SenseVoiceWorker = class {
|
|
|
4734
5031
|
INFERENCE_TIMEOUT_MS
|
|
4735
5032
|
);
|
|
4736
5033
|
const totalTimeMs = performance.now() - startTime;
|
|
4737
|
-
|
|
5034
|
+
logger7.trace("Worker transcription complete", {
|
|
4738
5035
|
text: result.text.substring(0, 50),
|
|
4739
5036
|
language: result.language,
|
|
4740
5037
|
emotion: result.emotion,
|
|
@@ -4770,11 +5067,11 @@ var SenseVoiceWorker = class {
|
|
|
4770
5067
|
} catch (err) {
|
|
4771
5068
|
const errMsg = err instanceof Error ? err.message : String(err);
|
|
4772
5069
|
if (errMsg.includes("timed out")) {
|
|
4773
|
-
|
|
5070
|
+
logger7.error("CRITICAL: Worker inference timed out \u2014 SenseVoice worker is dead. Page reload required.", {
|
|
4774
5071
|
timeoutMs: INFERENCE_TIMEOUT_MS
|
|
4775
5072
|
});
|
|
4776
5073
|
} else {
|
|
4777
|
-
|
|
5074
|
+
logger7.error("Worker inference failed", { error: errMsg });
|
|
4778
5075
|
}
|
|
4779
5076
|
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
4780
5077
|
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
@@ -4811,8 +5108,53 @@ var SenseVoiceWorker = class {
|
|
|
4811
5108
|
}
|
|
4812
5109
|
};
|
|
4813
5110
|
|
|
5111
|
+
// src/inference/defaultModelUrls.ts
|
|
5112
|
+
var HF = "https://huggingface.co";
|
|
5113
|
+
var HF_MODEL_URLS = {
|
|
5114
|
+
/** LAM A2E model — fp16 external data (385KB graph + 192MB weights, WebGPU) — 52 ARKit blendshapes */
|
|
5115
|
+
lam: `${HF}/omote-ai/lam-a2e/resolve/main/model_fp16.onnx`,
|
|
5116
|
+
/** wav2arkit_cpu A2E model graph (1.86MB, WASM) — Safari/iOS fallback */
|
|
5117
|
+
wav2arkitCpu: `${HF}/myned-ai/wav2arkit_cpu/resolve/main/wav2arkit_cpu.onnx`,
|
|
5118
|
+
/** SenseVoice ASR model (228MB int8, WASM) — speech recognition + emotion + language */
|
|
5119
|
+
senseVoice: `${HF}/omote-ai/sensevoice-asr/resolve/main/model.int8.onnx`,
|
|
5120
|
+
/** Silero VAD model (~2MB, WASM) — voice activity detection */
|
|
5121
|
+
sileroVad: `${HF}/deepghs/silero-vad-onnx/resolve/main/silero_vad.onnx`
|
|
5122
|
+
};
|
|
5123
|
+
var _overrides = {};
|
|
5124
|
+
var DEFAULT_MODEL_URLS = new Proxy(
|
|
5125
|
+
{},
|
|
5126
|
+
{
|
|
5127
|
+
get(_target, prop) {
|
|
5128
|
+
const key = prop;
|
|
5129
|
+
return _overrides[key] ?? HF_MODEL_URLS[key];
|
|
5130
|
+
},
|
|
5131
|
+
ownKeys() {
|
|
5132
|
+
return Object.keys(HF_MODEL_URLS);
|
|
5133
|
+
},
|
|
5134
|
+
getOwnPropertyDescriptor(_target, prop) {
|
|
5135
|
+
if (prop in HF_MODEL_URLS) {
|
|
5136
|
+
return { configurable: true, enumerable: true, value: this.get(_target, prop, _target) };
|
|
5137
|
+
}
|
|
5138
|
+
return void 0;
|
|
5139
|
+
}
|
|
5140
|
+
}
|
|
5141
|
+
);
|
|
5142
|
+
function configureModelUrls(urls) {
|
|
5143
|
+
for (const [key, url] of Object.entries(urls)) {
|
|
5144
|
+
if (key in HF_MODEL_URLS && typeof url === "string") {
|
|
5145
|
+
_overrides[key] = url;
|
|
5146
|
+
}
|
|
5147
|
+
}
|
|
5148
|
+
}
|
|
5149
|
+
function resetModelUrls() {
|
|
5150
|
+
for (const key of Object.keys(_overrides)) {
|
|
5151
|
+
delete _overrides[key];
|
|
5152
|
+
}
|
|
5153
|
+
}
|
|
5154
|
+
var HF_CDN_URLS = HF_MODEL_URLS;
|
|
5155
|
+
|
|
4814
5156
|
// src/inference/UnifiedInferenceWorker.ts
|
|
4815
|
-
var
|
|
5157
|
+
var logger8 = createLogger("UnifiedInferenceWorker");
|
|
4816
5158
|
var WASM_CDN_PATH3 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
|
|
4817
5159
|
var INIT_TIMEOUT_MS = 6e4;
|
|
4818
5160
|
var SV_LOAD_TIMEOUT_MS = 3e5;
|
|
@@ -5514,7 +5856,7 @@ var UnifiedInferenceWorker = class {
|
|
|
5514
5856
|
const telemetry = getTelemetry();
|
|
5515
5857
|
const span = telemetry?.startSpan("UnifiedInferenceWorker.init");
|
|
5516
5858
|
try {
|
|
5517
|
-
|
|
5859
|
+
logger8.info("Creating unified inference worker...");
|
|
5518
5860
|
this.worker = this.createWorker();
|
|
5519
5861
|
await this.sendMessage(
|
|
5520
5862
|
{ type: "init", wasmPaths: WASM_CDN_PATH3, isIOS: isIOS() },
|
|
@@ -5523,7 +5865,7 @@ var UnifiedInferenceWorker = class {
|
|
|
5523
5865
|
);
|
|
5524
5866
|
this.initialized = true;
|
|
5525
5867
|
const loadTimeMs = performance.now() - startTime;
|
|
5526
|
-
|
|
5868
|
+
logger8.info("Unified worker initialized", { loadTimeMs: Math.round(loadTimeMs) });
|
|
5527
5869
|
span?.setAttributes({ "worker.init_time_ms": loadTimeMs });
|
|
5528
5870
|
span?.end();
|
|
5529
5871
|
} catch (error) {
|
|
@@ -5697,7 +6039,7 @@ var UnifiedInferenceWorker = class {
|
|
|
5697
6039
|
this.handleWorkerMessage(event.data);
|
|
5698
6040
|
};
|
|
5699
6041
|
worker.onerror = (error) => {
|
|
5700
|
-
|
|
6042
|
+
logger8.error("Unified worker error", { error: error.message });
|
|
5701
6043
|
this.rejectAllPending(`Worker error: ${error.message}`);
|
|
5702
6044
|
};
|
|
5703
6045
|
return worker;
|
|
@@ -5711,7 +6053,7 @@ var UnifiedInferenceWorker = class {
|
|
|
5711
6053
|
this.pendingRequests.delete(requestId);
|
|
5712
6054
|
pending.reject(new Error(data.error));
|
|
5713
6055
|
} else {
|
|
5714
|
-
|
|
6056
|
+
logger8.error("Worker broadcast error", { error: data.error });
|
|
5715
6057
|
this.rejectAllPending(data.error);
|
|
5716
6058
|
}
|
|
5717
6059
|
return;
|
|
@@ -5733,7 +6075,7 @@ var UnifiedInferenceWorker = class {
|
|
|
5733
6075
|
const timeout = setTimeout(() => {
|
|
5734
6076
|
this.pendingRequests.delete(requestId);
|
|
5735
6077
|
this.poisoned = true;
|
|
5736
|
-
|
|
6078
|
+
logger8.error("CRITICAL: Worker operation timed out \u2014 worker is dead", {
|
|
5737
6079
|
type: message.type,
|
|
5738
6080
|
timeoutMs
|
|
5739
6081
|
});
|
|
@@ -5799,7 +6141,7 @@ var SenseVoiceUnifiedAdapter = class {
|
|
|
5799
6141
|
});
|
|
5800
6142
|
this._isLoaded = true;
|
|
5801
6143
|
onProgress?.(1, 1);
|
|
5802
|
-
|
|
6144
|
+
logger8.info("SenseVoice loaded via unified worker", {
|
|
5803
6145
|
backend: "wasm",
|
|
5804
6146
|
loadTimeMs: Math.round(result.loadTimeMs),
|
|
5805
6147
|
vocabSize: result.vocabSize
|
|
@@ -5864,7 +6206,7 @@ var Wav2ArkitCpuUnifiedAdapter = class {
|
|
|
5864
6206
|
externalDataUrl: externalDataUrl || null
|
|
5865
6207
|
});
|
|
5866
6208
|
this._isLoaded = true;
|
|
5867
|
-
|
|
6209
|
+
logger8.info("Wav2ArkitCpu loaded via unified worker", {
|
|
5868
6210
|
backend: "wasm",
|
|
5869
6211
|
loadTimeMs: Math.round(result.loadTimeMs)
|
|
5870
6212
|
});
|
|
@@ -5970,7 +6312,7 @@ var SileroVADUnifiedAdapter = class {
|
|
|
5970
6312
|
sampleRate: this.config.sampleRate
|
|
5971
6313
|
});
|
|
5972
6314
|
this._isLoaded = true;
|
|
5973
|
-
|
|
6315
|
+
logger8.info("SileroVAD loaded via unified worker", {
|
|
5974
6316
|
backend: "wasm",
|
|
5975
6317
|
loadTimeMs: Math.round(result.loadTimeMs),
|
|
5976
6318
|
sampleRate: this.config.sampleRate,
|
|
@@ -6051,12 +6393,13 @@ var SileroVADUnifiedAdapter = class {
|
|
|
6051
6393
|
};
|
|
6052
6394
|
|
|
6053
6395
|
// src/inference/createSenseVoice.ts
|
|
6054
|
-
var
|
|
6055
|
-
function createSenseVoice(config) {
|
|
6396
|
+
var logger9 = createLogger("createSenseVoice");
|
|
6397
|
+
function createSenseVoice(config = {}) {
|
|
6398
|
+
const modelUrl = config.modelUrl ?? DEFAULT_MODEL_URLS.senseVoice;
|
|
6056
6399
|
if (config.unifiedWorker) {
|
|
6057
|
-
|
|
6400
|
+
logger9.info("Creating SenseVoiceUnifiedAdapter (shared unified worker)");
|
|
6058
6401
|
return new SenseVoiceUnifiedAdapter(config.unifiedWorker, {
|
|
6059
|
-
modelUrl
|
|
6402
|
+
modelUrl,
|
|
6060
6403
|
tokensUrl: config.tokensUrl,
|
|
6061
6404
|
language: config.language,
|
|
6062
6405
|
textNorm: config.textNorm
|
|
@@ -6067,37 +6410,37 @@ function createSenseVoice(config) {
|
|
|
6067
6410
|
if (!SenseVoiceWorker.isSupported()) {
|
|
6068
6411
|
throw new Error("Web Workers are not supported in this environment");
|
|
6069
6412
|
}
|
|
6070
|
-
|
|
6413
|
+
logger9.info("Creating SenseVoiceWorker (off-main-thread)");
|
|
6071
6414
|
return new SenseVoiceWorker({
|
|
6072
|
-
modelUrl
|
|
6415
|
+
modelUrl,
|
|
6073
6416
|
tokensUrl: config.tokensUrl,
|
|
6074
6417
|
language: config.language,
|
|
6075
6418
|
textNorm: config.textNorm
|
|
6076
6419
|
});
|
|
6077
6420
|
}
|
|
6078
6421
|
if (useWorker === false) {
|
|
6079
|
-
|
|
6422
|
+
logger9.info("Creating SenseVoiceInference (main thread)");
|
|
6080
6423
|
return new SenseVoiceInference({
|
|
6081
|
-
modelUrl
|
|
6424
|
+
modelUrl,
|
|
6082
6425
|
tokensUrl: config.tokensUrl,
|
|
6083
6426
|
language: config.language,
|
|
6084
6427
|
textNorm: config.textNorm
|
|
6085
6428
|
});
|
|
6086
6429
|
}
|
|
6087
6430
|
if (SenseVoiceWorker.isSupported() && !isIOS()) {
|
|
6088
|
-
|
|
6431
|
+
logger9.info("Auto-detected: creating SenseVoiceWorker (off-main-thread)");
|
|
6089
6432
|
return new SenseVoiceWorker({
|
|
6090
|
-
modelUrl
|
|
6433
|
+
modelUrl,
|
|
6091
6434
|
tokensUrl: config.tokensUrl,
|
|
6092
6435
|
language: config.language,
|
|
6093
6436
|
textNorm: config.textNorm
|
|
6094
6437
|
});
|
|
6095
6438
|
}
|
|
6096
|
-
|
|
6439
|
+
logger9.info("Auto-detected: creating SenseVoiceInference (main thread)", {
|
|
6097
6440
|
reason: isIOS() ? "iOS (shared ORT instance)" : "Worker unsupported"
|
|
6098
6441
|
});
|
|
6099
6442
|
return new SenseVoiceInference({
|
|
6100
|
-
modelUrl
|
|
6443
|
+
modelUrl,
|
|
6101
6444
|
tokensUrl: config.tokensUrl,
|
|
6102
6445
|
language: config.language,
|
|
6103
6446
|
textNorm: config.textNorm
|
|
@@ -6105,7 +6448,7 @@ function createSenseVoice(config) {
|
|
|
6105
6448
|
}
|
|
6106
6449
|
|
|
6107
6450
|
// src/inference/Wav2ArkitCpuInference.ts
|
|
6108
|
-
var
|
|
6451
|
+
var logger10 = createLogger("Wav2ArkitCpu");
|
|
6109
6452
|
var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
6110
6453
|
constructor(config) {
|
|
6111
6454
|
this.modelId = "wav2arkit_cpu";
|
|
@@ -6147,16 +6490,16 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6147
6490
|
});
|
|
6148
6491
|
try {
|
|
6149
6492
|
const preference = this.config.backend || "wasm";
|
|
6150
|
-
|
|
6493
|
+
logger10.info("Loading ONNX Runtime...", { preference });
|
|
6151
6494
|
const { ort, backend } = await getOnnxRuntimeForPreference(preference);
|
|
6152
6495
|
this.ort = ort;
|
|
6153
6496
|
this._backend = backend;
|
|
6154
|
-
|
|
6497
|
+
logger10.info("ONNX Runtime loaded", { backend: this._backend });
|
|
6155
6498
|
const modelUrl = this.config.modelUrl;
|
|
6156
6499
|
const dataUrl = this.config.externalDataUrl !== false ? this.config.externalDataUrl || `${modelUrl}.data` : null;
|
|
6157
6500
|
const sessionOptions = getSessionOptions(this._backend);
|
|
6158
6501
|
if (isIOS()) {
|
|
6159
|
-
|
|
6502
|
+
logger10.info("iOS: passing model URLs directly to ORT (low-memory path)", {
|
|
6160
6503
|
modelUrl,
|
|
6161
6504
|
dataUrl
|
|
6162
6505
|
});
|
|
@@ -6178,15 +6521,15 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6178
6521
|
const isCached = await cache.has(modelUrl);
|
|
6179
6522
|
let modelBuffer;
|
|
6180
6523
|
if (isCached) {
|
|
6181
|
-
|
|
6524
|
+
logger10.debug("Loading model from cache", { modelUrl });
|
|
6182
6525
|
modelBuffer = await cache.get(modelUrl);
|
|
6183
6526
|
if (!modelBuffer) {
|
|
6184
|
-
|
|
6527
|
+
logger10.warn("Cache corruption detected, clearing and retrying", { modelUrl });
|
|
6185
6528
|
await cache.delete(modelUrl);
|
|
6186
6529
|
modelBuffer = await fetchWithCache(modelUrl);
|
|
6187
6530
|
}
|
|
6188
6531
|
} else {
|
|
6189
|
-
|
|
6532
|
+
logger10.debug("Fetching and caching model graph", { modelUrl });
|
|
6190
6533
|
modelBuffer = await fetchWithCache(modelUrl);
|
|
6191
6534
|
}
|
|
6192
6535
|
if (!modelBuffer) {
|
|
@@ -6197,31 +6540,31 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6197
6540
|
try {
|
|
6198
6541
|
const isDataCached = await cache.has(dataUrl);
|
|
6199
6542
|
if (isDataCached) {
|
|
6200
|
-
|
|
6543
|
+
logger10.debug("Loading external data from cache", { dataUrl });
|
|
6201
6544
|
externalDataBuffer = await cache.get(dataUrl);
|
|
6202
6545
|
if (!externalDataBuffer) {
|
|
6203
|
-
|
|
6546
|
+
logger10.warn("Cache corruption for external data, retrying", { dataUrl });
|
|
6204
6547
|
await cache.delete(dataUrl);
|
|
6205
6548
|
externalDataBuffer = await fetchWithCache(dataUrl);
|
|
6206
6549
|
}
|
|
6207
6550
|
} else {
|
|
6208
|
-
|
|
6551
|
+
logger10.info("Fetching external model data", {
|
|
6209
6552
|
dataUrl,
|
|
6210
6553
|
note: "This may be a large download (400MB+)"
|
|
6211
6554
|
});
|
|
6212
6555
|
externalDataBuffer = await fetchWithCache(dataUrl);
|
|
6213
6556
|
}
|
|
6214
|
-
|
|
6557
|
+
logger10.info("External data loaded", {
|
|
6215
6558
|
size: formatBytes(externalDataBuffer.byteLength)
|
|
6216
6559
|
});
|
|
6217
6560
|
} catch (err) {
|
|
6218
|
-
|
|
6561
|
+
logger10.debug("No external data file found (single-file model)", {
|
|
6219
6562
|
dataUrl,
|
|
6220
6563
|
error: err.message
|
|
6221
6564
|
});
|
|
6222
6565
|
}
|
|
6223
6566
|
}
|
|
6224
|
-
|
|
6567
|
+
logger10.debug("Creating ONNX session", {
|
|
6225
6568
|
graphSize: formatBytes(modelBuffer.byteLength),
|
|
6226
6569
|
externalDataSize: externalDataBuffer ? formatBytes(externalDataBuffer.byteLength) : "none",
|
|
6227
6570
|
backend: this._backend
|
|
@@ -6237,7 +6580,7 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6237
6580
|
this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
|
|
6238
6581
|
}
|
|
6239
6582
|
const loadTimeMs = performance.now() - startTime;
|
|
6240
|
-
|
|
6583
|
+
logger10.info("Model loaded successfully", {
|
|
6241
6584
|
backend: this._backend,
|
|
6242
6585
|
loadTimeMs: Math.round(loadTimeMs),
|
|
6243
6586
|
inputs: this.session.inputNames,
|
|
@@ -6253,12 +6596,12 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6253
6596
|
model: "wav2arkit_cpu",
|
|
6254
6597
|
backend: this._backend
|
|
6255
6598
|
});
|
|
6256
|
-
|
|
6599
|
+
logger10.debug("Running warmup inference");
|
|
6257
6600
|
const warmupStart = performance.now();
|
|
6258
6601
|
const silentAudio = new Float32Array(16e3);
|
|
6259
6602
|
await this.infer(silentAudio);
|
|
6260
6603
|
const warmupTimeMs = performance.now() - warmupStart;
|
|
6261
|
-
|
|
6604
|
+
logger10.info("Warmup inference complete", {
|
|
6262
6605
|
warmupTimeMs: Math.round(warmupTimeMs),
|
|
6263
6606
|
backend: this._backend
|
|
6264
6607
|
});
|
|
@@ -6345,7 +6688,7 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6345
6688
|
const symmetrized = symmetrizeBlendshapes(rawFrame);
|
|
6346
6689
|
blendshapes.push(symmetrized);
|
|
6347
6690
|
}
|
|
6348
|
-
|
|
6691
|
+
logger10.trace("Inference completed", {
|
|
6349
6692
|
inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
|
|
6350
6693
|
numFrames,
|
|
6351
6694
|
inputSamples
|
|
@@ -6373,7 +6716,7 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6373
6716
|
const errMsg = err instanceof Error ? err.message : String(err);
|
|
6374
6717
|
if (errMsg.includes("timed out")) {
|
|
6375
6718
|
this.poisoned = true;
|
|
6376
|
-
|
|
6719
|
+
logger10.error("CRITICAL: Inference session timed out \u2014 Wav2ArkitCpu is dead. Page reload required.", {
|
|
6377
6720
|
backend: this._backend,
|
|
6378
6721
|
timeoutMs: _Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS
|
|
6379
6722
|
});
|
|
@@ -6381,7 +6724,7 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6381
6724
|
const oomError = new Error(
|
|
6382
6725
|
`Wav2ArkitCpu inference failed with raw C++ exception pointer (0x${err.toString(16)}). This is likely an OOM crash in WASM. Try reloading the page.`
|
|
6383
6726
|
);
|
|
6384
|
-
|
|
6727
|
+
logger10.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
|
|
6385
6728
|
pointer: `0x${err.toString(16)}`,
|
|
6386
6729
|
backend: this._backend
|
|
6387
6730
|
});
|
|
@@ -6394,7 +6737,7 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6394
6737
|
reject(oomError);
|
|
6395
6738
|
return;
|
|
6396
6739
|
} else {
|
|
6397
|
-
|
|
6740
|
+
logger10.error("Inference failed", { error: errMsg, backend: this._backend });
|
|
6398
6741
|
}
|
|
6399
6742
|
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
6400
6743
|
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
@@ -6421,7 +6764,7 @@ _Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS = 5e3;
|
|
|
6421
6764
|
var Wav2ArkitCpuInference = _Wav2ArkitCpuInference;
|
|
6422
6765
|
|
|
6423
6766
|
// src/inference/Wav2ArkitCpuWorker.ts
|
|
6424
|
-
var
|
|
6767
|
+
var logger11 = createLogger("Wav2ArkitCpuWorker");
|
|
6425
6768
|
var WASM_CDN_PATH4 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
|
|
6426
6769
|
var LOAD_TIMEOUT_MS2 = 42e4;
|
|
6427
6770
|
var INFERENCE_TIMEOUT_MS2 = 5e3;
|
|
@@ -6708,7 +7051,7 @@ var Wav2ArkitCpuWorker = class {
|
|
|
6708
7051
|
this.handleWorkerMessage(event.data);
|
|
6709
7052
|
};
|
|
6710
7053
|
worker.onerror = (error) => {
|
|
6711
|
-
|
|
7054
|
+
logger11.error("Worker error", { error: error.message });
|
|
6712
7055
|
for (const [, resolver] of this.pendingResolvers) {
|
|
6713
7056
|
resolver.reject(new Error(`Worker error: ${error.message}`));
|
|
6714
7057
|
}
|
|
@@ -6784,10 +7127,10 @@ var Wav2ArkitCpuWorker = class {
|
|
|
6784
7127
|
"model.backend_requested": "wasm"
|
|
6785
7128
|
});
|
|
6786
7129
|
try {
|
|
6787
|
-
|
|
7130
|
+
logger11.info("Creating wav2arkit_cpu worker...");
|
|
6788
7131
|
this.worker = this.createWorker();
|
|
6789
7132
|
const externalDataUrl = this.config.externalDataUrl !== false ? this.config.externalDataUrl || `${this.config.modelUrl}.data` : null;
|
|
6790
|
-
|
|
7133
|
+
logger11.info("Loading model in worker...", {
|
|
6791
7134
|
modelUrl: this.config.modelUrl,
|
|
6792
7135
|
externalDataUrl,
|
|
6793
7136
|
isIOS: isIOS()
|
|
@@ -6805,7 +7148,7 @@ var Wav2ArkitCpuWorker = class {
|
|
|
6805
7148
|
);
|
|
6806
7149
|
this._isLoaded = true;
|
|
6807
7150
|
const loadTimeMs = performance.now() - startTime;
|
|
6808
|
-
|
|
7151
|
+
logger11.info("Wav2ArkitCpu worker loaded successfully", {
|
|
6809
7152
|
backend: "wasm",
|
|
6810
7153
|
loadTimeMs: Math.round(loadTimeMs),
|
|
6811
7154
|
workerLoadTimeMs: Math.round(result.loadTimeMs),
|
|
@@ -6890,7 +7233,7 @@ var Wav2ArkitCpuWorker = class {
|
|
|
6890
7233
|
for (let f = 0; f < numFrames; f++) {
|
|
6891
7234
|
blendshapes.push(flatBuffer.slice(f * numBlendshapes, (f + 1) * numBlendshapes));
|
|
6892
7235
|
}
|
|
6893
|
-
|
|
7236
|
+
logger11.trace("Worker inference completed", {
|
|
6894
7237
|
inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
|
|
6895
7238
|
workerTimeMs: Math.round(result.inferenceTimeMs * 100) / 100,
|
|
6896
7239
|
numFrames,
|
|
@@ -6920,12 +7263,12 @@ var Wav2ArkitCpuWorker = class {
|
|
|
6920
7263
|
const errMsg = err instanceof Error ? err.message : String(err);
|
|
6921
7264
|
if (errMsg.includes("timed out")) {
|
|
6922
7265
|
this.poisoned = true;
|
|
6923
|
-
|
|
7266
|
+
logger11.error("CRITICAL: Worker inference timed out \u2014 Wav2ArkitCpu worker is dead. Page reload required.", {
|
|
6924
7267
|
backend: "wasm",
|
|
6925
7268
|
timeoutMs: INFERENCE_TIMEOUT_MS2
|
|
6926
7269
|
});
|
|
6927
7270
|
} else {
|
|
6928
|
-
|
|
7271
|
+
logger11.error("Worker inference failed", { error: errMsg, backend: "wasm" });
|
|
6929
7272
|
}
|
|
6930
7273
|
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
6931
7274
|
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
@@ -6963,53 +7306,56 @@ var Wav2ArkitCpuWorker = class {
|
|
|
6963
7306
|
};
|
|
6964
7307
|
|
|
6965
7308
|
// src/inference/createA2E.ts
|
|
6966
|
-
var
|
|
6967
|
-
function createA2E(config) {
|
|
7309
|
+
var logger12 = createLogger("createA2E");
|
|
7310
|
+
function createA2E(config = {}) {
|
|
6968
7311
|
const mode = config.mode ?? "auto";
|
|
6969
7312
|
const fallbackOnError = config.fallbackOnError ?? true;
|
|
7313
|
+
const gpuModelUrl = config.gpuModelUrl ?? DEFAULT_MODEL_URLS.lam;
|
|
7314
|
+
const cpuModelUrl = config.cpuModelUrl ?? DEFAULT_MODEL_URLS.wav2arkitCpu;
|
|
6970
7315
|
let useCpu;
|
|
6971
7316
|
if (mode === "cpu") {
|
|
6972
7317
|
useCpu = true;
|
|
6973
|
-
|
|
7318
|
+
logger12.info("Forcing CPU A2E model (wav2arkit_cpu)");
|
|
6974
7319
|
} else if (mode === "gpu") {
|
|
6975
7320
|
useCpu = false;
|
|
6976
|
-
|
|
7321
|
+
logger12.info("Forcing GPU A2E model (Wav2Vec2)");
|
|
6977
7322
|
} else {
|
|
6978
7323
|
useCpu = shouldUseCpuA2E();
|
|
6979
|
-
|
|
7324
|
+
logger12.info("Auto-detected A2E model", {
|
|
6980
7325
|
useCpu,
|
|
6981
7326
|
isSafari: isSafari()
|
|
6982
7327
|
});
|
|
6983
7328
|
}
|
|
6984
7329
|
if (useCpu) {
|
|
6985
7330
|
if (config.unifiedWorker) {
|
|
6986
|
-
|
|
7331
|
+
logger12.info("Creating Wav2ArkitCpuUnifiedAdapter (404MB, WASM, shared unified worker)");
|
|
6987
7332
|
return new Wav2ArkitCpuUnifiedAdapter(config.unifiedWorker, {
|
|
6988
|
-
modelUrl:
|
|
7333
|
+
modelUrl: cpuModelUrl
|
|
6989
7334
|
});
|
|
6990
7335
|
}
|
|
6991
7336
|
if (config.useWorker && Wav2ArkitCpuWorker.isSupported() && !isIOS()) {
|
|
6992
|
-
|
|
7337
|
+
logger12.info("Creating Wav2ArkitCpuWorker (404MB, WASM, off-main-thread)");
|
|
6993
7338
|
return new Wav2ArkitCpuWorker({
|
|
6994
|
-
modelUrl:
|
|
7339
|
+
modelUrl: cpuModelUrl
|
|
6995
7340
|
});
|
|
6996
7341
|
}
|
|
6997
|
-
|
|
7342
|
+
logger12.info("Creating Wav2ArkitCpuInference (404MB, WASM)");
|
|
6998
7343
|
return new Wav2ArkitCpuInference({
|
|
6999
|
-
modelUrl:
|
|
7344
|
+
modelUrl: cpuModelUrl
|
|
7000
7345
|
});
|
|
7001
7346
|
}
|
|
7347
|
+
const gpuExternalDataUrl = config.gpuExternalDataUrl !== void 0 ? config.gpuExternalDataUrl : void 0;
|
|
7002
7348
|
const gpuInstance = new Wav2Vec2Inference({
|
|
7003
|
-
modelUrl:
|
|
7004
|
-
externalDataUrl:
|
|
7349
|
+
modelUrl: gpuModelUrl,
|
|
7350
|
+
externalDataUrl: gpuExternalDataUrl,
|
|
7005
7351
|
backend: config.gpuBackend ?? "auto",
|
|
7006
7352
|
numIdentityClasses: config.numIdentityClasses
|
|
7007
7353
|
});
|
|
7008
7354
|
if (fallbackOnError) {
|
|
7009
|
-
|
|
7355
|
+
logger12.info("Creating Wav2Vec2Inference with CPU fallback");
|
|
7010
7356
|
return new A2EWithFallback(gpuInstance, config);
|
|
7011
7357
|
}
|
|
7012
|
-
|
|
7358
|
+
logger12.info("Creating Wav2Vec2Inference (no fallback)");
|
|
7013
7359
|
return gpuInstance;
|
|
7014
7360
|
}
|
|
7015
7361
|
var A2EWithFallback = class {
|
|
@@ -7017,6 +7363,7 @@ var A2EWithFallback = class {
|
|
|
7017
7363
|
this.hasFallenBack = false;
|
|
7018
7364
|
this.implementation = gpuInstance;
|
|
7019
7365
|
this.config = config;
|
|
7366
|
+
this.resolvedCpuModelUrl = config.cpuModelUrl ?? DEFAULT_MODEL_URLS.wav2arkitCpu;
|
|
7020
7367
|
}
|
|
7021
7368
|
get modelId() {
|
|
7022
7369
|
return this.implementation.modelId;
|
|
@@ -7038,26 +7385,26 @@ var A2EWithFallback = class {
|
|
|
7038
7385
|
}
|
|
7039
7386
|
}
|
|
7040
7387
|
async fallbackToCpu(reason) {
|
|
7041
|
-
|
|
7388
|
+
logger12.warn("GPU model load failed, falling back to CPU model", { reason });
|
|
7042
7389
|
try {
|
|
7043
7390
|
await this.implementation.dispose();
|
|
7044
7391
|
} catch {
|
|
7045
7392
|
}
|
|
7046
7393
|
if (this.config.unifiedWorker) {
|
|
7047
7394
|
this.implementation = new Wav2ArkitCpuUnifiedAdapter(this.config.unifiedWorker, {
|
|
7048
|
-
modelUrl: this.
|
|
7395
|
+
modelUrl: this.resolvedCpuModelUrl
|
|
7049
7396
|
});
|
|
7050
|
-
|
|
7397
|
+
logger12.info("Fallback to Wav2ArkitCpuUnifiedAdapter successful");
|
|
7051
7398
|
} else if (this.config.useWorker && Wav2ArkitCpuWorker.isSupported() && !isIOS()) {
|
|
7052
7399
|
this.implementation = new Wav2ArkitCpuWorker({
|
|
7053
|
-
modelUrl: this.
|
|
7400
|
+
modelUrl: this.resolvedCpuModelUrl
|
|
7054
7401
|
});
|
|
7055
|
-
|
|
7402
|
+
logger12.info("Fallback to Wav2ArkitCpuWorker successful");
|
|
7056
7403
|
} else {
|
|
7057
7404
|
this.implementation = new Wav2ArkitCpuInference({
|
|
7058
|
-
modelUrl: this.
|
|
7405
|
+
modelUrl: this.resolvedCpuModelUrl
|
|
7059
7406
|
});
|
|
7060
|
-
|
|
7407
|
+
logger12.info("Fallback to Wav2ArkitCpuInference successful");
|
|
7061
7408
|
}
|
|
7062
7409
|
this.hasFallenBack = true;
|
|
7063
7410
|
return await this.implementation.load();
|
|
@@ -7261,7 +7608,7 @@ var EmphasisDetector = class {
|
|
|
7261
7608
|
};
|
|
7262
7609
|
|
|
7263
7610
|
// src/inference/SileroVADInference.ts
|
|
7264
|
-
var
|
|
7611
|
+
var logger13 = createLogger("SileroVAD");
|
|
7265
7612
|
var SileroVADInference = class {
|
|
7266
7613
|
constructor(config) {
|
|
7267
7614
|
this.session = null;
|
|
@@ -7335,23 +7682,23 @@ var SileroVADInference = class {
|
|
|
7335
7682
|
"model.sample_rate": this.config.sampleRate
|
|
7336
7683
|
});
|
|
7337
7684
|
try {
|
|
7338
|
-
|
|
7685
|
+
logger13.info("Loading ONNX Runtime...", { preference: this.config.backend });
|
|
7339
7686
|
const { ort, backend } = await getOnnxRuntimeForPreference(this.config.backend);
|
|
7340
7687
|
this.ort = ort;
|
|
7341
7688
|
this._backend = backend;
|
|
7342
|
-
|
|
7689
|
+
logger13.info("ONNX Runtime loaded", { backend: this._backend });
|
|
7343
7690
|
const cache = getModelCache();
|
|
7344
7691
|
const modelUrl = this.config.modelUrl;
|
|
7345
7692
|
const isCached = await cache.has(modelUrl);
|
|
7346
7693
|
let modelBuffer;
|
|
7347
7694
|
if (isCached) {
|
|
7348
|
-
|
|
7695
|
+
logger13.debug("Loading model from cache", { modelUrl });
|
|
7349
7696
|
modelBuffer = await cache.get(modelUrl);
|
|
7350
7697
|
} else {
|
|
7351
|
-
|
|
7698
|
+
logger13.debug("Fetching and caching model", { modelUrl });
|
|
7352
7699
|
modelBuffer = await fetchWithCache(modelUrl);
|
|
7353
7700
|
}
|
|
7354
|
-
|
|
7701
|
+
logger13.debug("Creating ONNX session", {
|
|
7355
7702
|
size: formatBytes(modelBuffer.byteLength),
|
|
7356
7703
|
backend: this._backend
|
|
7357
7704
|
});
|
|
@@ -7360,7 +7707,7 @@ var SileroVADInference = class {
|
|
|
7360
7707
|
this.session = await ort.InferenceSession.create(modelData, sessionOptions);
|
|
7361
7708
|
this.reset();
|
|
7362
7709
|
const loadTimeMs = performance.now() - startTime;
|
|
7363
|
-
|
|
7710
|
+
logger13.info("Model loaded successfully", {
|
|
7364
7711
|
backend: this._backend,
|
|
7365
7712
|
loadTimeMs: Math.round(loadTimeMs),
|
|
7366
7713
|
sampleRate: this.config.sampleRate,
|
|
@@ -7415,7 +7762,7 @@ var SileroVADInference = class {
|
|
|
7415
7762
|
[]
|
|
7416
7763
|
);
|
|
7417
7764
|
} catch (e) {
|
|
7418
|
-
|
|
7765
|
+
logger13.warn("BigInt64Array not available, using bigint array fallback", {
|
|
7419
7766
|
error: e instanceof Error ? e.message : String(e)
|
|
7420
7767
|
});
|
|
7421
7768
|
this.srTensor = new this.ort.Tensor(
|
|
@@ -7521,7 +7868,7 @@ var SileroVADInference = class {
|
|
|
7521
7868
|
this.preSpeechBuffer.shift();
|
|
7522
7869
|
}
|
|
7523
7870
|
}
|
|
7524
|
-
|
|
7871
|
+
logger13.trace("Skipping VAD inference - audio too quiet", {
|
|
7525
7872
|
rms: Math.round(rms * 1e4) / 1e4,
|
|
7526
7873
|
threshold: MIN_ENERGY_THRESHOLD
|
|
7527
7874
|
});
|
|
@@ -7575,7 +7922,7 @@ var SileroVADInference = class {
|
|
|
7575
7922
|
if (isSpeech && !this.wasSpeaking) {
|
|
7576
7923
|
preSpeechChunks = [...this.preSpeechBuffer];
|
|
7577
7924
|
this.preSpeechBuffer = [];
|
|
7578
|
-
|
|
7925
|
+
logger13.debug("Speech started with pre-speech buffer", {
|
|
7579
7926
|
preSpeechChunks: preSpeechChunks.length,
|
|
7580
7927
|
durationMs: Math.round(preSpeechChunks.length * this.getChunkDurationMs())
|
|
7581
7928
|
});
|
|
@@ -7588,7 +7935,7 @@ var SileroVADInference = class {
|
|
|
7588
7935
|
this.preSpeechBuffer = [];
|
|
7589
7936
|
}
|
|
7590
7937
|
this.wasSpeaking = isSpeech;
|
|
7591
|
-
|
|
7938
|
+
logger13.trace("VAD inference completed", {
|
|
7592
7939
|
probability: Math.round(probability * 1e3) / 1e3,
|
|
7593
7940
|
isSpeech,
|
|
7594
7941
|
inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100
|
|
@@ -7619,7 +7966,7 @@ var SileroVADInference = class {
|
|
|
7619
7966
|
const oomError = new Error(
|
|
7620
7967
|
`SileroVAD inference failed with raw C++ exception pointer (0x${err.toString(16)}). This is likely an OOM crash in WASM. Try reducing concurrent model sessions or reloading the page.`
|
|
7621
7968
|
);
|
|
7622
|
-
|
|
7969
|
+
logger13.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
|
|
7623
7970
|
pointer: `0x${err.toString(16)}`,
|
|
7624
7971
|
backend: this._backend
|
|
7625
7972
|
});
|
|
@@ -7662,7 +8009,7 @@ var SileroVADInference = class {
|
|
|
7662
8009
|
SileroVADInference.isWebGPUAvailable = isWebGPUAvailable;
|
|
7663
8010
|
|
|
7664
8011
|
// src/inference/SileroVADWorker.ts
|
|
7665
|
-
var
|
|
8012
|
+
var logger14 = createLogger("SileroVADWorker");
|
|
7666
8013
|
var WASM_CDN_PATH5 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
|
|
7667
8014
|
var LOAD_TIMEOUT_MS3 = 12e4;
|
|
7668
8015
|
var INFERENCE_TIMEOUT_MS3 = 1e3;
|
|
@@ -7947,7 +8294,7 @@ var SileroVADWorker = class {
|
|
|
7947
8294
|
this.handleWorkerMessage(event.data);
|
|
7948
8295
|
};
|
|
7949
8296
|
worker.onerror = (error) => {
|
|
7950
|
-
|
|
8297
|
+
logger14.error("Worker error", { error: error.message });
|
|
7951
8298
|
for (const [, resolver] of this.pendingResolvers) {
|
|
7952
8299
|
resolver.reject(new Error(`Worker error: ${error.message}`));
|
|
7953
8300
|
}
|
|
@@ -8023,9 +8370,9 @@ var SileroVADWorker = class {
|
|
|
8023
8370
|
"model.sample_rate": this.config.sampleRate
|
|
8024
8371
|
});
|
|
8025
8372
|
try {
|
|
8026
|
-
|
|
8373
|
+
logger14.info("Creating VAD worker...");
|
|
8027
8374
|
this.worker = this.createWorker();
|
|
8028
|
-
|
|
8375
|
+
logger14.info("Loading model in worker...", {
|
|
8029
8376
|
modelUrl: this.config.modelUrl,
|
|
8030
8377
|
sampleRate: this.config.sampleRate
|
|
8031
8378
|
});
|
|
@@ -8041,7 +8388,7 @@ var SileroVADWorker = class {
|
|
|
8041
8388
|
);
|
|
8042
8389
|
this._isLoaded = true;
|
|
8043
8390
|
const loadTimeMs = performance.now() - startTime;
|
|
8044
|
-
|
|
8391
|
+
logger14.info("VAD worker loaded successfully", {
|
|
8045
8392
|
backend: "wasm",
|
|
8046
8393
|
loadTimeMs: Math.round(loadTimeMs),
|
|
8047
8394
|
workerLoadTimeMs: Math.round(result.loadTimeMs),
|
|
@@ -8148,7 +8495,7 @@ var SileroVADWorker = class {
|
|
|
8148
8495
|
if (isSpeech && !this.wasSpeaking) {
|
|
8149
8496
|
preSpeechChunks = [...this.preSpeechBuffer];
|
|
8150
8497
|
this.preSpeechBuffer = [];
|
|
8151
|
-
|
|
8498
|
+
logger14.debug("Speech started with pre-speech buffer", {
|
|
8152
8499
|
preSpeechChunks: preSpeechChunks.length,
|
|
8153
8500
|
durationMs: Math.round(preSpeechChunks.length * this.getChunkDurationMs())
|
|
8154
8501
|
});
|
|
@@ -8161,7 +8508,7 @@ var SileroVADWorker = class {
|
|
|
8161
8508
|
this.preSpeechBuffer = [];
|
|
8162
8509
|
}
|
|
8163
8510
|
this.wasSpeaking = isSpeech;
|
|
8164
|
-
|
|
8511
|
+
logger14.trace("VAD worker inference completed", {
|
|
8165
8512
|
probability: Math.round(result.probability * 1e3) / 1e3,
|
|
8166
8513
|
isSpeech,
|
|
8167
8514
|
inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
|
|
@@ -8229,63 +8576,65 @@ var SileroVADWorker = class {
|
|
|
8229
8576
|
};
|
|
8230
8577
|
|
|
8231
8578
|
// src/inference/createSileroVAD.ts
|
|
8232
|
-
var
|
|
8579
|
+
var logger15 = createLogger("createSileroVAD");
|
|
8233
8580
|
function supportsVADWorker() {
|
|
8234
8581
|
if (typeof Worker === "undefined") {
|
|
8235
|
-
|
|
8582
|
+
logger15.debug("Worker not supported: Worker constructor undefined");
|
|
8236
8583
|
return false;
|
|
8237
8584
|
}
|
|
8238
8585
|
if (typeof URL === "undefined" || typeof URL.createObjectURL === "undefined") {
|
|
8239
|
-
|
|
8586
|
+
logger15.debug("Worker not supported: URL.createObjectURL unavailable");
|
|
8240
8587
|
return false;
|
|
8241
8588
|
}
|
|
8242
8589
|
if (typeof Blob === "undefined") {
|
|
8243
|
-
|
|
8590
|
+
logger15.debug("Worker not supported: Blob constructor unavailable");
|
|
8244
8591
|
return false;
|
|
8245
8592
|
}
|
|
8246
8593
|
return true;
|
|
8247
8594
|
}
|
|
8248
|
-
function createSileroVAD(config) {
|
|
8595
|
+
function createSileroVAD(config = {}) {
|
|
8596
|
+
const modelUrl = config.modelUrl ?? DEFAULT_MODEL_URLS.sileroVad;
|
|
8597
|
+
const resolvedConfig = { ...config, modelUrl };
|
|
8249
8598
|
if (config.unifiedWorker) {
|
|
8250
|
-
|
|
8251
|
-
return new SileroVADUnifiedAdapter(config.unifiedWorker,
|
|
8599
|
+
logger15.info("Creating SileroVADUnifiedAdapter (shared unified worker)");
|
|
8600
|
+
return new SileroVADUnifiedAdapter(config.unifiedWorker, resolvedConfig);
|
|
8252
8601
|
}
|
|
8253
8602
|
const fallbackOnError = config.fallbackOnError ?? true;
|
|
8254
8603
|
let useWorker;
|
|
8255
8604
|
if (config.useWorker !== void 0) {
|
|
8256
8605
|
useWorker = config.useWorker;
|
|
8257
|
-
|
|
8606
|
+
logger15.debug("Worker preference explicitly set", { useWorker });
|
|
8258
8607
|
} else {
|
|
8259
8608
|
const workerSupported = supportsVADWorker();
|
|
8260
8609
|
const onMobile = isMobile();
|
|
8261
8610
|
useWorker = workerSupported && !onMobile;
|
|
8262
|
-
|
|
8611
|
+
logger15.debug("Auto-detected Worker preference", {
|
|
8263
8612
|
useWorker,
|
|
8264
8613
|
workerSupported,
|
|
8265
8614
|
onMobile
|
|
8266
8615
|
});
|
|
8267
8616
|
}
|
|
8268
8617
|
if (useWorker) {
|
|
8269
|
-
|
|
8618
|
+
logger15.info("Creating SileroVADWorker (off-main-thread)");
|
|
8270
8619
|
const worker = new SileroVADWorker({
|
|
8271
|
-
modelUrl
|
|
8620
|
+
modelUrl,
|
|
8272
8621
|
sampleRate: config.sampleRate,
|
|
8273
8622
|
threshold: config.threshold,
|
|
8274
8623
|
preSpeechBufferChunks: config.preSpeechBufferChunks
|
|
8275
8624
|
});
|
|
8276
8625
|
if (fallbackOnError) {
|
|
8277
|
-
return new VADWorkerWithFallback(worker,
|
|
8626
|
+
return new VADWorkerWithFallback(worker, resolvedConfig);
|
|
8278
8627
|
}
|
|
8279
8628
|
return worker;
|
|
8280
8629
|
}
|
|
8281
|
-
|
|
8282
|
-
return new SileroVADInference(
|
|
8630
|
+
logger15.info("Creating SileroVADInference (main thread)");
|
|
8631
|
+
return new SileroVADInference(resolvedConfig);
|
|
8283
8632
|
}
|
|
8284
8633
|
var VADWorkerWithFallback = class {
|
|
8285
|
-
constructor(worker,
|
|
8634
|
+
constructor(worker, resolvedConfig) {
|
|
8286
8635
|
this.hasFallenBack = false;
|
|
8287
8636
|
this.implementation = worker;
|
|
8288
|
-
this.
|
|
8637
|
+
this.resolvedConfig = resolvedConfig;
|
|
8289
8638
|
}
|
|
8290
8639
|
get backend() {
|
|
8291
8640
|
if (!this.isLoaded) return null;
|
|
@@ -8304,16 +8653,16 @@ var VADWorkerWithFallback = class {
|
|
|
8304
8653
|
try {
|
|
8305
8654
|
return await this.implementation.load();
|
|
8306
8655
|
} catch (error) {
|
|
8307
|
-
|
|
8656
|
+
logger15.warn("Worker load failed, falling back to main thread", {
|
|
8308
8657
|
error: error instanceof Error ? error.message : String(error)
|
|
8309
8658
|
});
|
|
8310
8659
|
try {
|
|
8311
8660
|
await this.implementation.dispose();
|
|
8312
8661
|
} catch {
|
|
8313
8662
|
}
|
|
8314
|
-
this.implementation = new SileroVADInference(this.
|
|
8663
|
+
this.implementation = new SileroVADInference(this.resolvedConfig);
|
|
8315
8664
|
this.hasFallenBack = true;
|
|
8316
|
-
|
|
8665
|
+
logger15.info("Fallback to SileroVADInference successful");
|
|
8317
8666
|
return await this.implementation.load();
|
|
8318
8667
|
}
|
|
8319
8668
|
}
|
|
@@ -8335,7 +8684,7 @@ var VADWorkerWithFallback = class {
|
|
|
8335
8684
|
};
|
|
8336
8685
|
|
|
8337
8686
|
// src/inference/A2EOrchestrator.ts
|
|
8338
|
-
var
|
|
8687
|
+
var logger16 = createLogger("A2EOrchestrator");
|
|
8339
8688
|
var A2EOrchestrator = class {
|
|
8340
8689
|
constructor(config) {
|
|
8341
8690
|
this.a2e = null;
|
|
@@ -8376,7 +8725,7 @@ var A2EOrchestrator = class {
|
|
|
8376
8725
|
*/
|
|
8377
8726
|
async load() {
|
|
8378
8727
|
if (this.disposed) throw new Error("A2EOrchestrator has been disposed");
|
|
8379
|
-
|
|
8728
|
+
logger16.info("Loading A2E model...");
|
|
8380
8729
|
this.a2e = createA2E({
|
|
8381
8730
|
gpuModelUrl: this.config.gpuModelUrl,
|
|
8382
8731
|
gpuExternalDataUrl: this.config.gpuExternalDataUrl,
|
|
@@ -8393,7 +8742,7 @@ var A2EOrchestrator = class {
|
|
|
8393
8742
|
onError: this.config.onError
|
|
8394
8743
|
});
|
|
8395
8744
|
this._isReady = true;
|
|
8396
|
-
|
|
8745
|
+
logger16.info("A2E model loaded", {
|
|
8397
8746
|
backend: info.backend,
|
|
8398
8747
|
loadTimeMs: info.loadTimeMs,
|
|
8399
8748
|
modelId: this.a2e.modelId
|
|
@@ -8448,10 +8797,10 @@ var A2EOrchestrator = class {
|
|
|
8448
8797
|
this.scriptProcessor.connect(this.audioContext.destination);
|
|
8449
8798
|
this._isStreaming = true;
|
|
8450
8799
|
this.processor.startDrip();
|
|
8451
|
-
|
|
8800
|
+
logger16.info("Mic capture started", { sampleRate: this.nativeSampleRate });
|
|
8452
8801
|
} catch (err) {
|
|
8453
8802
|
const error = err instanceof Error ? err : new Error(String(err));
|
|
8454
|
-
|
|
8803
|
+
logger16.error("Failed to start mic capture", { error: error.message });
|
|
8455
8804
|
this.config.onError?.(error);
|
|
8456
8805
|
throw error;
|
|
8457
8806
|
}
|
|
@@ -8479,7 +8828,7 @@ var A2EOrchestrator = class {
|
|
|
8479
8828
|
});
|
|
8480
8829
|
this.audioContext = null;
|
|
8481
8830
|
}
|
|
8482
|
-
|
|
8831
|
+
logger16.info("Mic capture stopped");
|
|
8483
8832
|
}
|
|
8484
8833
|
/**
|
|
8485
8834
|
* Dispose of all resources
|
|
@@ -8502,7 +8851,7 @@ var A2EOrchestrator = class {
|
|
|
8502
8851
|
};
|
|
8503
8852
|
|
|
8504
8853
|
// src/inference/SafariSpeechRecognition.ts
|
|
8505
|
-
var
|
|
8854
|
+
var logger17 = createLogger("SafariSpeech");
|
|
8506
8855
|
var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
8507
8856
|
constructor(config = {}) {
|
|
8508
8857
|
this.recognition = null;
|
|
@@ -8521,7 +8870,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
8521
8870
|
interimResults: config.interimResults ?? true,
|
|
8522
8871
|
maxAlternatives: config.maxAlternatives ?? 1
|
|
8523
8872
|
};
|
|
8524
|
-
|
|
8873
|
+
logger17.debug("SafariSpeechRecognition created", {
|
|
8525
8874
|
language: this.config.language,
|
|
8526
8875
|
continuous: this.config.continuous
|
|
8527
8876
|
});
|
|
@@ -8582,7 +8931,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
8582
8931
|
*/
|
|
8583
8932
|
async start() {
|
|
8584
8933
|
if (this.isListening) {
|
|
8585
|
-
|
|
8934
|
+
logger17.warn("Already listening");
|
|
8586
8935
|
return;
|
|
8587
8936
|
}
|
|
8588
8937
|
if (!_SafariSpeechRecognition.isAvailable()) {
|
|
@@ -8612,7 +8961,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
8612
8961
|
this.isListening = true;
|
|
8613
8962
|
this.startTime = performance.now();
|
|
8614
8963
|
this.accumulatedText = "";
|
|
8615
|
-
|
|
8964
|
+
logger17.info("Speech recognition started", {
|
|
8616
8965
|
language: this.config.language
|
|
8617
8966
|
});
|
|
8618
8967
|
span?.end();
|
|
@@ -8627,7 +8976,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
8627
8976
|
*/
|
|
8628
8977
|
async stop() {
|
|
8629
8978
|
if (!this.isListening || !this.recognition) {
|
|
8630
|
-
|
|
8979
|
+
logger17.warn("Not currently listening");
|
|
8631
8980
|
return {
|
|
8632
8981
|
text: this.accumulatedText,
|
|
8633
8982
|
language: this.config.language,
|
|
@@ -8656,7 +9005,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
8656
9005
|
if (this.recognition && this.isListening) {
|
|
8657
9006
|
this.recognition.abort();
|
|
8658
9007
|
this.isListening = false;
|
|
8659
|
-
|
|
9008
|
+
logger17.info("Speech recognition aborted");
|
|
8660
9009
|
}
|
|
8661
9010
|
}
|
|
8662
9011
|
/**
|
|
@@ -8687,7 +9036,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
8687
9036
|
this.isListening = false;
|
|
8688
9037
|
this.resultCallbacks = [];
|
|
8689
9038
|
this.errorCallbacks = [];
|
|
8690
|
-
|
|
9039
|
+
logger17.debug("SafariSpeechRecognition disposed");
|
|
8691
9040
|
}
|
|
8692
9041
|
/**
|
|
8693
9042
|
* Set up event handlers for the recognition instance
|
|
@@ -8715,7 +9064,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
8715
9064
|
confidence: alternative.confidence
|
|
8716
9065
|
};
|
|
8717
9066
|
this.emitResult(speechResult);
|
|
8718
|
-
|
|
9067
|
+
logger17.trace("Speech result", {
|
|
8719
9068
|
text: text.substring(0, 50),
|
|
8720
9069
|
isFinal,
|
|
8721
9070
|
confidence: alternative.confidence
|
|
@@ -8725,12 +9074,12 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
8725
9074
|
span?.end();
|
|
8726
9075
|
} catch (error) {
|
|
8727
9076
|
span?.endWithError(error instanceof Error ? error : new Error(String(error)));
|
|
8728
|
-
|
|
9077
|
+
logger17.error("Error processing speech result", { error });
|
|
8729
9078
|
}
|
|
8730
9079
|
};
|
|
8731
9080
|
this.recognition.onerror = (event) => {
|
|
8732
9081
|
const error = new Error(`Speech recognition error: ${event.error} - ${event.message}`);
|
|
8733
|
-
|
|
9082
|
+
logger17.error("Speech recognition error", { error: event.error, message: event.message });
|
|
8734
9083
|
this.emitError(error);
|
|
8735
9084
|
if (this.stopRejecter) {
|
|
8736
9085
|
this.stopRejecter(error);
|
|
@@ -8740,7 +9089,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
8740
9089
|
};
|
|
8741
9090
|
this.recognition.onend = () => {
|
|
8742
9091
|
this.isListening = false;
|
|
8743
|
-
|
|
9092
|
+
logger17.info("Speech recognition ended", {
|
|
8744
9093
|
totalText: this.accumulatedText.length,
|
|
8745
9094
|
durationMs: performance.now() - this.startTime
|
|
8746
9095
|
});
|
|
@@ -8757,13 +9106,13 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
8757
9106
|
}
|
|
8758
9107
|
};
|
|
8759
9108
|
this.recognition.onstart = () => {
|
|
8760
|
-
|
|
9109
|
+
logger17.debug("Speech recognition started by browser");
|
|
8761
9110
|
};
|
|
8762
9111
|
this.recognition.onspeechstart = () => {
|
|
8763
|
-
|
|
9112
|
+
logger17.debug("Speech detected");
|
|
8764
9113
|
};
|
|
8765
9114
|
this.recognition.onspeechend = () => {
|
|
8766
|
-
|
|
9115
|
+
logger17.debug("Speech ended");
|
|
8767
9116
|
};
|
|
8768
9117
|
}
|
|
8769
9118
|
/**
|
|
@@ -8774,7 +9123,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
8774
9123
|
try {
|
|
8775
9124
|
callback(result);
|
|
8776
9125
|
} catch (error) {
|
|
8777
|
-
|
|
9126
|
+
logger17.error("Error in result callback", { error });
|
|
8778
9127
|
}
|
|
8779
9128
|
}
|
|
8780
9129
|
}
|
|
@@ -8786,7 +9135,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
8786
9135
|
try {
|
|
8787
9136
|
callback(error);
|
|
8788
9137
|
} catch (callbackError) {
|
|
8789
|
-
|
|
9138
|
+
logger17.error("Error in error callback", { error: callbackError });
|
|
8790
9139
|
}
|
|
8791
9140
|
}
|
|
8792
9141
|
}
|
|
@@ -9356,338 +9705,32 @@ var AnimationGraph = class extends EventEmitter {
|
|
|
9356
9705
|
}
|
|
9357
9706
|
};
|
|
9358
9707
|
|
|
9359
|
-
// src/animation/simplex2d.ts
|
|
9360
|
-
var perm = new Uint8Array(512);
|
|
9361
|
-
var grad2 = [
|
|
9362
|
-
[1, 1],
|
|
9363
|
-
[-1, 1],
|
|
9364
|
-
[1, -1],
|
|
9365
|
-
[-1, -1],
|
|
9366
|
-
[1, 0],
|
|
9367
|
-
[-1, 0],
|
|
9368
|
-
[0, 1],
|
|
9369
|
-
[0, -1]
|
|
9370
|
-
];
|
|
9371
|
-
var p = [
|
|
9372
|
-
151,
|
|
9373
|
-
160,
|
|
9374
|
-
137,
|
|
9375
|
-
91,
|
|
9376
|
-
90,
|
|
9377
|
-
15,
|
|
9378
|
-
131,
|
|
9379
|
-
13,
|
|
9380
|
-
201,
|
|
9381
|
-
95,
|
|
9382
|
-
96,
|
|
9383
|
-
53,
|
|
9384
|
-
194,
|
|
9385
|
-
233,
|
|
9386
|
-
7,
|
|
9387
|
-
225,
|
|
9388
|
-
140,
|
|
9389
|
-
36,
|
|
9390
|
-
103,
|
|
9391
|
-
30,
|
|
9392
|
-
69,
|
|
9393
|
-
142,
|
|
9394
|
-
8,
|
|
9395
|
-
99,
|
|
9396
|
-
37,
|
|
9397
|
-
240,
|
|
9398
|
-
21,
|
|
9399
|
-
10,
|
|
9400
|
-
23,
|
|
9401
|
-
190,
|
|
9402
|
-
6,
|
|
9403
|
-
148,
|
|
9404
|
-
247,
|
|
9405
|
-
120,
|
|
9406
|
-
234,
|
|
9407
|
-
75,
|
|
9408
|
-
0,
|
|
9409
|
-
26,
|
|
9410
|
-
197,
|
|
9411
|
-
62,
|
|
9412
|
-
94,
|
|
9413
|
-
252,
|
|
9414
|
-
219,
|
|
9415
|
-
203,
|
|
9416
|
-
117,
|
|
9417
|
-
35,
|
|
9418
|
-
11,
|
|
9419
|
-
32,
|
|
9420
|
-
57,
|
|
9421
|
-
177,
|
|
9422
|
-
33,
|
|
9423
|
-
88,
|
|
9424
|
-
237,
|
|
9425
|
-
149,
|
|
9426
|
-
56,
|
|
9427
|
-
87,
|
|
9428
|
-
174,
|
|
9429
|
-
20,
|
|
9430
|
-
125,
|
|
9431
|
-
136,
|
|
9432
|
-
171,
|
|
9433
|
-
168,
|
|
9434
|
-
68,
|
|
9435
|
-
175,
|
|
9436
|
-
74,
|
|
9437
|
-
165,
|
|
9438
|
-
71,
|
|
9439
|
-
134,
|
|
9440
|
-
139,
|
|
9441
|
-
48,
|
|
9442
|
-
27,
|
|
9443
|
-
166,
|
|
9444
|
-
77,
|
|
9445
|
-
146,
|
|
9446
|
-
158,
|
|
9447
|
-
231,
|
|
9448
|
-
83,
|
|
9449
|
-
111,
|
|
9450
|
-
229,
|
|
9451
|
-
122,
|
|
9452
|
-
60,
|
|
9453
|
-
211,
|
|
9454
|
-
133,
|
|
9455
|
-
230,
|
|
9456
|
-
220,
|
|
9457
|
-
105,
|
|
9458
|
-
92,
|
|
9459
|
-
41,
|
|
9460
|
-
55,
|
|
9461
|
-
46,
|
|
9462
|
-
245,
|
|
9463
|
-
40,
|
|
9464
|
-
244,
|
|
9465
|
-
102,
|
|
9466
|
-
143,
|
|
9467
|
-
54,
|
|
9468
|
-
65,
|
|
9469
|
-
25,
|
|
9470
|
-
63,
|
|
9471
|
-
161,
|
|
9472
|
-
1,
|
|
9473
|
-
216,
|
|
9474
|
-
80,
|
|
9475
|
-
73,
|
|
9476
|
-
209,
|
|
9477
|
-
76,
|
|
9478
|
-
132,
|
|
9479
|
-
187,
|
|
9480
|
-
208,
|
|
9481
|
-
89,
|
|
9482
|
-
18,
|
|
9483
|
-
169,
|
|
9484
|
-
200,
|
|
9485
|
-
196,
|
|
9486
|
-
135,
|
|
9487
|
-
130,
|
|
9488
|
-
116,
|
|
9489
|
-
188,
|
|
9490
|
-
159,
|
|
9491
|
-
86,
|
|
9492
|
-
164,
|
|
9493
|
-
100,
|
|
9494
|
-
109,
|
|
9495
|
-
198,
|
|
9496
|
-
173,
|
|
9497
|
-
186,
|
|
9498
|
-
3,
|
|
9499
|
-
64,
|
|
9500
|
-
52,
|
|
9501
|
-
217,
|
|
9502
|
-
226,
|
|
9503
|
-
250,
|
|
9504
|
-
124,
|
|
9505
|
-
123,
|
|
9506
|
-
5,
|
|
9507
|
-
202,
|
|
9508
|
-
38,
|
|
9509
|
-
147,
|
|
9510
|
-
118,
|
|
9511
|
-
126,
|
|
9512
|
-
255,
|
|
9513
|
-
82,
|
|
9514
|
-
85,
|
|
9515
|
-
212,
|
|
9516
|
-
207,
|
|
9517
|
-
206,
|
|
9518
|
-
59,
|
|
9519
|
-
227,
|
|
9520
|
-
47,
|
|
9521
|
-
16,
|
|
9522
|
-
58,
|
|
9523
|
-
17,
|
|
9524
|
-
182,
|
|
9525
|
-
189,
|
|
9526
|
-
28,
|
|
9527
|
-
42,
|
|
9528
|
-
223,
|
|
9529
|
-
183,
|
|
9530
|
-
170,
|
|
9531
|
-
213,
|
|
9532
|
-
119,
|
|
9533
|
-
248,
|
|
9534
|
-
152,
|
|
9535
|
-
2,
|
|
9536
|
-
44,
|
|
9537
|
-
154,
|
|
9538
|
-
163,
|
|
9539
|
-
70,
|
|
9540
|
-
221,
|
|
9541
|
-
153,
|
|
9542
|
-
101,
|
|
9543
|
-
155,
|
|
9544
|
-
167,
|
|
9545
|
-
43,
|
|
9546
|
-
172,
|
|
9547
|
-
9,
|
|
9548
|
-
129,
|
|
9549
|
-
22,
|
|
9550
|
-
39,
|
|
9551
|
-
253,
|
|
9552
|
-
19,
|
|
9553
|
-
98,
|
|
9554
|
-
108,
|
|
9555
|
-
110,
|
|
9556
|
-
79,
|
|
9557
|
-
113,
|
|
9558
|
-
224,
|
|
9559
|
-
232,
|
|
9560
|
-
178,
|
|
9561
|
-
185,
|
|
9562
|
-
112,
|
|
9563
|
-
104,
|
|
9564
|
-
218,
|
|
9565
|
-
246,
|
|
9566
|
-
97,
|
|
9567
|
-
228,
|
|
9568
|
-
251,
|
|
9569
|
-
34,
|
|
9570
|
-
242,
|
|
9571
|
-
193,
|
|
9572
|
-
238,
|
|
9573
|
-
210,
|
|
9574
|
-
144,
|
|
9575
|
-
12,
|
|
9576
|
-
191,
|
|
9577
|
-
179,
|
|
9578
|
-
162,
|
|
9579
|
-
241,
|
|
9580
|
-
81,
|
|
9581
|
-
51,
|
|
9582
|
-
145,
|
|
9583
|
-
235,
|
|
9584
|
-
249,
|
|
9585
|
-
14,
|
|
9586
|
-
239,
|
|
9587
|
-
107,
|
|
9588
|
-
49,
|
|
9589
|
-
192,
|
|
9590
|
-
214,
|
|
9591
|
-
31,
|
|
9592
|
-
181,
|
|
9593
|
-
199,
|
|
9594
|
-
106,
|
|
9595
|
-
157,
|
|
9596
|
-
184,
|
|
9597
|
-
84,
|
|
9598
|
-
204,
|
|
9599
|
-
176,
|
|
9600
|
-
115,
|
|
9601
|
-
121,
|
|
9602
|
-
50,
|
|
9603
|
-
45,
|
|
9604
|
-
127,
|
|
9605
|
-
4,
|
|
9606
|
-
150,
|
|
9607
|
-
254,
|
|
9608
|
-
138,
|
|
9609
|
-
236,
|
|
9610
|
-
205,
|
|
9611
|
-
93,
|
|
9612
|
-
222,
|
|
9613
|
-
114,
|
|
9614
|
-
67,
|
|
9615
|
-
29,
|
|
9616
|
-
24,
|
|
9617
|
-
72,
|
|
9618
|
-
243,
|
|
9619
|
-
141,
|
|
9620
|
-
128,
|
|
9621
|
-
195,
|
|
9622
|
-
78,
|
|
9623
|
-
66,
|
|
9624
|
-
215,
|
|
9625
|
-
61,
|
|
9626
|
-
156,
|
|
9627
|
-
180
|
|
9628
|
-
];
|
|
9629
|
-
for (let i = 0; i < 256; i++) {
|
|
9630
|
-
perm[i] = p[i];
|
|
9631
|
-
perm[i + 256] = p[i];
|
|
9632
|
-
}
|
|
9633
|
-
var F2 = 0.5 * (Math.sqrt(3) - 1);
|
|
9634
|
-
var G2 = (3 - Math.sqrt(3)) / 6;
|
|
9635
|
-
function dot2(g, x, y) {
|
|
9636
|
-
return g[0] * x + g[1] * y;
|
|
9637
|
-
}
|
|
9638
|
-
function simplex2d(x, y) {
|
|
9639
|
-
const s = (x + y) * F2;
|
|
9640
|
-
const i = Math.floor(x + s);
|
|
9641
|
-
const j = Math.floor(y + s);
|
|
9642
|
-
const t = (i + j) * G2;
|
|
9643
|
-
const X0 = i - t;
|
|
9644
|
-
const Y0 = j - t;
|
|
9645
|
-
const x0 = x - X0;
|
|
9646
|
-
const y0 = y - Y0;
|
|
9647
|
-
const i1 = x0 > y0 ? 1 : 0;
|
|
9648
|
-
const j1 = x0 > y0 ? 0 : 1;
|
|
9649
|
-
const x1 = x0 - i1 + G2;
|
|
9650
|
-
const y1 = y0 - j1 + G2;
|
|
9651
|
-
const x2 = x0 - 1 + 2 * G2;
|
|
9652
|
-
const y2 = y0 - 1 + 2 * G2;
|
|
9653
|
-
const ii = i & 255;
|
|
9654
|
-
const jj = j & 255;
|
|
9655
|
-
const gi0 = perm[ii + perm[jj]] % 8;
|
|
9656
|
-
const gi1 = perm[ii + i1 + perm[jj + j1]] % 8;
|
|
9657
|
-
const gi2 = perm[ii + 1 + perm[jj + 1]] % 8;
|
|
9658
|
-
let n0 = 0;
|
|
9659
|
-
let t0 = 0.5 - x0 * x0 - y0 * y0;
|
|
9660
|
-
if (t0 >= 0) {
|
|
9661
|
-
t0 *= t0;
|
|
9662
|
-
n0 = t0 * t0 * dot2(grad2[gi0], x0, y0);
|
|
9663
|
-
}
|
|
9664
|
-
let n1 = 0;
|
|
9665
|
-
let t1 = 0.5 - x1 * x1 - y1 * y1;
|
|
9666
|
-
if (t1 >= 0) {
|
|
9667
|
-
t1 *= t1;
|
|
9668
|
-
n1 = t1 * t1 * dot2(grad2[gi1], x1, y1);
|
|
9669
|
-
}
|
|
9670
|
-
let n2 = 0;
|
|
9671
|
-
let t2 = 0.5 - x2 * x2 - y2 * y2;
|
|
9672
|
-
if (t2 >= 0) {
|
|
9673
|
-
t2 *= t2;
|
|
9674
|
-
n2 = t2 * t2 * dot2(grad2[gi2], x2, y2);
|
|
9675
|
-
}
|
|
9676
|
-
return 70 * (n0 + n1 + n2);
|
|
9677
|
-
}
|
|
9678
|
-
|
|
9679
9708
|
// src/animation/ProceduralLifeLayer.ts
|
|
9709
|
+
import { createNoise2D } from "simplex-noise";
|
|
9710
|
+
var simplex2d = createNoise2D();
|
|
9711
|
+
var LIFE_BS_INDEX = /* @__PURE__ */ new Map();
|
|
9712
|
+
for (let i = 0; i < LAM_BLENDSHAPES.length; i++) {
|
|
9713
|
+
LIFE_BS_INDEX.set(LAM_BLENDSHAPES[i], i);
|
|
9714
|
+
}
|
|
9680
9715
|
var PHASE_OPEN = 0;
|
|
9681
9716
|
var PHASE_CLOSING = 1;
|
|
9682
9717
|
var PHASE_CLOSED = 2;
|
|
9683
9718
|
var PHASE_OPENING = 3;
|
|
9684
|
-
var BLINK_CLOSE_DURATION = 0.
|
|
9719
|
+
var BLINK_CLOSE_DURATION = 0.092;
|
|
9685
9720
|
var BLINK_HOLD_DURATION = 0.04;
|
|
9686
|
-
var BLINK_OPEN_DURATION = 0.
|
|
9721
|
+
var BLINK_OPEN_DURATION = 0.242;
|
|
9687
9722
|
var BLINK_ASYMMETRY_DELAY = 8e-3;
|
|
9723
|
+
var BLINK_IBI_MU = Math.log(5.97);
|
|
9724
|
+
var BLINK_IBI_SIGMA = 0.89;
|
|
9688
9725
|
var GAZE_BREAK_DURATION = 0.12;
|
|
9689
9726
|
var GAZE_BREAK_HOLD_DURATION = 0.3;
|
|
9690
9727
|
var GAZE_BREAK_RETURN_DURATION = 0.15;
|
|
9728
|
+
var GAZE_STATE_PARAMS = {
|
|
9729
|
+
idle: { interval: [2, 5], amplitude: [0.15, 0.4] },
|
|
9730
|
+
listening: { interval: [4, 10], amplitude: [0.1, 0.25] },
|
|
9731
|
+
thinking: { interval: [1, 3], amplitude: [0.2, 0.5] },
|
|
9732
|
+
speaking: { interval: [2, 6], amplitude: [0.15, 0.35] }
|
|
9733
|
+
};
|
|
9691
9734
|
var EYE_NOISE_X_FREQ = 0.8;
|
|
9692
9735
|
var EYE_NOISE_Y_FREQ = 0.6;
|
|
9693
9736
|
var EYE_NOISE_X_PHASE = 73.1;
|
|
@@ -9715,6 +9758,12 @@ function smoothStep(t) {
|
|
|
9715
9758
|
function softClamp(v, max) {
|
|
9716
9759
|
return Math.tanh(v / max) * max;
|
|
9717
9760
|
}
|
|
9761
|
+
function sampleLogNormal(mu, sigma) {
|
|
9762
|
+
const u1 = Math.random();
|
|
9763
|
+
const u2 = Math.random();
|
|
9764
|
+
const z = Math.sqrt(-2 * Math.log(u1 || 1e-10)) * Math.cos(2 * Math.PI * u2);
|
|
9765
|
+
return Math.exp(mu + sigma * z);
|
|
9766
|
+
}
|
|
9718
9767
|
var ProceduralLifeLayer = class {
|
|
9719
9768
|
constructor(config) {
|
|
9720
9769
|
// Blink state
|
|
@@ -9727,7 +9776,7 @@ var ProceduralLifeLayer = class {
|
|
|
9727
9776
|
// Eye contact (smoothed)
|
|
9728
9777
|
this.smoothedEyeX = 0;
|
|
9729
9778
|
this.smoothedEyeY = 0;
|
|
9730
|
-
// Eye micro-motion
|
|
9779
|
+
// Eye micro-motion
|
|
9731
9780
|
this.eyeNoiseTime = 0;
|
|
9732
9781
|
// Gaze break state
|
|
9733
9782
|
this.gazeBreakTimer = 0;
|
|
@@ -9737,6 +9786,8 @@ var ProceduralLifeLayer = class {
|
|
|
9737
9786
|
this.gazeBreakTargetY = 0;
|
|
9738
9787
|
this.gazeBreakCurrentX = 0;
|
|
9739
9788
|
this.gazeBreakCurrentY = 0;
|
|
9789
|
+
// Conversational state for gaze
|
|
9790
|
+
this.currentState = null;
|
|
9740
9791
|
// Breathing / postural sway
|
|
9741
9792
|
this.microMotionTime = 0;
|
|
9742
9793
|
this.breathingPhase = 0;
|
|
@@ -9745,6 +9796,7 @@ var ProceduralLifeLayer = class {
|
|
|
9745
9796
|
this.previousEnergy = 0;
|
|
9746
9797
|
this.emphasisLevel = 0;
|
|
9747
9798
|
this.blinkIntervalRange = config?.blinkIntervalRange ?? [2.5, 6];
|
|
9799
|
+
this.useLogNormalBlinks = !config?.blinkIntervalRange;
|
|
9748
9800
|
this.gazeBreakIntervalRange = config?.gazeBreakIntervalRange ?? [3, 8];
|
|
9749
9801
|
this.gazeBreakAmplitudeRange = config?.gazeBreakAmplitudeRange ?? [0.15, 0.4];
|
|
9750
9802
|
this.eyeNoiseAmplitude = config?.eyeNoiseAmplitude ?? 0.06;
|
|
@@ -9754,7 +9806,7 @@ var ProceduralLifeLayer = class {
|
|
|
9754
9806
|
this.posturalSwayAmplitude = config?.posturalSwayAmplitude ?? 2e-3;
|
|
9755
9807
|
this.eyeMaxDeviation = config?.eyeMaxDeviation ?? 0.8;
|
|
9756
9808
|
this.eyeSmoothing = config?.eyeSmoothing ?? 15;
|
|
9757
|
-
this.blinkInterval =
|
|
9809
|
+
this.blinkInterval = this.nextBlinkInterval();
|
|
9758
9810
|
this.gazeBreakInterval = randomRange(...this.gazeBreakIntervalRange);
|
|
9759
9811
|
}
|
|
9760
9812
|
/**
|
|
@@ -9769,6 +9821,7 @@ var ProceduralLifeLayer = class {
|
|
|
9769
9821
|
const eyeTargetY = input?.eyeTargetY ?? 0;
|
|
9770
9822
|
const audioEnergy = input?.audioEnergy ?? 0;
|
|
9771
9823
|
const isSpeaking = input?.isSpeaking ?? false;
|
|
9824
|
+
this.currentState = input?.state ?? null;
|
|
9772
9825
|
const safeDelta = Math.min(delta, 0.1);
|
|
9773
9826
|
const blendshapes = {};
|
|
9774
9827
|
this.updateBlinks(delta);
|
|
@@ -9807,6 +9860,12 @@ var ProceduralLifeLayer = class {
|
|
|
9807
9860
|
const swayAmp = this.posturalSwayAmplitude;
|
|
9808
9861
|
const swayX = Math.sin(this.microMotionTime * 0.7) * swayAmp + Math.sin(this.microMotionTime * 1.3) * swayAmp * 0.5;
|
|
9809
9862
|
const swayY = Math.sin(this.microMotionTime * 0.5) * swayAmp * 0.75 + Math.sin(this.microMotionTime * 0.9) * swayAmp * 0.5;
|
|
9863
|
+
const breathVal = Math.sin(this.breathingPhase);
|
|
9864
|
+
if (breathVal > 0) {
|
|
9865
|
+
blendshapes["jawOpen"] = breathVal * 0.015;
|
|
9866
|
+
blendshapes["noseSneerLeft"] = breathVal * 8e-3;
|
|
9867
|
+
blendshapes["noseSneerRight"] = breathVal * 8e-3;
|
|
9868
|
+
}
|
|
9810
9869
|
return {
|
|
9811
9870
|
blendshapes,
|
|
9812
9871
|
headDelta: {
|
|
@@ -9815,12 +9874,35 @@ var ProceduralLifeLayer = class {
|
|
|
9815
9874
|
}
|
|
9816
9875
|
};
|
|
9817
9876
|
}
|
|
9877
|
+
/**
|
|
9878
|
+
* Write life layer output directly to a Float32Array[52] in LAM_BLENDSHAPES order.
|
|
9879
|
+
*
|
|
9880
|
+
* Includes micro-jitter (0.4% amplitude simplex noise on all channels) to
|
|
9881
|
+
* break uncanny stillness on undriven channels.
|
|
9882
|
+
*
|
|
9883
|
+
* @param delta - Time since last frame in seconds
|
|
9884
|
+
* @param input - Per-frame input
|
|
9885
|
+
* @param out - Pre-allocated Float32Array(52) to write into
|
|
9886
|
+
*/
|
|
9887
|
+
updateToArray(delta, input, out) {
|
|
9888
|
+
out.fill(0);
|
|
9889
|
+
const result = this.update(delta, input);
|
|
9890
|
+
for (const [name, value] of Object.entries(result.blendshapes)) {
|
|
9891
|
+
const idx = LIFE_BS_INDEX.get(name);
|
|
9892
|
+
if (idx !== void 0) {
|
|
9893
|
+
out[idx] = value;
|
|
9894
|
+
}
|
|
9895
|
+
}
|
|
9896
|
+
for (let i = 0; i < 52; i++) {
|
|
9897
|
+
out[i] += simplex2d(this.noiseTime * 0.3, i * 7.13) * 4e-3;
|
|
9898
|
+
}
|
|
9899
|
+
}
|
|
9818
9900
|
/**
|
|
9819
9901
|
* Reset all internal state to initial values.
|
|
9820
9902
|
*/
|
|
9821
9903
|
reset() {
|
|
9822
9904
|
this.blinkTimer = 0;
|
|
9823
|
-
this.blinkInterval =
|
|
9905
|
+
this.blinkInterval = this.nextBlinkInterval();
|
|
9824
9906
|
this.blinkPhase = PHASE_OPEN;
|
|
9825
9907
|
this.blinkProgress = 0;
|
|
9826
9908
|
this.asymmetryRight = 0.97;
|
|
@@ -9837,6 +9919,7 @@ var ProceduralLifeLayer = class {
|
|
|
9837
9919
|
this.gazeBreakTargetY = 0;
|
|
9838
9920
|
this.gazeBreakCurrentX = 0;
|
|
9839
9921
|
this.gazeBreakCurrentY = 0;
|
|
9922
|
+
this.currentState = null;
|
|
9840
9923
|
this.microMotionTime = 0;
|
|
9841
9924
|
this.breathingPhase = 0;
|
|
9842
9925
|
this.noiseTime = 0;
|
|
@@ -9844,6 +9927,21 @@ var ProceduralLifeLayer = class {
|
|
|
9844
9927
|
this.emphasisLevel = 0;
|
|
9845
9928
|
}
|
|
9846
9929
|
// =====================================================================
|
|
9930
|
+
// PRIVATE: Blink interval sampling
|
|
9931
|
+
// =====================================================================
|
|
9932
|
+
/**
|
|
9933
|
+
* Sample next blink interval.
|
|
9934
|
+
* Uses log-normal distribution (PMC3565584) when using default config,
|
|
9935
|
+
* or uniform random when custom blinkIntervalRange is provided.
|
|
9936
|
+
*/
|
|
9937
|
+
nextBlinkInterval() {
|
|
9938
|
+
if (this.useLogNormalBlinks) {
|
|
9939
|
+
const sample = sampleLogNormal(BLINK_IBI_MU, BLINK_IBI_SIGMA);
|
|
9940
|
+
return clamp(sample, 1.5, 12);
|
|
9941
|
+
}
|
|
9942
|
+
return randomRange(...this.blinkIntervalRange);
|
|
9943
|
+
}
|
|
9944
|
+
// =====================================================================
|
|
9847
9945
|
// PRIVATE: Blink system
|
|
9848
9946
|
// =====================================================================
|
|
9849
9947
|
updateBlinks(delta) {
|
|
@@ -9852,7 +9950,7 @@ var ProceduralLifeLayer = class {
|
|
|
9852
9950
|
this.blinkPhase = PHASE_CLOSING;
|
|
9853
9951
|
this.blinkProgress = 0;
|
|
9854
9952
|
this.blinkTimer = 0;
|
|
9855
|
-
this.blinkInterval =
|
|
9953
|
+
this.blinkInterval = this.nextBlinkInterval();
|
|
9856
9954
|
this.asymmetryRight = 0.95 + Math.random() * 0.08;
|
|
9857
9955
|
}
|
|
9858
9956
|
if (this.blinkPhase > PHASE_OPEN) {
|
|
@@ -9908,18 +10006,32 @@ var ProceduralLifeLayer = class {
|
|
|
9908
10006
|
return { x, y };
|
|
9909
10007
|
}
|
|
9910
10008
|
// =====================================================================
|
|
9911
|
-
// PRIVATE: Gaze breaks
|
|
10009
|
+
// PRIVATE: Gaze breaks (state-dependent)
|
|
9912
10010
|
// =====================================================================
|
|
10011
|
+
/**
|
|
10012
|
+
* Get active gaze parameters — uses state-dependent params when
|
|
10013
|
+
* conversational state is provided, otherwise falls back to config ranges.
|
|
10014
|
+
*/
|
|
10015
|
+
getActiveGazeParams() {
|
|
10016
|
+
if (this.currentState && GAZE_STATE_PARAMS[this.currentState]) {
|
|
10017
|
+
return GAZE_STATE_PARAMS[this.currentState];
|
|
10018
|
+
}
|
|
10019
|
+
return {
|
|
10020
|
+
interval: this.gazeBreakIntervalRange,
|
|
10021
|
+
amplitude: this.gazeBreakAmplitudeRange
|
|
10022
|
+
};
|
|
10023
|
+
}
|
|
9913
10024
|
updateGazeBreaks(delta) {
|
|
9914
10025
|
this.gazeBreakTimer += delta;
|
|
9915
10026
|
if (this.gazeBreakTimer >= this.gazeBreakInterval && this.gazeBreakPhase === PHASE_OPEN) {
|
|
9916
10027
|
this.gazeBreakPhase = PHASE_CLOSING;
|
|
9917
10028
|
this.gazeBreakProgress = 0;
|
|
9918
10029
|
this.gazeBreakTimer = 0;
|
|
9919
|
-
const
|
|
10030
|
+
const params = this.getActiveGazeParams();
|
|
10031
|
+
const amp = randomRange(...params.amplitude);
|
|
9920
10032
|
this.gazeBreakTargetX = (Math.random() - 0.5) * 2 * amp;
|
|
9921
10033
|
this.gazeBreakTargetY = (Math.random() - 0.5) * amp * 0.4;
|
|
9922
|
-
this.gazeBreakInterval = randomRange(...
|
|
10034
|
+
this.gazeBreakInterval = randomRange(...params.interval);
|
|
9923
10035
|
}
|
|
9924
10036
|
if (this.gazeBreakPhase > PHASE_OPEN) {
|
|
9925
10037
|
this.gazeBreakProgress += delta;
|
|
@@ -9984,6 +10096,971 @@ var ProceduralLifeLayer = class {
|
|
|
9984
10096
|
}
|
|
9985
10097
|
};
|
|
9986
10098
|
|
|
10099
|
+
// src/face/FACSMapping.ts
|
|
10100
|
+
var EMOTION_TO_AU = {
|
|
10101
|
+
joy: [
|
|
10102
|
+
{ au: "AU6", intensity: 0.7, region: "upper" },
|
|
10103
|
+
// cheek raise (Duchenne)
|
|
10104
|
+
{ au: "AU12", intensity: 0.8, region: "lower" }
|
|
10105
|
+
// lip corner pull (smile)
|
|
10106
|
+
],
|
|
10107
|
+
anger: [
|
|
10108
|
+
{ au: "AU4", intensity: 0.8, region: "upper" },
|
|
10109
|
+
// brow lower
|
|
10110
|
+
{ au: "AU5", intensity: 0.4, region: "upper" },
|
|
10111
|
+
// upper lid raise
|
|
10112
|
+
{ au: "AU7", intensity: 0.3, region: "upper" },
|
|
10113
|
+
// lid tighten
|
|
10114
|
+
{ au: "AU23", intensity: 0.6, region: "lower" }
|
|
10115
|
+
// lip tighten
|
|
10116
|
+
],
|
|
10117
|
+
sadness: [
|
|
10118
|
+
{ au: "AU1", intensity: 0.7, region: "upper" },
|
|
10119
|
+
// inner brow raise
|
|
10120
|
+
{ au: "AU4", intensity: 0.3, region: "upper" },
|
|
10121
|
+
// brow lower (furrow)
|
|
10122
|
+
{ au: "AU15", intensity: 0.5, region: "lower" }
|
|
10123
|
+
// lip corner depress
|
|
10124
|
+
],
|
|
10125
|
+
fear: [
|
|
10126
|
+
{ au: "AU1", intensity: 0.6, region: "upper" },
|
|
10127
|
+
// inner brow raise
|
|
10128
|
+
{ au: "AU2", intensity: 0.5, region: "upper" },
|
|
10129
|
+
// outer brow raise
|
|
10130
|
+
{ au: "AU4", intensity: 0.3, region: "upper" },
|
|
10131
|
+
// brow lower
|
|
10132
|
+
{ au: "AU5", intensity: 0.5, region: "upper" },
|
|
10133
|
+
// upper lid raise
|
|
10134
|
+
{ au: "AU20", intensity: 0.4, region: "lower" }
|
|
10135
|
+
// lip stretch
|
|
10136
|
+
],
|
|
10137
|
+
disgust: [
|
|
10138
|
+
{ au: "AU9", intensity: 0.7, region: "upper" },
|
|
10139
|
+
// nose wrinkle
|
|
10140
|
+
{ au: "AU10", intensity: 0.5, region: "lower" },
|
|
10141
|
+
// upper lip raise
|
|
10142
|
+
{ au: "AU15", intensity: 0.4, region: "lower" }
|
|
10143
|
+
// lip corner depress
|
|
10144
|
+
],
|
|
10145
|
+
amazement: [
|
|
10146
|
+
{ au: "AU1", intensity: 0.6, region: "upper" },
|
|
10147
|
+
// inner brow raise
|
|
10148
|
+
{ au: "AU2", intensity: 0.7, region: "upper" },
|
|
10149
|
+
// outer brow raise
|
|
10150
|
+
{ au: "AU5", intensity: 0.6, region: "upper" },
|
|
10151
|
+
// upper lid raise
|
|
10152
|
+
{ au: "AU26", intensity: 0.4, region: "lower" }
|
|
10153
|
+
// jaw drop
|
|
10154
|
+
],
|
|
10155
|
+
grief: [
|
|
10156
|
+
{ au: "AU1", intensity: 0.8, region: "upper" },
|
|
10157
|
+
// inner brow raise
|
|
10158
|
+
{ au: "AU4", intensity: 0.5, region: "upper" },
|
|
10159
|
+
// brow lower
|
|
10160
|
+
{ au: "AU6", intensity: 0.3, region: "upper" },
|
|
10161
|
+
// cheek raise (grief cry)
|
|
10162
|
+
{ au: "AU15", intensity: 0.6, region: "lower" }
|
|
10163
|
+
// lip corner depress
|
|
10164
|
+
],
|
|
10165
|
+
cheekiness: [
|
|
10166
|
+
{ au: "AU2", intensity: 0.4, region: "upper" },
|
|
10167
|
+
// outer brow raise
|
|
10168
|
+
{ au: "AU6", intensity: 0.4, region: "upper" },
|
|
10169
|
+
// cheek raise
|
|
10170
|
+
{ au: "AU12", intensity: 0.6, region: "lower" }
|
|
10171
|
+
// lip corner pull (smirk)
|
|
10172
|
+
],
|
|
10173
|
+
pain: [
|
|
10174
|
+
{ au: "AU4", intensity: 0.7, region: "upper" },
|
|
10175
|
+
// brow lower
|
|
10176
|
+
{ au: "AU6", intensity: 0.4, region: "upper" },
|
|
10177
|
+
// cheek raise (orbicularis)
|
|
10178
|
+
{ au: "AU7", intensity: 0.7, region: "upper" },
|
|
10179
|
+
// lid tighten (squint)
|
|
10180
|
+
{ au: "AU9", intensity: 0.5, region: "upper" }
|
|
10181
|
+
// nose wrinkle
|
|
10182
|
+
],
|
|
10183
|
+
outofbreath: [
|
|
10184
|
+
{ au: "AU1", intensity: 0.3, region: "upper" },
|
|
10185
|
+
// inner brow raise
|
|
10186
|
+
{ au: "AU25", intensity: 0.3, region: "lower" },
|
|
10187
|
+
// lips part
|
|
10188
|
+
{ au: "AU26", intensity: 0.5, region: "lower" }
|
|
10189
|
+
// jaw drop
|
|
10190
|
+
]
|
|
10191
|
+
};
|
|
10192
|
+
// FACS Action Unit -> ARKit blendshape expansion table.
// Each AU maps to one or two ARKit channels; `weight` scales the AU's
// contribution per channel (bilateral AUs drive the left and right
// blendshapes together with equal weight).
var AU_TO_ARKIT = {
  "AU1": [{ blendshape: "browInnerUp", weight: 1 }],
  "AU2": [{ blendshape: "browOuterUpLeft", weight: 1 }, { blendshape: "browOuterUpRight", weight: 1 }],
  "AU4": [{ blendshape: "browDownLeft", weight: 1 }, { blendshape: "browDownRight", weight: 1 }],
  "AU5": [{ blendshape: "eyeWideLeft", weight: 1 }, { blendshape: "eyeWideRight", weight: 1 }],
  "AU6": [{ blendshape: "cheekSquintLeft", weight: 1 }, { blendshape: "cheekSquintRight", weight: 1 }],
  "AU7": [{ blendshape: "eyeSquintLeft", weight: 1 }, { blendshape: "eyeSquintRight", weight: 1 }],
  "AU9": [{ blendshape: "noseSneerLeft", weight: 1 }, { blendshape: "noseSneerRight", weight: 1 }],
  "AU10": [{ blendshape: "mouthUpperUpLeft", weight: 1 }, { blendshape: "mouthUpperUpRight", weight: 1 }],
  "AU12": [{ blendshape: "mouthSmileLeft", weight: 1 }, { blendshape: "mouthSmileRight", weight: 1 }],
  "AU15": [{ blendshape: "mouthFrownLeft", weight: 1 }, { blendshape: "mouthFrownRight", weight: 1 }],
  "AU20": [{ blendshape: "mouthStretchLeft", weight: 1 }, { blendshape: "mouthStretchRight", weight: 1 }],
  "AU23": [{ blendshape: "mouthPressLeft", weight: 1 }, { blendshape: "mouthPressRight", weight: 1 }],
  "AU25": [{ blendshape: "jawOpen", weight: 0.3 }],
  // lips part: partial jaw open only
  "AU26": [{ blendshape: "jawOpen", weight: 1 }]
  // jaw drop: full jaw open
};
|
|
10208
|
+
// Deduplicated list of every Action Unit referenced by any emotion preset.
var ALL_AUS = Array.from(
  new Set(
    Object.values(EMOTION_TO_AU).flatMap((entries) => entries.map((entry) => entry.au))
  )
);
|
|
10211
|
+
|
|
10212
|
+
// src/face/EmotionResolver.ts
|
|
10213
|
+
// Reverse lookup: blendshape name -> channel index in LAM_BLENDSHAPES order.
var BS_INDEX = /* @__PURE__ */ new Map(
  LAM_BLENDSHAPES.map((name, index) => [name, index])
);
|
|
10217
|
+
var EmotionResolver = class {
  constructor() {
    // Scratch accumulators reused across resolve() calls; results are copied out.
    this.upperBuffer = new Float32Array(52);
    this.lowerBuffer = new Float32Array(52);
  }
  /**
   * Resolve emotion weights to upper/lower face blendshape contributions.
   *
   * @param weights - Emotion channel weights from EmotionController
   * @param intensity - Global intensity multiplier (0-2). Default: 1.0
   * @returns Upper and lower face blendshape arrays (52 channels each)
   */
  resolve(weights, intensity = 1) {
    const upperAcc = this.upperBuffer;
    const lowerAcc = this.lowerBuffer;
    upperAcc.fill(0);
    lowerAcc.fill(0);
    for (const name of EMOTION_NAMES) {
      const weight = weights[name];
      // Skip absent or negligible emotion channels.
      if (!weight || weight < 0.01) continue;
      const activations = EMOTION_TO_AU[name];
      if (!activations) continue;
      for (const { au, intensity: auIntensity, region } of activations) {
        const mappings = AU_TO_ARKIT[au];
        if (!mappings) continue;
        const acc = region === "upper" ? upperAcc : lowerAcc;
        const scale = weight * auIntensity * intensity;
        for (const { blendshape, weight: bsWeight } of mappings) {
          const channel = BS_INDEX.get(blendshape);
          if (channel !== void 0) {
            acc[channel] += bsWeight * scale;
          }
        }
      }
    }
    // Clamp accumulated activations to 1 (contributions are non-negative).
    for (let channel = 0; channel < 52; channel++) {
      if (upperAcc[channel] > 1) upperAcc[channel] = 1;
      if (lowerAcc[channel] > 1) lowerAcc[channel] = 1;
    }
    // Return copies so callers cannot alias the internal scratch buffers.
    return {
      upper: new Float32Array(upperAcc),
      lower: new Float32Array(lowerAcc)
    };
  }
};
|
|
10262
|
+
|
|
10263
|
+
// src/face/FaceCompositor.ts
|
|
10264
|
+
/** Hermite smoothstep easing 3t^2 - 2t^3; no input clamping, caller keeps t in [0, 1]. */
function smoothstep(t) {
  const tt = t * t;
  return 3 * tt - 2 * tt * t;
}
|
|
10267
|
+
// Reverse lookup (local to FaceCompositor): blendshape name -> channel index.
var BS_INDEX2 = /* @__PURE__ */ new Map(
  LAM_BLENDSHAPES.map((name, index) => [name, index])
);
// Channel index of "mouthClose", used for bilabial suppression in compose().
var IDX_MOUTH_CLOSE = BS_INDEX2.get("mouthClose");
// Mask of eye channels (blink + gaze); these are replaced by the life layer
// rather than blended additively.
var IS_EYE_CHANNEL = new Array(52).fill(false);
for (const name of LAM_BLENDSHAPES) {
  const isEye = name.startsWith("eyeBlink") || name.startsWith("eyeLook");
  if (isEye) {
    IS_EYE_CHANNEL[BS_INDEX2.get(name)] = true;
  }
}
|
|
10278
|
+
// Composes the final 52-channel face frame from a 5-stage signal chain:
// (1) A2E base lip sync, (2) smoothed emotion overlay (upper face additive,
// lower face multiplicative with bilabial suppression), (3) procedural life
// layer, (4) per-character profile multiplier/offset, (5) clamp to [0, 1].
var FaceCompositor = class {
  constructor(config) {
    this.emotionResolver = new EmotionResolver();
    // Pre-allocated buffers
    this.smoothedUpper = new Float32Array(52);
    this.smoothedLower = new Float32Array(52);
    this.lifeBuffer = new Float32Array(52);
    // Profile arrays (pre-expanded to 52 channels)
    this.multiplier = new Float32Array(52).fill(1);
    this.offset = new Float32Array(52);
    this.lifeLayer = config?.lifeLayer ?? new ProceduralLifeLayer();
    // Per-frame EMA coefficient for the emotion overlay (higher = faster).
    this.emotionSmoothing = config?.emotionSmoothing ?? 0.12;
    if (config?.profile) {
      this.applyProfileArrays(config.profile);
    }
  }
  /**
   * Compose a single output frame from the 5-stage signal chain.
   *
   * @param base - A2E raw output (Float32Array[52], LAM_BLENDSHAPES order)
   * @param input - Per-frame input (deltaTime, emotion, life layer params)
   * @returns Float32Array[52] with all values clamped to [0, 1]
   */
  compose(base, input) {
    const out = new Float32Array(52);
    out.set(base);
    // Stage 2: emotion overlay; falls back to the sticky emotion from setEmotion().
    const emotion = input.emotion ?? this.stickyEmotion;
    if (emotion) {
      const resolved = this.emotionResolver.resolve(
        emotion,
        input.emotionIntensity ?? 1
      );
      // Exponentially smooth toward the resolved emotion targets.
      const k = this.emotionSmoothing;
      for (let i = 0; i < 52; i++) {
        this.smoothedUpper[i] += (resolved.upper[i] - this.smoothedUpper[i]) * k;
        this.smoothedLower[i] += (resolved.lower[i] - this.smoothedLower[i]) * k;
      }
      // Fade the lower-face emotion (down to 10%) while mouthClose is high,
      // so emotion does not break bilabial closures coming from lip sync.
      // mc <= 0.3 -> full emotion; mc >= 0.7 -> 10%; smoothstepped in between.
      const mc = base[IDX_MOUTH_CLOSE];
      const bilabialSuppress = mc <= 0.3 ? 1 : mc >= 0.7 ? 0.1 : 1 - 0.9 * smoothstep((mc - 0.3) * 2.5);
      // Upper face: additive contribution.
      for (let i = 0; i < 52; i++) {
        out[i] += this.smoothedUpper[i];
      }
      // Lower face: multiplicative — scales the base signal rather than adding.
      for (let i = 0; i < 52; i++) {
        out[i] *= 1 + this.smoothedLower[i] * bilabialSuppress;
      }
    }
    // Stage 3: procedural life layer. Eye channels are replaced outright;
    // every other channel receives an additive contribution.
    this.lifeLayer.updateToArray(input.deltaTime, input, this.lifeBuffer);
    for (let i = 0; i < 52; i++) {
      if (IS_EYE_CHANNEL[i]) {
        out[i] = this.lifeBuffer[i];
      } else {
        out[i] += this.lifeBuffer[i];
      }
    }
    // Stage 4: per-character profile (linear remap per channel).
    for (let i = 0; i < 52; i++) {
      out[i] = out[i] * this.multiplier[i] + this.offset[i];
    }
    // Stage 5: clamp to the valid blendshape range.
    for (let i = 0; i < 52; i++) {
      if (out[i] < 0) out[i] = 0;
      else if (out[i] > 1) out[i] = 1;
    }
    return out;
  }
  /**
   * Set sticky emotion (used when input.emotion is not provided).
   */
  setEmotion(weights) {
    this.stickyEmotion = weights;
  }
  /**
   * Update character profile at runtime.
   */
  setProfile(profile) {
    // Reset to identity before applying the new (possibly sparse) profile.
    this.multiplier.fill(1);
    this.offset.fill(0);
    this.applyProfileArrays(profile);
  }
  /**
   * Reset all smoothing state and life layer.
   */
  reset() {
    this.smoothedUpper.fill(0);
    this.smoothedLower.fill(0);
    this.lifeBuffer.fill(0);
    this.stickyEmotion = void 0;
    this.lifeLayer.reset();
  }
  /** Expand partial profile maps into dense Float32Arrays */
  applyProfileArrays(profile) {
    if (profile.multiplier) {
      for (const [name, value] of Object.entries(profile.multiplier)) {
        const idx = BS_INDEX2.get(name);
        // Unknown blendshape names and undefined values are ignored.
        if (idx !== void 0 && value !== void 0) {
          this.multiplier[idx] = value;
        }
      }
    }
    if (profile.offset) {
      for (const [name, value] of Object.entries(profile.offset)) {
        const idx = BS_INDEX2.get(name);
        if (idx !== void 0 && value !== void 0) {
          this.offset[idx] = value;
        }
      }
    }
  }
};
|
|
10385
|
+
|
|
10386
|
+
// src/orchestration/MicLipSync.ts
|
|
10387
|
+
// Module-scoped logger for MicLipSync.
var logger18 = createLogger("MicLipSync");
|
|
10388
|
+
// Microphone-driven lip sync: captures mic audio, streams it into the A2E
// processor, and emits profile-scaled blendshape frames. Optionally runs a
// VAD over the same stream to emit speech:start / speech:end events.
var MicLipSync = class extends EventEmitter {
  constructor(config) {
    super();
    // Internal event bus shared with MicrophoneCapture.
    this.omoteEvents = new EventEmitter();
    this._state = "idle";
    this._isSpeaking = false;
    this._currentFrame = null;
    this._currentRawFrame = null;
    // VAD state
    this.speechStartTime = 0;
    this.vadChunkSize = 0;
    this.vadBuffer = null;
    this.vadBufferOffset = 0;
    this.profile = config.profile ?? {};
    this.vad = config.vad;
    this.mic = new MicrophoneCapture(this.omoteEvents, {
      sampleRate: config.sampleRate ?? 16e3,
      chunkSize: config.micChunkSize ?? 512
    });
    this.processor = new A2EProcessor({
      backend: config.lam,
      sampleRate: config.sampleRate ?? 16e3,
      identityIndex: config.identityIndex,
      onFrame: (raw) => {
        // Keep both the raw model output and the profile-scaled frame.
        const scaled = applyProfile(raw, this.profile);
        this._currentFrame = scaled;
        this._currentRawFrame = raw;
        this.emit("frame", { blendshapes: scaled, rawBlendshapes: raw });
      },
      onError: (error) => {
        logger18.error("A2E inference error", { message: error.message });
        this.emit("error", error);
      }
    });
    this.omoteEvents.on("audio.chunk", ({ pcm }) => {
      const float32 = int16ToFloat32(pcm);
      this.processor.pushAudio(float32);
      if (this.vad) {
        // Fire-and-forget: processVAD catches its own errors.
        this.processVAD(float32);
      }
    });
    this.omoteEvents.on("audio.level", (level) => {
      this.emit("audio:level", level);
    });
    if (this.vad) {
      // Re-chunk buffer sized to the VAD's required window.
      this.vadChunkSize = this.vad.getChunkSize();
      this.vadBuffer = new Float32Array(this.vadChunkSize);
      this.vadBufferOffset = 0;
    }
  }
  /** Current state ('idle' | 'active' | 'paused') */
  get state() {
    return this._state;
  }
  /** Latest blendshape frame (null before first inference) */
  get currentFrame() {
    return this._currentFrame;
  }
  /** Whether speech is currently detected (requires VAD) */
  get isSpeaking() {
    return this._isSpeaking;
  }
  /** Backend status: "active" while a processor exists, null otherwise */
  get backend() {
    return this.processor ? "active" : null;
  }
  // ---------------------------------------------------------------------------
  // Public API
  // ---------------------------------------------------------------------------
  /** Start microphone capture and inference loop */
  async start() {
    if (this._state === "active") return;
    await this.mic.start();
    this.processor.startDrip();
    this.emit("mic:start", void 0);
    this.setState("active");
  }
  /** Stop microphone and inference */
  stop() {
    if (this._state === "idle") return;
    this.processor.stopDrip();
    this.mic.stop();
    this._isSpeaking = false;
    this.emit("mic:stop", void 0);
    this.setState("idle");
  }
  /** Pause inference (mic stays open for faster resume) */
  pause() {
    if (this._state !== "active") return;
    this.processor.stopDrip();
    this.setState("paused");
  }
  /** Resume inference after pause */
  resume() {
    if (this._state !== "paused") return;
    this.processor.startDrip();
    this.setState("active");
  }
  /** Update ExpressionProfile at runtime */
  setProfile(profile) {
    this.profile = profile;
  }
  /** Dispose of all resources */
  async dispose() {
    this.stop();
    this.processor.dispose();
  }
  // ---------------------------------------------------------------------------
  // Internal: VAD processing
  // ---------------------------------------------------------------------------
  /**
   * Re-chunk incoming samples into VAD-sized windows and run speech
   * detection, emitting speech:start / speech:end on transitions.
   *
   * NOTE(review): this async method is invoked without await from the
   * audio.chunk handler and reuses this.vadBuffer across the `await` on
   * vad.process — presumably vad.process consumes/copies its input
   * synchronously; verify, otherwise overlapping calls could observe a
   * partially overwritten buffer.
   */
  async processVAD(samples) {
    if (!this.vad || !this.vadBuffer) return;
    for (let i = 0; i < samples.length; i++) {
      this.vadBuffer[this.vadBufferOffset++] = samples[i];
      if (this.vadBufferOffset >= this.vadChunkSize) {
        try {
          const result = await this.vad.process(this.vadBuffer);
          const wasSpeaking = this._isSpeaking;
          this._isSpeaking = result.isSpeech;
          if (!wasSpeaking && result.isSpeech) {
            this.speechStartTime = performance.now();
            this.emit("speech:start", void 0);
          } else if (wasSpeaking && !result.isSpeech) {
            const durationMs = performance.now() - this.speechStartTime;
            this.emit("speech:end", { durationMs });
          }
        } catch (err) {
          // VAD failures are non-fatal; keep streaming audio.
          logger18.warn("VAD process error", { error: String(err) });
        }
        this.vadBufferOffset = 0;
      }
    }
  }
  // ---------------------------------------------------------------------------
  // Internal: State management
  // ---------------------------------------------------------------------------
  setState(state) {
    if (this._state === state) return;
    this._state = state;
    this.emit("state", state);
  }
};
|
|
10530
|
+
|
|
10531
|
+
// src/orchestration/VoicePipeline.ts
|
|
10532
|
+
// Module-scoped logger for VoicePipeline.
var logger19 = createLogger("VoicePipeline");
|
|
10533
|
+
var VoicePipeline = class extends EventEmitter {
|
|
10534
|
+
  /**
   * @param config - Pipeline configuration (model URLs, timeouts, profile,
   *   onResponse handler). No models are loaded here; call loadModels() next.
   */
  constructor(config) {
    super();
    // State
    this._state = "idle";
    this.stopped = false;
    // Monotonic counter; incremented to invalidate in-flight async work.
    this.epoch = 0;
    this._sessionId = null;
    // Models
    this.asr = null;
    this.lam = null;
    this.vad = null;
    this.unifiedWorker = null;
    // Pipelines
    this.playback = null;
    this.interruption = null;
    this.omoteEvents = new EventEmitter();
    this.mic = null;
    // Audio accumulation
    this.audioBuffer = [];
    this.audioBufferSamples = 0;
    this.speechStartTime = 0;
    this.silenceTimer = null;
    this.isSpeaking = false;
    // Progressive transcription
    this.progressiveTimer = null;
    this.progressivePromise = null;
    this.lastProgressiveResult = null;
    this.lastProgressiveSamples = 0;
    // ASR error recovery
    this.asrErrorCount = 0;
    // Response abort
    this.responseAbortController = null;
    // Frame refs
    this._currentFrame = null;
    this.config = config;
  }
|
|
10570
|
+
  /** Current pipeline state ('idle' | 'loading' | 'ready' | 'listening' | 'thinking' | 'speaking' | 'error') */
  get state() {
    return this._state;
  }
|
|
10574
|
+
  /** Latest blendshape frame from playback (null before the first frame and after stop()) */
  get currentFrame() {
    return this._currentFrame;
  }
|
|
10578
|
+
  /** Whether user speech is currently detected by the VAD */
  get isSpeechActive() {
    return this.isSpeaking;
  }
|
|
10582
|
+
  /** Session ID (a fresh UUID generated on each start(); null before the first start) */
  get sessionId() {
    return this._sessionId;
  }
|
|
10586
|
+
// ---------------------------------------------------------------------------
|
|
10587
|
+
// Model loading
|
|
10588
|
+
// ---------------------------------------------------------------------------
|
|
10589
|
+
  /**
   * Load all inference models and wire the playback/interruption pipelines.
   *
   * Order: optional unified worker (iOS) -> SenseVoice ASR -> A2E LAM (GPU
   * load raced against a timeout, with CPU fallback) -> Silero VAD ->
   * PlaybackPipeline -> InterruptionHandler. Emits load progress throughout.
   * Sets state to 'ready' on success; on failure emits 'error', sets state
   * 'error', and rethrows.
   */
  async loadModels() {
    this.setState("loading");
    const timeoutMs = this.config.lamLoadTimeoutMs ?? 3e4;
    try {
      // On iOS, share one inference worker across all three models.
      if (isIOS()) {
        this.unifiedWorker = new UnifiedInferenceWorker();
        await this.unifiedWorker.init();
      }
      this.emitProgress("Speech recognition", 0, 3, 0);
      this.asr = createSenseVoice({
        modelUrl: this.config.models.senseVoice.modelUrl,
        tokensUrl: this.config.models.senseVoice.tokensUrl,
        language: this.config.models.senseVoice.language,
        unifiedWorker: this.unifiedWorker ?? void 0
      });
      await this.asr.load();
      this.emitProgress("Speech recognition", 45, 3, 1);
      this.emitProgress("Lip sync", 45, 3, 1);
      let lam = createA2E({
        gpuModelUrl: this.config.models.lam.gpuModelUrl,
        gpuExternalDataUrl: this.config.models.lam.gpuExternalDataUrl,
        cpuModelUrl: this.config.models.lam.cpuModelUrl,
        mode: this.config.models.lam.mode,
        unifiedWorker: this.unifiedWorker ?? void 0
      });
      // Fake progress ticker: asymptotically approaches 85% while LAM loads.
      let lamProgress = 45;
      const lamTickInterval = setInterval(() => {
        const remaining = 85 - lamProgress;
        lamProgress += Math.max(0.5, remaining * 0.08);
        this.emitProgress("Lip sync", Math.round(lamProgress), 3, 1);
      }, 300);
      try {
        // Race the (GPU) load against a timeout; on timeout, dispose and
        // retry in CPU mode.
        // NOTE(review): the timeout's setTimeout is never cleared when the
        // load wins the race — harmless (late resolve of a settled race) but
        // it keeps a timer alive for up to timeoutMs.
        const lamLoadResult = await Promise.race([
          lam.load().then(() => "ok"),
          new Promise((r) => setTimeout(() => r("timeout"), timeoutMs))
        ]);
        if (lamLoadResult === "timeout") {
          logger19.warn(`LAM GPU load timed out after ${timeoutMs}ms, falling back to CPU`);
          await lam.dispose();
          lam = createA2E({
            gpuModelUrl: this.config.models.lam.gpuModelUrl,
            cpuModelUrl: this.config.models.lam.cpuModelUrl,
            mode: "cpu",
            unifiedWorker: this.unifiedWorker ?? void 0
          });
          await lam.load();
        }
      } finally {
        clearInterval(lamTickInterval);
      }
      this.lam = lam;
      this.emitProgress("Lip sync", 85, 3, 2);
      this.emitProgress("Voice detection", 85, 3, 2);
      this.vad = createSileroVAD({
        modelUrl: this.config.models.vad.modelUrl,
        threshold: this.config.models.vad.threshold,
        unifiedWorker: this.unifiedWorker ?? void 0
      });
      await this.vad.load();
      this.emitProgress("Voice detection", 100, 3, 3);
      this.playback = new PlaybackPipeline({
        lam: this.lam,
        profile: this.config.profile,
        identityIndex: this.config.identityIndex,
        neutralTransitionEnabled: this.config.neutralTransitionEnabled ?? true,
        neutralTransitionMs: this.config.neutralTransitionMs,
        audioDelayMs: this.config.audioDelayMs,
        chunkTargetMs: this.config.chunkTargetMs
      });
      await this.playback.initialize();
      // Re-emit playback events on this pipeline's own emitter.
      this.playback.on("frame", (f) => {
        this._currentFrame = f.blendshapes;
        this.emit("frame", f);
      });
      this.playback.on("frame:raw", (f) => this.emit("frame:raw", f));
      this.playback.on("playback:start", (t) => this.emit("playback:start", t));
      this.playback.on("playback:complete", () => {
        if (this.stopped) return;
        this.emit("playback:complete", void 0);
        // Fresh VAD state and a new epoch before going back to listening.
        this.vad?.reset();
        this.epoch++;
        this.setState("listening");
      });
      this.playback.on("error", (e) => this.emit("error", e));
      this.interruption = new InterruptionHandler({
        enabled: this.config.interruptionEnabled ?? true,
        minSpeechDurationMs: this.config.interruptionMinSpeechMs ?? 200
      });
      this.interruption.on("interruption.triggered", () => {
        this.handleInterruption();
      });
      this.setState("ready");
    } catch (error) {
      const err = error instanceof Error ? error : new Error(String(error));
      logger19.error("Model loading failed", { message: err.message });
      this.emit("error", err);
      this.setState("error");
      throw err;
    }
  }
|
|
10689
|
+
// ---------------------------------------------------------------------------
|
|
10690
|
+
// Conversation lifecycle
|
|
10691
|
+
// ---------------------------------------------------------------------------
|
|
10692
|
+
async start() {
|
|
10693
|
+
if (this._state !== "ready") {
|
|
10694
|
+
throw new Error(`Cannot start: state is '${this._state}', expected 'ready'`);
|
|
10695
|
+
}
|
|
10696
|
+
this.stopped = false;
|
|
10697
|
+
this.epoch++;
|
|
10698
|
+
this._sessionId = crypto.randomUUID();
|
|
10699
|
+
this.asrErrorCount = 0;
|
|
10700
|
+
this.mic = new MicrophoneCapture(this.omoteEvents, {
|
|
10701
|
+
sampleRate: 16e3,
|
|
10702
|
+
chunkSize: 512
|
|
10703
|
+
});
|
|
10704
|
+
this.omoteEvents.on("audio.chunk", ({ pcm }) => {
|
|
10705
|
+
const float32 = int16ToFloat32(pcm);
|
|
10706
|
+
this.processAudioChunk(float32);
|
|
10707
|
+
});
|
|
10708
|
+
this.omoteEvents.on("audio.level", (level) => {
|
|
10709
|
+
this.emit("audio:level", level);
|
|
10710
|
+
});
|
|
10711
|
+
await this.mic.start();
|
|
10712
|
+
this.setState("listening");
|
|
10713
|
+
}
|
|
10714
|
+
  /**
   * Stop the session: abort any in-flight response, halt playback and the
   * microphone, and discard buffered audio. Models stay loaded; state returns
   * to 'ready' (or stays 'idle' if the pipeline was never loaded).
   */
  stop() {
    this.stopped = true;
    // Invalidate any async work still holding an older epoch.
    this.epoch++;
    this.clearSilenceTimer();
    this.stopProgressiveTranscription();
    this.responseAbortController?.abort();
    this.responseAbortController = null;
    this.vad?.reset();
    this.playback?.stop();
    this.mic?.stop();
    this.mic = null;
    this.isSpeaking = false;
    this.audioBuffer = [];
    this.audioBufferSamples = 0;
    this._currentFrame = null;
    this.interruption?.setAISpeaking(false);
    if (this._state !== "idle") {
      this.setState("ready");
    }
  }
|
|
10734
|
+
  /** Update the ExpressionProfile at runtime; forwarded to playback when loaded. */
  setProfile(profile) {
    this.config.profile = profile;
    this.playback?.setProfile(profile);
  }
|
|
10738
|
+
  /**
   * Tear down everything: stop the session, then dispose playback and all
   * models. Returns the pipeline to 'idle'; loadModels() must run again
   * before reuse (start() requires the 'ready' state).
   */
  async dispose() {
    this.stop();
    this.epoch++;
    await this.playback?.dispose();
    await this.asr?.dispose();
    await this.lam?.dispose();
    await this.vad?.dispose();
    this.playback = null;
    this.asr = null;
    this.lam = null;
    this.vad = null;
    this._state = "idle";
  }
|
|
10751
|
+
// ---------------------------------------------------------------------------
|
|
10752
|
+
// Audio processing
|
|
10753
|
+
// ---------------------------------------------------------------------------
|
|
10754
|
+
  /**
   * Per-chunk VAD routing. While the AI is speaking, the VAD probability
   * feeds the interruption handler; while listening/thinking, speech chunks
   * are accumulated and a silence timer arms end-of-utterance detection.
   * VAD errors are logged and the chunk is dropped.
   */
  async processAudioChunk(samples) {
    if (!this.vad) return;
    try {
      const result = await this.vad.process(samples);
      if (this._state === "speaking" && this.interruption) {
        // Barge-in detection path while the AI speaks.
        this.interruption.processVADResult(result.probability);
        return;
      }
      if (this._state !== "listening" && this._state !== "thinking") return;
      const wasSpeaking = this.isSpeaking;
      if (result.isSpeech) {
        if (!wasSpeaking) {
          // Speech onset: reset the utterance buffer and progressive ASR state.
          this.isSpeaking = true;
          this.speechStartTime = performance.now();
          this.audioBuffer = [];
          this.audioBufferSamples = 0;
          this.lastProgressiveResult = null;
          this.lastProgressiveSamples = 0;
          this.emit("speech:start", void 0);
          this.startProgressiveTranscription();
        }
        // Copy the chunk — the caller's buffer may be reused.
        this.audioBuffer.push(new Float32Array(samples));
        this.audioBufferSamples += samples.length;
        this.clearSilenceTimer();
      } else if (wasSpeaking) {
        // Trailing silence still gets buffered (keeps word endings intact).
        this.audioBuffer.push(new Float32Array(samples));
        this.audioBufferSamples += samples.length;
        if (!this.silenceTimer) {
          const timeoutMs = this.getSilenceTimeout();
          this.silenceTimer = setTimeout(() => {
            this.onSilenceDetected();
          }, timeoutMs);
        }
      }
    } catch (err) {
      logger19.warn("VAD error", { error: String(err) });
    }
  }
|
|
10792
|
+
// ---------------------------------------------------------------------------
|
|
10793
|
+
// Silence detection
|
|
10794
|
+
// ---------------------------------------------------------------------------
|
|
10795
|
+
getSilenceTimeout() {
|
|
10796
|
+
const base = this.config.silenceTimeoutMs ?? 500;
|
|
10797
|
+
const extended = this.config.silenceTimeoutExtendedMs ?? 700;
|
|
10798
|
+
const adaptive = this.config.adaptiveTimeout ?? true;
|
|
10799
|
+
if (!adaptive) return base;
|
|
10800
|
+
const speechDurationMs = performance.now() - this.speechStartTime;
|
|
10801
|
+
return speechDurationMs > 3e3 ? extended : base;
|
|
10802
|
+
}
|
|
10803
|
+
  /**
   * Silence timer callback: mark the end of user speech and kick off async
   * end-of-speech processing, guarded by the epoch captured at this moment
   * so results are discarded if the session is stopped or interrupted.
   */
  onSilenceDetected() {
    const capturedEpoch = this.epoch;
    this.isSpeaking = false;
    const durationMs = performance.now() - this.speechStartTime;
    this.emit("speech:end", { durationMs });
    this.clearSilenceTimer();
    this.processEndOfSpeech(capturedEpoch).catch((err) => {
      logger19.error("End of speech processing failed", { error: String(err) });
      // Only recover to 'listening' if this utterance is still current.
      if (this.epoch === capturedEpoch && !this.stopped) {
        this.emit("error", err instanceof Error ? err : new Error(String(err)));
        this.setState("listening");
      }
    });
  }
|
|
10817
|
+
// ---------------------------------------------------------------------------
|
|
10818
|
+
// End of speech → transcription → response
|
|
10819
|
+
// ---------------------------------------------------------------------------
|
|
10820
|
+
  /**
   * Finalize an utterance: wait for any in-flight progressive ASR, gate the
   * audio on minimum duration and RMS energy, transcribe (reusing the
   * progressive result when it covers enough of the audio), then hand the
   * transcript to the response handler. Every await is followed by an epoch
   * re-check so stale work is dropped after stop()/interruption.
   */
  async processEndOfSpeech(capturedEpoch) {
    if (this.progressivePromise) {
      try {
        await this.progressivePromise;
      } catch {
      }
    }
    this.stopProgressiveTranscription();
    if (this.epoch !== capturedEpoch || this.stopped) return;
    // Concatenate the buffered chunks into one contiguous utterance.
    const totalSamples = this.audioBufferSamples;
    const fullAudio = new Float32Array(totalSamples);
    let offset = 0;
    for (const chunk of this.audioBuffer) {
      fullAudio.set(chunk, offset);
      offset += chunk.length;
    }
    this.audioBuffer = [];
    this.audioBufferSamples = 0;
    const minDuration = this.config.minAudioDurationSec ?? 0.3;
    const minEnergy = this.config.minAudioEnergy ?? 0.02;
    // 16 kHz capture rate is assumed throughout this pipeline.
    const durationSec = totalSamples / 16e3;
    if (durationSec < minDuration) {
      logger19.info("Audio too short, discarding", { durationSec });
      this.setState("listening");
      return;
    }
    // NOTE(review): maxAbs is computed but never used below — candidate for
    // removal or for peak-based gating.
    let maxAbs = 0;
    for (let i = 0; i < fullAudio.length; i++) {
      const abs = Math.abs(fullAudio[i]);
      if (abs > maxAbs) maxAbs = abs;
    }
    let rms = 0;
    for (let i = 0; i < fullAudio.length; i++) {
      rms += fullAudio[i] * fullAudio[i];
    }
    rms = Math.sqrt(rms / fullAudio.length);
    if (rms < minEnergy) {
      logger19.info("Audio too quiet, discarding", { rms });
      this.setState("listening");
      return;
    }
    const normalizedAudio = this.normalizeAudio(fullAudio);
    this.setState("thinking");
    let transcript = null;
    // Reuse the progressive ASR result only if it saw at least this fraction
    // of the final audio; otherwise run a fresh full transcription.
    const coverageThreshold = this.config.progressiveCoverageThreshold ?? 0.8;
    if (this.lastProgressiveResult && this.lastProgressiveResult.text.trim().length > 0 && this.lastProgressiveSamples >= totalSamples * coverageThreshold) {
      transcript = { ...this.lastProgressiveResult, isFinal: true };
      logger19.info("Using progressive result", {
        coverage: (this.lastProgressiveSamples / totalSamples).toFixed(2),
        text: transcript.text
      });
    } else {
      this.lastProgressiveResult = null;
      transcript = await this.transcribeWithTimeout(normalizedAudio);
      if (transcript) {
        transcript.isFinal = true;
      }
    }
    if (this.epoch !== capturedEpoch || this.stopped) return;
    if (!transcript || !transcript.text.trim()) {
      logger19.info("No transcript, resuming listening");
      this.setState("listening");
      return;
    }
    this.emit("transcript", transcript);
    await this.callResponseHandler(transcript, capturedEpoch);
  }
|
|
10887
|
+
// ---------------------------------------------------------------------------
|
|
10888
|
+
// Response handler
|
|
10889
|
+
// ---------------------------------------------------------------------------
|
|
10890
|
+
  /**
   * Invoke the user-supplied onResponse handler, streaming its TTS audio
   * chunks into the playback pipeline. An AbortController lets stop() and
   * interruption cancel the in-flight response; send/done become no-ops and
   * errors are swallowed once aborted.
   */
  async callResponseHandler(transcript, capturedEpoch) {
    if (this.epoch !== capturedEpoch || this.stopped) return;
    this.setState("speaking");
    this.interruption?.setAISpeaking(true);
    const abortController = new AbortController();
    this.responseAbortController = abortController;
    try {
      this.playback.start();
      await this.config.onResponse({
        text: transcript.text,
        emotion: transcript.emotion,
        event: transcript.event,
        send: async (chunk) => {
          if (abortController.signal.aborted) return;
          await this.playback.onAudioChunk(chunk);
        },
        done: async () => {
          if (abortController.signal.aborted) return;
          await this.playback.end();
        },
        signal: abortController.signal,
        sessionId: this._sessionId
      });
    } catch (error) {
      // Aborted responses are expected (interruption/stop), not errors.
      if (abortController.signal.aborted) return;
      const err = error instanceof Error ? error : new Error(String(error));
      logger19.error("Response handler error", { message: err.message });
      this.emit("error", err);
      if (this.epoch === capturedEpoch && !this.stopped) {
        this.interruption?.setAISpeaking(false);
        this.setState("listening");
      }
    } finally {
      this.responseAbortController = null;
    }
  }
|
|
10926
|
+
// ---------------------------------------------------------------------------
|
|
10927
|
+
// Interruption handling
|
|
10928
|
+
// ---------------------------------------------------------------------------
|
|
10929
|
+
handleInterruption() {
|
|
10930
|
+
if (this._state !== "speaking") return;
|
|
10931
|
+
logger19.info("Interruption triggered");
|
|
10932
|
+
this.epoch++;
|
|
10933
|
+
this.responseAbortController?.abort();
|
|
10934
|
+
this.playback?.stop();
|
|
10935
|
+
this.interruption?.setAISpeaking(false);
|
|
10936
|
+
this.emit("interruption", void 0);
|
|
10937
|
+
if (!this.stopped) {
|
|
10938
|
+
this.setState("listening");
|
|
10939
|
+
}
|
|
10940
|
+
}
|
|
10941
|
+
// ---------------------------------------------------------------------------
|
|
10942
|
+
// Progressive transcription
|
|
10943
|
+
// ---------------------------------------------------------------------------
|
|
10944
|
+
startProgressiveTranscription() {
|
|
10945
|
+
this.stopProgressiveTranscription();
|
|
10946
|
+
const intervalMs = isIOS() ? this.config.progressiveIntervalIosMs ?? 800 : this.config.progressiveIntervalMs ?? 500;
|
|
10947
|
+
const minSamples = this.config.progressiveMinSamples ?? 8e3;
|
|
10948
|
+
this.progressiveTimer = setInterval(() => {
|
|
10949
|
+
if (this.audioBufferSamples < minSamples) return;
|
|
10950
|
+
if (!this.asr) return;
|
|
10951
|
+
const capturedEpoch = this.epoch;
|
|
10952
|
+
const snapshot = new Float32Array(this.audioBufferSamples);
|
|
10953
|
+
let offset = 0;
|
|
10954
|
+
for (const chunk of this.audioBuffer) {
|
|
10955
|
+
snapshot.set(chunk, offset);
|
|
10956
|
+
offset += chunk.length;
|
|
10957
|
+
}
|
|
10958
|
+
const snapshotSamples = this.audioBufferSamples;
|
|
10959
|
+
this.progressivePromise = (async () => {
|
|
10960
|
+
try {
|
|
10961
|
+
const result = await this.transcribeWithTimeout(snapshot);
|
|
10962
|
+
if (this.epoch !== capturedEpoch) return;
|
|
10963
|
+
if (result && result.text.trim()) {
|
|
10964
|
+
this.lastProgressiveResult = result;
|
|
10965
|
+
this.lastProgressiveSamples = snapshotSamples;
|
|
10966
|
+
this.emit("transcript", { ...result, isFinal: false });
|
|
10967
|
+
}
|
|
10968
|
+
} catch {
|
|
10969
|
+
}
|
|
10970
|
+
})();
|
|
10971
|
+
}, intervalMs);
|
|
10972
|
+
}
|
|
10973
|
+
stopProgressiveTranscription() {
|
|
10974
|
+
if (this.progressiveTimer) {
|
|
10975
|
+
clearInterval(this.progressiveTimer);
|
|
10976
|
+
this.progressiveTimer = null;
|
|
10977
|
+
}
|
|
10978
|
+
}
|
|
10979
|
+
// ---------------------------------------------------------------------------
|
|
10980
|
+
// Transcription with timeout + ASR error recovery
|
|
10981
|
+
// ---------------------------------------------------------------------------
|
|
10982
|
+
async transcribeWithTimeout(audio) {
|
|
10983
|
+
if (!this.asr) return null;
|
|
10984
|
+
const timeoutMs = this.config.transcriptionTimeoutMs ?? 1e4;
|
|
10985
|
+
const startTime = performance.now();
|
|
10986
|
+
try {
|
|
10987
|
+
const result = await Promise.race([
|
|
10988
|
+
this.asr.transcribe(audio),
|
|
10989
|
+
new Promise(
|
|
10990
|
+
(_, reject) => setTimeout(() => reject(new Error(`Transcription timed out after ${timeoutMs}ms`)), timeoutMs)
|
|
10991
|
+
)
|
|
10992
|
+
]);
|
|
10993
|
+
this.asrErrorCount = 0;
|
|
10994
|
+
return {
|
|
10995
|
+
text: result.text,
|
|
10996
|
+
emotion: result.emotion,
|
|
10997
|
+
language: result.language,
|
|
10998
|
+
isFinal: false,
|
|
10999
|
+
inferenceTimeMs: performance.now() - startTime
|
|
11000
|
+
};
|
|
11001
|
+
} catch (error) {
|
|
11002
|
+
this.asrErrorCount++;
|
|
11003
|
+
logger19.warn("Transcription failed", {
|
|
11004
|
+
attempt: this.asrErrorCount,
|
|
11005
|
+
error: String(error)
|
|
11006
|
+
});
|
|
11007
|
+
if (this.asrErrorCount >= 3) {
|
|
11008
|
+
logger19.warn("3 consecutive ASR errors, recreating session");
|
|
11009
|
+
try {
|
|
11010
|
+
await this.asr.dispose();
|
|
11011
|
+
this.asr = createSenseVoice({
|
|
11012
|
+
modelUrl: this.config.models.senseVoice.modelUrl,
|
|
11013
|
+
tokensUrl: this.config.models.senseVoice.tokensUrl,
|
|
11014
|
+
language: this.config.models.senseVoice.language,
|
|
11015
|
+
unifiedWorker: this.unifiedWorker ?? void 0
|
|
11016
|
+
});
|
|
11017
|
+
await this.asr.load();
|
|
11018
|
+
this.asrErrorCount = 0;
|
|
11019
|
+
} catch (recreateErr) {
|
|
11020
|
+
logger19.error("ASR session recreation failed", { error: String(recreateErr) });
|
|
11021
|
+
}
|
|
11022
|
+
}
|
|
11023
|
+
return null;
|
|
11024
|
+
}
|
|
11025
|
+
}
|
|
11026
|
+
// ---------------------------------------------------------------------------
|
|
11027
|
+
// Audio normalization
|
|
11028
|
+
// ---------------------------------------------------------------------------
|
|
11029
|
+
normalizeAudio(audio) {
|
|
11030
|
+
if (!(this.config.normalizeAudio ?? true)) return audio;
|
|
11031
|
+
let maxAbs = 0;
|
|
11032
|
+
for (let i = 0; i < audio.length; i++) {
|
|
11033
|
+
const abs = Math.abs(audio[i]);
|
|
11034
|
+
if (abs > maxAbs) maxAbs = abs;
|
|
11035
|
+
}
|
|
11036
|
+
if (maxAbs >= 0.1 || maxAbs === 0) return audio;
|
|
11037
|
+
const gain = 0.5 / maxAbs;
|
|
11038
|
+
const normalized = new Float32Array(audio.length);
|
|
11039
|
+
for (let i = 0; i < audio.length; i++) {
|
|
11040
|
+
normalized[i] = audio[i] * gain;
|
|
11041
|
+
}
|
|
11042
|
+
return normalized;
|
|
11043
|
+
}
|
|
11044
|
+
// ---------------------------------------------------------------------------
|
|
11045
|
+
// Helpers
|
|
11046
|
+
// ---------------------------------------------------------------------------
|
|
11047
|
+
setState(state) {
|
|
11048
|
+
if (this._state === state) return;
|
|
11049
|
+
logger19.info("State transition", { from: this._state, to: state });
|
|
11050
|
+
this._state = state;
|
|
11051
|
+
this.emit("state", state);
|
|
11052
|
+
}
|
|
11053
|
+
emitProgress(currentModel, progress, totalModels, modelsLoaded) {
|
|
11054
|
+
this.emit("loading:progress", { currentModel, progress, totalModels, modelsLoaded });
|
|
11055
|
+
}
|
|
11056
|
+
clearSilenceTimer() {
|
|
11057
|
+
if (this.silenceTimer) {
|
|
11058
|
+
clearTimeout(this.silenceTimer);
|
|
11059
|
+
this.silenceTimer = null;
|
|
11060
|
+
}
|
|
11061
|
+
}
|
|
11062
|
+
};
|
|
11063
|
+
|
|
9987
11064
|
// ../types/dist/index.mjs
|
|
9988
11065
|
var PROTOCOL_VERSION = 1;
|
|
9989
11066
|
function isProtocolEvent(obj) {
|
|
@@ -9992,7 +11069,9 @@ function isProtocolEvent(obj) {
|
|
|
9992
11069
|
export {
|
|
9993
11070
|
A2EOrchestrator,
|
|
9994
11071
|
A2EProcessor,
|
|
11072
|
+
ALL_AUS,
|
|
9995
11073
|
ARKIT_BLENDSHAPES,
|
|
11074
|
+
AU_TO_ARKIT,
|
|
9996
11075
|
AnimationGraph,
|
|
9997
11076
|
AudioChunkCoalescer,
|
|
9998
11077
|
AudioEnergyAnalyzer,
|
|
@@ -10003,24 +11082,31 @@ export {
|
|
|
10003
11082
|
ConsoleExporter,
|
|
10004
11083
|
DEFAULT_ANIMATION_CONFIG,
|
|
10005
11084
|
DEFAULT_LOGGING_CONFIG,
|
|
11085
|
+
DEFAULT_MODEL_URLS,
|
|
10006
11086
|
EMOTION_NAMES,
|
|
11087
|
+
EMOTION_TO_AU,
|
|
10007
11088
|
EMOTION_VECTOR_SIZE,
|
|
10008
11089
|
EmotionController,
|
|
10009
11090
|
EmotionPresets,
|
|
11091
|
+
EmotionResolver,
|
|
10010
11092
|
EmphasisDetector,
|
|
10011
11093
|
EventEmitter,
|
|
11094
|
+
FaceCompositor,
|
|
10012
11095
|
FullFacePipeline,
|
|
11096
|
+
HF_CDN_URLS,
|
|
10013
11097
|
INFERENCE_LATENCY_BUCKETS,
|
|
10014
11098
|
InterruptionHandler,
|
|
10015
11099
|
LAM_BLENDSHAPES,
|
|
10016
11100
|
LOG_LEVEL_PRIORITY,
|
|
10017
11101
|
MODEL_LOAD_TIME_BUCKETS,
|
|
10018
11102
|
MetricNames,
|
|
11103
|
+
MicLipSync,
|
|
10019
11104
|
MicrophoneCapture,
|
|
10020
11105
|
ModelCache,
|
|
10021
11106
|
OTLPExporter,
|
|
10022
11107
|
OmoteTelemetry,
|
|
10023
11108
|
PROTOCOL_VERSION,
|
|
11109
|
+
PlaybackPipeline,
|
|
10024
11110
|
ProceduralLifeLayer,
|
|
10025
11111
|
RingBuffer,
|
|
10026
11112
|
SafariSpeechRecognition,
|
|
@@ -10031,15 +11117,18 @@ export {
|
|
|
10031
11117
|
SileroVADUnifiedAdapter,
|
|
10032
11118
|
SileroVADWorker,
|
|
10033
11119
|
UnifiedInferenceWorker,
|
|
11120
|
+
VoicePipeline,
|
|
10034
11121
|
Wav2ArkitCpuInference,
|
|
10035
11122
|
Wav2ArkitCpuUnifiedAdapter,
|
|
10036
11123
|
Wav2ArkitCpuWorker,
|
|
10037
11124
|
Wav2Vec2Inference,
|
|
11125
|
+
applyProfile,
|
|
10038
11126
|
blendEmotions,
|
|
10039
11127
|
calculatePeak,
|
|
10040
11128
|
calculateRMS,
|
|
10041
11129
|
configureCacheLimit,
|
|
10042
11130
|
configureLogging,
|
|
11131
|
+
configureModelUrls,
|
|
10043
11132
|
configureTelemetry,
|
|
10044
11133
|
createA2E,
|
|
10045
11134
|
createEmotionVector,
|
|
@@ -10070,6 +11159,7 @@ export {
|
|
|
10070
11159
|
noopLogger,
|
|
10071
11160
|
preloadModels,
|
|
10072
11161
|
resetLoggingConfig,
|
|
11162
|
+
resetModelUrls,
|
|
10073
11163
|
resolveBackend,
|
|
10074
11164
|
setLogLevel,
|
|
10075
11165
|
setLoggingEnabled,
|