@omote/core 0.5.6 → 0.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +484 -35
- package/dist/index.d.ts +484 -35
- package/dist/index.js +1191 -495
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +1186 -490
- package/dist/index.mjs.map +1 -1
- package/package.json +3 -2
package/dist/index.mjs
CHANGED
|
@@ -762,6 +762,24 @@ var A2EProcessor = class {
|
|
|
762
762
|
}
|
|
763
763
|
};
|
|
764
764
|
|
|
765
|
+
// src/audio/audioUtils.ts
|
|
766
|
+
/**
 * Convert raw 16-bit signed PCM audio into normalized Float32 samples.
 *
 * Accepts either an ArrayBuffer (or SharedArrayBuffer) or any ArrayBuffer view
 * (e.g. a Uint8Array holding PCM16 bytes). Views are interpreted as raw bytes
 * in platform byte order, NOT element-by-element, so a Uint8Array of PCM16
 * bytes yields one sample per two bytes. A trailing odd byte is ignored.
 *
 * @param {ArrayBuffer|ArrayBufferView} buffer - PCM16 audio bytes.
 * @returns {Float32Array} Samples scaled by 1/32768, i.e. in [-1, 1).
 */
function pcm16ToFloat32(buffer) {
  // Drop a dangling odd byte: a 16-bit sample needs two bytes.
  const byteLen = buffer.byteLength & ~1;
  let int16;
  if (ArrayBuffer.isView(buffer)) {
    if (buffer.byteOffset % 2 === 0) {
      // Aligned view: reinterpret the same memory without copying.
      int16 = new Int16Array(buffer.buffer, buffer.byteOffset, byteLen / 2);
    } else {
      // Int16Array requires a 2-byte aligned offset; copy the bytes into a
      // fresh (offset 0) buffer first.
      const bytes = new Uint8Array(buffer.buffer, buffer.byteOffset, byteLen);
      int16 = new Int16Array(bytes.slice().buffer);
    }
  } else {
    int16 = new Int16Array(buffer, 0, byteLen / 2);
  }
  return int16ToFloat32(int16);
}

/**
 * Convert 16-bit signed integer samples into normalized Float32 samples.
 *
 * @param {Int16Array} int16 - Signed 16-bit samples.
 * @returns {Float32Array} A new array of the same length, each sample divided by 32768.
 */
function int16ToFloat32(int16) {
  const float32 = new Float32Array(int16.length);
  for (let i = 0; i < int16.length; i++) {
    float32[i] = int16[i] / 32768;
  }
  return float32;
}
|
|
782
|
+
|
|
765
783
|
// src/telemetry/exporters/console.ts
|
|
766
784
|
var ConsoleExporter = class {
|
|
767
785
|
constructor(options = {}) {
|
|
@@ -2815,19 +2833,7 @@ _Wav2Vec2Inference.INFERENCE_TIMEOUT_MS = 5e3;
|
|
|
2815
2833
|
_Wav2Vec2Inference.isWebGPUAvailable = isWebGPUAvailable;
|
|
2816
2834
|
var Wav2Vec2Inference = _Wav2Vec2Inference;
|
|
2817
2835
|
|
|
2818
|
-
// src/audio/
|
|
2819
|
-
/**
 * Convert raw 16-bit signed PCM audio into normalized Float32 samples.
 *
 * Accepts an ArrayBuffer (or SharedArrayBuffer) or any ArrayBuffer view such
 * as a Uint8Array of PCM16 bytes. Views are reinterpreted as raw bytes in
 * platform byte order rather than converted element-by-element. A trailing
 * odd byte is ignored.
 *
 * @param {ArrayBuffer|ArrayBufferView} buffer - PCM16 audio bytes.
 * @returns {Float32Array} Samples scaled by 1/32768, i.e. in [-1, 1).
 */
function pcm16ToFloat32(buffer) {
  // A 16-bit sample needs two bytes, so round the length down to even.
  const byteLen = buffer.byteLength & ~1;
  let int16;
  if (ArrayBuffer.isView(buffer)) {
    if (buffer.byteOffset % 2 === 0) {
      int16 = new Int16Array(buffer.buffer, buffer.byteOffset, byteLen / 2);
    } else {
      // Int16Array needs a 2-byte aligned offset; copy to a fresh buffer.
      const bytes = new Uint8Array(buffer.buffer, buffer.byteOffset, byteLen);
      int16 = new Int16Array(bytes.slice().buffer);
    }
  } else {
    int16 = new Int16Array(buffer, 0, byteLen / 2);
  }
  const float32 = new Float32Array(int16.length);
  for (let i = 0; i < int16.length; i++) {
    float32[i] = int16[i] / 32768;
  }
  return float32;
}
|
|
2828
|
-
|
|
2829
|
-
// src/audio/FullFacePipeline.ts
|
|
2830
|
-
var logger4 = createLogger("FullFacePipeline");
|
|
2836
|
+
// src/audio/expressionProfile.ts
|
|
2831
2837
|
var BLENDSHAPE_TO_GROUP = /* @__PURE__ */ new Map();
|
|
2832
2838
|
for (const name of LAM_BLENDSHAPES) {
|
|
2833
2839
|
if (name.startsWith("eye")) {
|
|
@@ -2846,6 +2852,24 @@ for (const name of LAM_BLENDSHAPES) {
|
|
|
2846
2852
|
BLENDSHAPE_TO_GROUP.set(name, "tongue");
|
|
2847
2853
|
}
|
|
2848
2854
|
}
|
|
2855
|
+
/**
 * Apply ExpressionProfile scaling to a frame of 52 raw blendshape weights.
 *
 * For each blendshape index:
 *   1. If `profile.overrides` has an entry for the blendshape name, that entry
 *      is the scaler.
 *   2. Otherwise the scaler is the profile value for the blendshape's group
 *      (looked up in BLENDSHAPE_TO_GROUP), defaulting to 1.
 *   3. The scaled value is clamped to [0, 1].
 *
 * A missing (null/undefined) profile behaves like an empty one, and a NaN
 * product (e.g. `raw` has fewer than 52 entries) is written as 0 instead of
 * leaking NaN into the output, since Math.min/Math.max do not sanitize NaN.
 *
 * @param {ArrayLike<number>} raw - Raw blendshape weights, indexed like LAM_BLENDSHAPES.
 * @param {object} [profile] - Group scalers plus an optional `overrides` map keyed by blendshape name.
 * @returns {Float32Array} A new 52-element array; `raw` is not modified.
 */
function applyProfile(raw, profile) {
  const scaled = new Float32Array(52);
  const overrides = profile?.overrides;
  for (let i = 0; i < 52; i++) {
    const name = LAM_BLENDSHAPES[i];
    let scaler;
    if (overrides && overrides[name] !== void 0) {
      scaler = overrides[name];
    } else {
      const group = BLENDSHAPE_TO_GROUP.get(name);
      scaler = group ? (profile?.[group] ?? 1) : 1;
    }
    const value = raw[i] * scaler;
    scaled[i] = Number.isNaN(value) ? 0 : Math.min(1, Math.max(0, value));
  }
  return scaled;
}
|
|
2870
|
+
|
|
2871
|
+
// src/audio/FullFacePipeline.ts
|
|
2872
|
+
var logger4 = createLogger("FullFacePipeline");
|
|
2849
2873
|
var FullFacePipeline = class extends EventEmitter {
|
|
2850
2874
|
constructor(options) {
|
|
2851
2875
|
super();
|
|
@@ -2910,25 +2934,10 @@ var FullFacePipeline = class extends EventEmitter {
|
|
|
2910
2934
|
/**
|
|
2911
2935
|
* Apply ExpressionProfile scaling to raw A2E blendshapes.
|
|
2912
2936
|
*
|
|
2913
|
-
*
|
|
2914
|
-
* 1. If an override exists for the blendshape name, use override as scaler
|
|
2915
|
-
* 2. Otherwise, use the group scaler (default 1.0)
|
|
2916
|
-
* 3. Clamp result to [0, 1]
|
|
2937
|
+
* Delegates to the standalone applyProfile() utility from expressionProfile.ts.
|
|
2917
2938
|
*/
|
|
2918
2939
|
applyProfile(raw) {
|
|
2919
|
-
|
|
2920
|
-
for (let i = 0; i < 52; i++) {
|
|
2921
|
-
const name = LAM_BLENDSHAPES[i];
|
|
2922
|
-
let scaler;
|
|
2923
|
-
if (this.profile.overrides && this.profile.overrides[name] !== void 0) {
|
|
2924
|
-
scaler = this.profile.overrides[name];
|
|
2925
|
-
} else {
|
|
2926
|
-
const group = BLENDSHAPE_TO_GROUP.get(name);
|
|
2927
|
-
scaler = group ? this.profile[group] ?? 1 : 1;
|
|
2928
|
-
}
|
|
2929
|
-
scaled[i] = Math.min(1, Math.max(0, raw[i] * scaler));
|
|
2930
|
-
}
|
|
2931
|
-
return scaled;
|
|
2940
|
+
return applyProfile(raw, this.profile);
|
|
2932
2941
|
}
|
|
2933
2942
|
/**
|
|
2934
2943
|
* Start a new playback session
|
|
@@ -3113,6 +3122,329 @@ var FullFacePipeline = class extends EventEmitter {
|
|
|
3113
3122
|
}
|
|
3114
3123
|
};
|
|
3115
3124
|
|
|
3125
|
+
// src/audio/PlaybackPipeline.ts
|
|
3126
|
+
var logger5 = createLogger("PlaybackPipeline");
|
|
3127
|
+
/**
 * Audio playback pipeline that schedules audio for playback, runs A2E
 * (audio-to-expression) inference on the same audio, and emits per-frame
 * blendshapes from a requestAnimationFrame loop.
 *
 * Events emitted (each newer colon-style name is emitted together with an
 * older underscore-style name):
 *   - "state"                                   state string on every change
 *   - "playback:start" / "playback_start"       first chunk scheduled
 *   - "playback:complete" / "playback_complete" audio finished and frame queue drained
 *   - "playback:stop"                           stop() finished
 *   - "frame" / "full_frame_ready"              { blendshapes, rawBlendshapes, timestamp }
 *   - "frame:raw" / "lam_frame_ready"           raw A2E frame
 *   - "error"                                   A2E inference error
 */
var PlaybackPipeline = class extends EventEmitter {
  /**
   * @param {object} config - Pipeline configuration. Reads `lam` (A2E backend,
   *   required), and optionally `sampleRate`, `profile`, `staleThresholdMs`,
   *   `neutralTransitionEnabled`, `neutralTransitionMs`, `chunkSize`,
   *   `audioDelayMs`, `chunkTargetMs`, `identityIndex`.
   */
  constructor(config) {
    super();
    this.config = config;
    this._state = "idle";
    this.playbackStarted = false;
    this.monitorInterval = null;
    this.frameAnimationId = null;
    // Stale frame detection
    this.lastNewFrameTime = 0;
    this.lastKnownLamFrame = null;
    this.staleWarningEmitted = false;
    // Diagnostic counter
    this.frameLoopCount = 0;
    // Neutral transition (ease back to zeroed blendshapes after playback)
    this.neutralTransitionFrame = null;
    this.neutralTransitionStart = 0;
    this.neutralAnimationId = null;
    // Current frame refs
    this._currentFrame = null;
    this._currentRawFrame = null;
    this.sampleRate = config.sampleRate ?? 16e3;
    this.profile = config.profile ?? {};
    this.staleThresholdMs = config.staleThresholdMs ?? 2e3;
    this.neutralTransitionEnabled = config.neutralTransitionEnabled ?? false;
    this.neutralTransitionMs = config.neutralTransitionMs ?? 250;
    // Default audio delay = time to accumulate one inference chunk
    // + an estimate of inference latency for the backend + a fixed margin.
    const isCpuModel = config.lam.modelId === "wav2arkit_cpu";
    const chunkSize = config.chunkSize ?? config.lam.chunkSize ?? 16e3;
    const chunkAccumulationMs = chunkSize / this.sampleRate * 1e3;
    const inferenceEstimateMs = isCpuModel ? 300 : config.lam.backend === "wasm" ? 250 : 80;
    const marginMs = 100;
    const autoDelay = Math.ceil(chunkAccumulationMs + inferenceEstimateMs + marginMs);
    const audioDelayMs = config.audioDelayMs ?? autoDelay;
    logger5.info("PlaybackPipeline config", {
      chunkSize,
      audioDelayMs,
      autoDelay,
      backend: config.lam.backend,
      modelId: config.lam.modelId,
      neutralTransitionEnabled: this.neutralTransitionEnabled
    });
    this.scheduler = new AudioScheduler({
      sampleRate: this.sampleRate,
      initialLookaheadSec: audioDelayMs / 1e3
    });
    this.coalescer = new AudioChunkCoalescer({
      sampleRate: this.sampleRate,
      targetDurationMs: config.chunkTargetMs ?? 200
    });
    this.processor = new A2EProcessor({
      backend: config.lam,
      sampleRate: this.sampleRate,
      chunkSize,
      identityIndex: config.identityIndex,
      onError: (error) => {
        logger5.error("A2E inference error", { message: error.message, stack: error.stack });
        this.emit("error", error);
      }
    });
  }
  /** Current pipeline state */
  get state() {
    return this._state;
  }
  /** Current scaled blendshapes (a new array is assigned for each frame), or null */
  get currentFrame() {
    return this._currentFrame;
  }
  /** Raw A2E blendshapes (before profile scaling), or null */
  get currentRawFrame() {
    return this._currentRawFrame;
  }
  // ---------------------------------------------------------------------------
  // Lifecycle
  // ---------------------------------------------------------------------------
  /** Initialize AudioContext (lazy, call after user gesture) */
  async initialize() {
    await this.scheduler.initialize();
  }
  /** Update ExpressionProfile at runtime */
  setProfile(profile) {
    this.profile = profile;
  }
  // ---------------------------------------------------------------------------
  // Async mode (streaming TTS)
  // ---------------------------------------------------------------------------
  /**
   * Start a new playback session.
   * Idempotent — calling during playback resets cleanly without emitting
   * spurious playback:complete.
   */
  start() {
    this.stopInternal(false);
    this.scheduler.reset();
    this.coalescer.reset();
    this.processor.reset();
    this.playbackStarted = false;
    this.lastNewFrameTime = 0;
    this.lastKnownLamFrame = null;
    this.staleWarningEmitted = false;
    this.frameLoopCount = 0;
    this._currentFrame = null;
    this._currentRawFrame = null;
    this.cancelNeutralTransition();
    this.scheduler.warmup();
    this.startFrameLoop();
    this.startMonitoring();
    this.setState("playing");
  }
  /** Feed a streaming audio chunk (PCM16 Uint8Array) */
  async onAudioChunk(chunk) {
    // The coalescer returns nothing until enough audio has accumulated.
    const combined = this.coalescer.add(chunk);
    if (!combined) return;
    const float32 = pcm16ToFloat32(combined);
    const scheduleTime = await this.scheduler.schedule(float32);
    if (!this.playbackStarted) {
      this.playbackStarted = true;
      this.emit("playback:start", { time: scheduleTime });
      this.emit("playback_start", scheduleTime);
    }
    this.processor.pushAudio(float32, scheduleTime);
  }
  /** Signal end of audio stream (flushes remaining audio) */
  async end() {
    const remaining = this.coalescer.flush();
    if (remaining) {
      const chunk = new Uint8Array(remaining);
      await this.onAudioChunk(chunk);
    }
    await this.processor.flush();
  }
  // ---------------------------------------------------------------------------
  // Sync mode (full buffer)
  // ---------------------------------------------------------------------------
  /**
   * Feed a complete audio buffer. Chunks into 200ms pieces, schedules each
   * for playback, runs A2E inference, then waits for completion.
   *
   * The returned promise settles when "playback:complete" fires, or when
   * stop() is called (on "playback:stop"), so a caller awaiting it is not
   * left hanging by an interruption.
   *
   * @param {Float32Array|ArrayBuffer|ArrayBufferView} audio - Float32 samples, or PCM16 bytes.
   * @returns {Promise<void>}
   */
  async feedBuffer(audio) {
    const float32 = audio instanceof Float32Array ? audio : pcm16ToFloat32(audio);
    this.start();
    // Subscribe BEFORE feeding: completion can be emitted by the monitor while
    // we are still awaiting scheduler/processor work, and a listener added
    // afterwards would miss it and never resolve.
    let unsubComplete = null;
    let unsubStop = null;
    const cleanup = () => {
      unsubComplete?.();
      unsubStop?.();
      unsubComplete = null;
      unsubStop = null;
    };
    const finished = new Promise((resolve) => {
      const done = () => {
        cleanup();
        resolve();
      };
      unsubComplete = this.on("playback:complete", done);
      unsubStop = this.on("playback:stop", done);
    });
    try {
      const chunkSamples = Math.floor(this.sampleRate * 0.2);
      for (let i = 0; i < float32.length; i += chunkSamples) {
        const chunk = float32.subarray(i, Math.min(i + chunkSamples, float32.length));
        const scheduleTime = await this.scheduler.schedule(chunk);
        this.processor.pushAudio(chunk, scheduleTime);
        if (!this.playbackStarted) {
          this.playbackStarted = true;
          this.emit("playback:start", { time: scheduleTime });
          this.emit("playback_start", scheduleTime);
        }
      }
      await this.processor.flush();
    } catch (err) {
      // Do not leak the listeners if feeding fails.
      cleanup();
      throw err;
    }
    return finished;
  }
  // ---------------------------------------------------------------------------
  // Control
  // ---------------------------------------------------------------------------
  /** Stop playback immediately with fade-out */
  async stop(fadeOutMs = 50) {
    this.setState("stopping");
    this.stopInternal(true);
    // A neutral transition may still be animating from a just-completed
    // session; cancel it so no frames or state changes fire after stop.
    this.cancelNeutralTransition();
    await this.scheduler.cancelAll(fadeOutMs);
    this.coalescer.reset();
    this.processor.reset();
    this.playbackStarted = false;
    this._currentFrame = null;
    this._currentRawFrame = null;
    this.emit("playback:stop", void 0);
    this.setState("idle");
  }
  /** Cleanup all resources */
  dispose() {
    this.stopInternal(true);
    this.cancelNeutralTransition();
    this.scheduler.dispose();
    this.coalescer.reset();
    this.processor.dispose();
    // Assigned directly (no "state" event is emitted on dispose).
    this._state = "idle";
  }
  /** Get pipeline debug state */
  getDebugState() {
    return {
      state: this._state,
      playbackStarted: this.playbackStarted,
      coalescerFill: this.coalescer.fillLevel,
      processorFill: this.processor.fillLevel,
      queuedFrames: this.processor.queuedFrameCount,
      currentTime: this.scheduler.getCurrentTime(),
      playbackEndTime: this.scheduler.getPlaybackEndTime()
    };
  }
  // ---------------------------------------------------------------------------
  // Internal: Frame loop
  // ---------------------------------------------------------------------------
  /** Start the requestAnimationFrame loop that looks up and emits the frame for the current audio time. */
  startFrameLoop() {
    const updateFrame = () => {
      this.frameLoopCount++;
      const currentTime = this.scheduler.getCurrentTime();
      const lamFrame = this.processor.getFrameForTime(currentTime);
      // Track when the processor last handed back a different frame object.
      if (lamFrame && lamFrame !== this.lastKnownLamFrame) {
        this.lastNewFrameTime = performance.now();
        this.lastKnownLamFrame = lamFrame;
        this.staleWarningEmitted = false;
      }
      // Warn once per stall when no new frame arrived within the threshold.
      if (this.playbackStarted && this.lastNewFrameTime > 0 && performance.now() - this.lastNewFrameTime > this.staleThresholdMs) {
        if (!this.staleWarningEmitted) {
          this.staleWarningEmitted = true;
          logger5.warn("A2E stalled \u2014 no new inference frames", {
            staleDurationMs: Math.round(performance.now() - this.lastNewFrameTime),
            queuedFrames: this.processor.queuedFrameCount
          });
        }
      }
      if (lamFrame) {
        const scaled = applyProfile(lamFrame, this.profile);
        this._currentFrame = scaled;
        this._currentRawFrame = lamFrame;
        const fullFrame = {
          blendshapes: scaled,
          rawBlendshapes: lamFrame,
          timestamp: currentTime
        };
        this.emit("frame", fullFrame);
        this.emit("frame:raw", lamFrame);
        this.emit("full_frame_ready", fullFrame);
        this.emit("lam_frame_ready", lamFrame);
      }
      this.frameAnimationId = requestAnimationFrame(updateFrame);
    };
    this.frameAnimationId = requestAnimationFrame(updateFrame);
  }
  // ---------------------------------------------------------------------------
  // Internal: Playback monitoring
  // ---------------------------------------------------------------------------
  /** Poll every 100ms for "audio finished and frame queue empty". */
  startMonitoring() {
    if (this.monitorInterval) {
      clearInterval(this.monitorInterval);
    }
    this.monitorInterval = setInterval(() => {
      if (this.scheduler.isComplete() && this.processor.queuedFrameCount === 0) {
        this.onPlaybackComplete();
      }
    }, 100);
  }
  /** Stop the loops, announce completion, then go idle (optionally via a neutral transition). */
  onPlaybackComplete() {
    this.stopInternal(false);
    this.playbackStarted = false;
    this.emit("playback:complete", void 0);
    this.emit("playback_complete", void 0);
    if (this.neutralTransitionEnabled && this._currentFrame) {
      this.startNeutralTransition(this._currentFrame);
    } else {
      this.setState("idle");
    }
  }
  // ---------------------------------------------------------------------------
  // Internal: Neutral transition (opt-in)
  // ---------------------------------------------------------------------------
  /**
   * Ease the blendshapes from `fromFrame` to all-zero over neutralTransitionMs
   * (cubic ease-out), emitting a frame per animation tick, then go idle.
   */
  startNeutralTransition(fromFrame) {
    // Copy: the animation must not depend on the caller's array staying intact.
    this.neutralTransitionFrame = new Float32Array(fromFrame);
    this.neutralTransitionStart = performance.now();
    const animate = () => {
      const elapsed = performance.now() - this.neutralTransitionStart;
      // A non-positive duration means "finish immediately"; this also avoids
      // 0/0 = NaN, which would poison every blendshape for a frame.
      const t = this.neutralTransitionMs > 0 ? Math.min(1, elapsed / this.neutralTransitionMs) : 1;
      const eased = 1 - Math.pow(1 - t, 3);
      const blendshapes = new Float32Array(52);
      for (let i = 0; i < 52; i++) {
        blendshapes[i] = this.neutralTransitionFrame[i] * (1 - eased);
      }
      this._currentFrame = blendshapes;
      const frame = {
        blendshapes,
        rawBlendshapes: blendshapes,
        // raw = scaled during transition
        timestamp: performance.now() / 1e3
      };
      this.emit("frame", frame);
      this.emit("full_frame_ready", frame);
      if (t >= 1) {
        // Finished: clear the handle so it is not mistaken for a live animation.
        this.neutralAnimationId = null;
        this.neutralTransitionFrame = null;
        this._currentFrame = null;
        this._currentRawFrame = null;
        this.setState("idle");
        return;
      }
      this.neutralAnimationId = requestAnimationFrame(animate);
    };
    this.neutralAnimationId = requestAnimationFrame(animate);
  }
  /** Cancel a pending neutral-transition animation frame, if any. */
  cancelNeutralTransition() {
    if (this.neutralAnimationId) {
      cancelAnimationFrame(this.neutralAnimationId);
      this.neutralAnimationId = null;
    }
    this.neutralTransitionFrame = null;
  }
  // ---------------------------------------------------------------------------
  // Internal: Helpers
  // ---------------------------------------------------------------------------
  /**
   * Stop the completion monitor and the frame loop.
   * `emitEvents` is accepted for call-site compatibility; it is not read here.
   */
  stopInternal(emitEvents) {
    if (this.monitorInterval) {
      clearInterval(this.monitorInterval);
      this.monitorInterval = null;
    }
    if (this.frameAnimationId) {
      cancelAnimationFrame(this.frameAnimationId);
      this.frameAnimationId = null;
    }
  }
  /** Set the state and emit "state" only when it actually changes. */
  setState(state) {
    if (this._state === state) return;
    this._state = state;
    this.emit("state", state);
  }
};
|
|
3447
|
+
|
|
3116
3448
|
// src/audio/InterruptionHandler.ts
|
|
3117
3449
|
var InterruptionHandler = class extends EventEmitter {
|
|
3118
3450
|
constructor(config = {}) {
|
|
@@ -3500,7 +3832,7 @@ function ctcGreedyDecode(logits, seqLen, vocabSize, tokenMap) {
|
|
|
3500
3832
|
}
|
|
3501
3833
|
|
|
3502
3834
|
// src/inference/SenseVoiceInference.ts
|
|
3503
|
-
var
|
|
3835
|
+
var logger6 = createLogger("SenseVoice");
|
|
3504
3836
|
var _SenseVoiceInference = class _SenseVoiceInference {
|
|
3505
3837
|
constructor(config) {
|
|
3506
3838
|
this.session = null;
|
|
@@ -3553,26 +3885,26 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
3553
3885
|
"model.backend_requested": this.config.backend
|
|
3554
3886
|
});
|
|
3555
3887
|
try {
|
|
3556
|
-
|
|
3888
|
+
logger6.info("Loading ONNX Runtime...", { preference: this.config.backend });
|
|
3557
3889
|
const { ort, backend } = await getOnnxRuntimeForPreference(this.config.backend);
|
|
3558
3890
|
this.ort = ort;
|
|
3559
3891
|
this._backend = backend;
|
|
3560
|
-
|
|
3561
|
-
|
|
3892
|
+
logger6.info("ONNX Runtime loaded", { backend: this._backend });
|
|
3893
|
+
logger6.debug("Fetching tokens vocabulary", { tokensUrl: this.config.tokensUrl });
|
|
3562
3894
|
const tokensResponse = await fetch(this.config.tokensUrl);
|
|
3563
3895
|
if (!tokensResponse.ok) {
|
|
3564
3896
|
throw new Error(`Failed to fetch tokens.txt: ${tokensResponse.status} ${tokensResponse.statusText}`);
|
|
3565
3897
|
}
|
|
3566
3898
|
const tokensText = await tokensResponse.text();
|
|
3567
3899
|
this.tokenMap = parseTokensFile(tokensText);
|
|
3568
|
-
|
|
3900
|
+
logger6.debug("Tokens loaded", { vocabSize: this.tokenMap.size });
|
|
3569
3901
|
const sessionOptions = getSessionOptions(this._backend);
|
|
3570
3902
|
if (this._backend === "webgpu") {
|
|
3571
3903
|
sessionOptions.graphOptimizationLevel = "basic";
|
|
3572
3904
|
}
|
|
3573
3905
|
let isCached = false;
|
|
3574
3906
|
if (isIOS()) {
|
|
3575
|
-
|
|
3907
|
+
logger6.info("iOS: passing model URL directly to ORT (low-memory path)", {
|
|
3576
3908
|
modelUrl: this.config.modelUrl
|
|
3577
3909
|
});
|
|
3578
3910
|
this.session = await withTimeout(
|
|
@@ -3585,14 +3917,14 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
3585
3917
|
isCached = await cache.has(this.config.modelUrl);
|
|
3586
3918
|
let modelBuffer;
|
|
3587
3919
|
if (isCached) {
|
|
3588
|
-
|
|
3920
|
+
logger6.debug("Loading model from cache", { modelUrl: this.config.modelUrl });
|
|
3589
3921
|
modelBuffer = await cache.get(this.config.modelUrl);
|
|
3590
3922
|
onProgress?.(modelBuffer.byteLength, modelBuffer.byteLength);
|
|
3591
3923
|
} else {
|
|
3592
|
-
|
|
3924
|
+
logger6.debug("Fetching and caching model", { modelUrl: this.config.modelUrl });
|
|
3593
3925
|
modelBuffer = await fetchWithCache(this.config.modelUrl, onProgress);
|
|
3594
3926
|
}
|
|
3595
|
-
|
|
3927
|
+
logger6.debug("Creating ONNX session", {
|
|
3596
3928
|
size: formatBytes(modelBuffer.byteLength),
|
|
3597
3929
|
backend: this._backend
|
|
3598
3930
|
});
|
|
@@ -3605,15 +3937,15 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
3605
3937
|
const cmvn = parseCMVNFromMetadata(metadata.neg_mean, metadata.inv_stddev);
|
|
3606
3938
|
this.negMean = cmvn.negMean;
|
|
3607
3939
|
this.invStddev = cmvn.invStddev;
|
|
3608
|
-
|
|
3940
|
+
logger6.debug("CMVN loaded from model metadata", { dim: this.negMean.length });
|
|
3609
3941
|
} else {
|
|
3610
|
-
|
|
3942
|
+
logger6.warn("CMVN not found in model metadata \u2014 features will not be normalized");
|
|
3611
3943
|
}
|
|
3612
3944
|
} catch (cmvnErr) {
|
|
3613
|
-
|
|
3945
|
+
logger6.warn("Failed to read CMVN from model metadata", { error: cmvnErr });
|
|
3614
3946
|
}
|
|
3615
3947
|
const loadTimeMs = performance.now() - startTime;
|
|
3616
|
-
|
|
3948
|
+
logger6.info("SenseVoice model loaded", {
|
|
3617
3949
|
backend: this._backend,
|
|
3618
3950
|
loadTimeMs: Math.round(loadTimeMs),
|
|
3619
3951
|
vocabSize: this.tokenMap.size,
|
|
@@ -3724,7 +4056,7 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
3724
4056
|
const vocabSize = logitsDims[2];
|
|
3725
4057
|
const decoded = ctcGreedyDecode(logitsData, seqLen, vocabSize, this.tokenMap);
|
|
3726
4058
|
const inferenceTimeMs = performance.now() - startTime;
|
|
3727
|
-
|
|
4059
|
+
logger6.trace("Transcription complete", {
|
|
3728
4060
|
text: decoded.text.substring(0, 50),
|
|
3729
4061
|
language: decoded.language,
|
|
3730
4062
|
emotion: decoded.emotion,
|
|
@@ -3762,7 +4094,7 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
3762
4094
|
const errMsg = err instanceof Error ? err.message : String(err);
|
|
3763
4095
|
if (errMsg.includes("timed out")) {
|
|
3764
4096
|
this.poisoned = true;
|
|
3765
|
-
|
|
4097
|
+
logger6.error("CRITICAL: Inference session timed out \u2014 SenseVoice is dead. Page reload required.", {
|
|
3766
4098
|
backend: this._backend,
|
|
3767
4099
|
timeoutMs: _SenseVoiceInference.INFERENCE_TIMEOUT_MS
|
|
3768
4100
|
});
|
|
@@ -3770,7 +4102,7 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
3770
4102
|
const oomError = new Error(
|
|
3771
4103
|
`SenseVoice inference failed with raw C++ exception pointer (0x${err.toString(16)}). This is likely an OOM crash in WASM. Try reloading the page.`
|
|
3772
4104
|
);
|
|
3773
|
-
|
|
4105
|
+
logger6.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
|
|
3774
4106
|
pointer: `0x${err.toString(16)}`,
|
|
3775
4107
|
backend: this._backend
|
|
3776
4108
|
});
|
|
@@ -3783,7 +4115,7 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
3783
4115
|
reject(oomError);
|
|
3784
4116
|
return;
|
|
3785
4117
|
} else {
|
|
3786
|
-
|
|
4118
|
+
logger6.error("Inference failed", { error: errMsg, backend: this._backend });
|
|
3787
4119
|
}
|
|
3788
4120
|
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
3789
4121
|
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
@@ -3812,9 +4144,9 @@ _SenseVoiceInference.INFERENCE_TIMEOUT_MS = 1e4;
|
|
|
3812
4144
|
var SenseVoiceInference = _SenseVoiceInference;
|
|
3813
4145
|
|
|
3814
4146
|
// src/inference/SenseVoiceWorker.ts
|
|
3815
|
-
var
|
|
4147
|
+
var logger7 = createLogger("SenseVoiceWorker");
|
|
3816
4148
|
var WASM_CDN_PATH2 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
|
|
3817
|
-
var LOAD_TIMEOUT_MS =
|
|
4149
|
+
var LOAD_TIMEOUT_MS = 3e5;
|
|
3818
4150
|
var INFERENCE_TIMEOUT_MS = 1e4;
|
|
3819
4151
|
function resolveUrl(url) {
|
|
3820
4152
|
if (/^https?:\/\//i.test(url) || /^blob:/i.test(url)) return url;
|
|
@@ -4551,7 +4883,7 @@ var SenseVoiceWorker = class {
|
|
|
4551
4883
|
this.handleWorkerMessage(event.data);
|
|
4552
4884
|
};
|
|
4553
4885
|
worker.onerror = (error) => {
|
|
4554
|
-
|
|
4886
|
+
logger7.error("Worker error", { error: error.message });
|
|
4555
4887
|
for (const [, resolver] of this.pendingResolvers) {
|
|
4556
4888
|
resolver.reject(new Error(`Worker error: ${error.message}`));
|
|
4557
4889
|
}
|
|
@@ -4631,9 +4963,9 @@ var SenseVoiceWorker = class {
|
|
|
4631
4963
|
"model.language": this.config.language
|
|
4632
4964
|
});
|
|
4633
4965
|
try {
|
|
4634
|
-
|
|
4966
|
+
logger7.info("Creating SenseVoice worker...");
|
|
4635
4967
|
this.worker = this.createWorker();
|
|
4636
|
-
|
|
4968
|
+
logger7.info("Loading model in worker...", {
|
|
4637
4969
|
modelUrl: this.config.modelUrl,
|
|
4638
4970
|
tokensUrl: this.config.tokensUrl,
|
|
4639
4971
|
language: this.config.language,
|
|
@@ -4655,7 +4987,7 @@ var SenseVoiceWorker = class {
|
|
|
4655
4987
|
this._isLoaded = true;
|
|
4656
4988
|
const loadTimeMs = performance.now() - startTime;
|
|
4657
4989
|
onProgress?.(1, 1);
|
|
4658
|
-
|
|
4990
|
+
logger7.info("SenseVoice worker loaded successfully", {
|
|
4659
4991
|
backend: "wasm",
|
|
4660
4992
|
loadTimeMs: Math.round(loadTimeMs),
|
|
4661
4993
|
workerLoadTimeMs: Math.round(result.loadTimeMs),
|
|
@@ -4734,7 +5066,7 @@ var SenseVoiceWorker = class {
|
|
|
4734
5066
|
INFERENCE_TIMEOUT_MS
|
|
4735
5067
|
);
|
|
4736
5068
|
const totalTimeMs = performance.now() - startTime;
|
|
4737
|
-
|
|
5069
|
+
logger7.trace("Worker transcription complete", {
|
|
4738
5070
|
text: result.text.substring(0, 50),
|
|
4739
5071
|
language: result.language,
|
|
4740
5072
|
emotion: result.emotion,
|
|
@@ -4770,11 +5102,11 @@ var SenseVoiceWorker = class {
|
|
|
4770
5102
|
} catch (err) {
|
|
4771
5103
|
const errMsg = err instanceof Error ? err.message : String(err);
|
|
4772
5104
|
if (errMsg.includes("timed out")) {
|
|
4773
|
-
|
|
5105
|
+
logger7.error("CRITICAL: Worker inference timed out \u2014 SenseVoice worker is dead. Page reload required.", {
|
|
4774
5106
|
timeoutMs: INFERENCE_TIMEOUT_MS
|
|
4775
5107
|
});
|
|
4776
5108
|
} else {
|
|
4777
|
-
|
|
5109
|
+
logger7.error("Worker inference failed", { error: errMsg });
|
|
4778
5110
|
}
|
|
4779
5111
|
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
4780
5112
|
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
@@ -4812,14 +5144,14 @@ var SenseVoiceWorker = class {
|
|
|
4812
5144
|
};
|
|
4813
5145
|
|
|
4814
5146
|
// src/inference/UnifiedInferenceWorker.ts
|
|
4815
|
-
var
|
|
5147
|
+
var logger8 = createLogger("UnifiedInferenceWorker");
|
|
4816
5148
|
var WASM_CDN_PATH3 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
|
|
4817
|
-
var INIT_TIMEOUT_MS =
|
|
4818
|
-
var SV_LOAD_TIMEOUT_MS =
|
|
5149
|
+
var INIT_TIMEOUT_MS = 6e4;
|
|
5150
|
+
var SV_LOAD_TIMEOUT_MS = 3e5;
|
|
4819
5151
|
var SV_INFER_TIMEOUT_MS = 1e4;
|
|
4820
|
-
var CPU_LOAD_TIMEOUT_MS =
|
|
5152
|
+
var CPU_LOAD_TIMEOUT_MS = 42e4;
|
|
4821
5153
|
var CPU_INFER_TIMEOUT_MS = 5e3;
|
|
4822
|
-
var VAD_LOAD_TIMEOUT_MS =
|
|
5154
|
+
var VAD_LOAD_TIMEOUT_MS = 12e4;
|
|
4823
5155
|
var VAD_INFER_TIMEOUT_MS = 1e3;
|
|
4824
5156
|
var DISPOSE_TIMEOUT_MS = 5e3;
|
|
4825
5157
|
function resolveUrl2(url) {
|
|
@@ -5514,7 +5846,7 @@ var UnifiedInferenceWorker = class {
|
|
|
5514
5846
|
const telemetry = getTelemetry();
|
|
5515
5847
|
const span = telemetry?.startSpan("UnifiedInferenceWorker.init");
|
|
5516
5848
|
try {
|
|
5517
|
-
|
|
5849
|
+
logger8.info("Creating unified inference worker...");
|
|
5518
5850
|
this.worker = this.createWorker();
|
|
5519
5851
|
await this.sendMessage(
|
|
5520
5852
|
{ type: "init", wasmPaths: WASM_CDN_PATH3, isIOS: isIOS() },
|
|
@@ -5523,7 +5855,7 @@ var UnifiedInferenceWorker = class {
|
|
|
5523
5855
|
);
|
|
5524
5856
|
this.initialized = true;
|
|
5525
5857
|
const loadTimeMs = performance.now() - startTime;
|
|
5526
|
-
|
|
5858
|
+
logger8.info("Unified worker initialized", { loadTimeMs: Math.round(loadTimeMs) });
|
|
5527
5859
|
span?.setAttributes({ "worker.init_time_ms": loadTimeMs });
|
|
5528
5860
|
span?.end();
|
|
5529
5861
|
} catch (error) {
|
|
@@ -5697,7 +6029,7 @@ var UnifiedInferenceWorker = class {
|
|
|
5697
6029
|
this.handleWorkerMessage(event.data);
|
|
5698
6030
|
};
|
|
5699
6031
|
worker.onerror = (error) => {
|
|
5700
|
-
|
|
6032
|
+
logger8.error("Unified worker error", { error: error.message });
|
|
5701
6033
|
this.rejectAllPending(`Worker error: ${error.message}`);
|
|
5702
6034
|
};
|
|
5703
6035
|
return worker;
|
|
@@ -5711,7 +6043,7 @@ var UnifiedInferenceWorker = class {
|
|
|
5711
6043
|
this.pendingRequests.delete(requestId);
|
|
5712
6044
|
pending.reject(new Error(data.error));
|
|
5713
6045
|
} else {
|
|
5714
|
-
|
|
6046
|
+
logger8.error("Worker broadcast error", { error: data.error });
|
|
5715
6047
|
this.rejectAllPending(data.error);
|
|
5716
6048
|
}
|
|
5717
6049
|
return;
|
|
@@ -5733,7 +6065,7 @@ var UnifiedInferenceWorker = class {
|
|
|
5733
6065
|
const timeout = setTimeout(() => {
|
|
5734
6066
|
this.pendingRequests.delete(requestId);
|
|
5735
6067
|
this.poisoned = true;
|
|
5736
|
-
|
|
6068
|
+
logger8.error("CRITICAL: Worker operation timed out \u2014 worker is dead", {
|
|
5737
6069
|
type: message.type,
|
|
5738
6070
|
timeoutMs
|
|
5739
6071
|
});
|
|
@@ -5799,7 +6131,7 @@ var SenseVoiceUnifiedAdapter = class {
|
|
|
5799
6131
|
});
|
|
5800
6132
|
this._isLoaded = true;
|
|
5801
6133
|
onProgress?.(1, 1);
|
|
5802
|
-
|
|
6134
|
+
logger8.info("SenseVoice loaded via unified worker", {
|
|
5803
6135
|
backend: "wasm",
|
|
5804
6136
|
loadTimeMs: Math.round(result.loadTimeMs),
|
|
5805
6137
|
vocabSize: result.vocabSize
|
|
@@ -5864,7 +6196,7 @@ var Wav2ArkitCpuUnifiedAdapter = class {
|
|
|
5864
6196
|
externalDataUrl: externalDataUrl || null
|
|
5865
6197
|
});
|
|
5866
6198
|
this._isLoaded = true;
|
|
5867
|
-
|
|
6199
|
+
logger8.info("Wav2ArkitCpu loaded via unified worker", {
|
|
5868
6200
|
backend: "wasm",
|
|
5869
6201
|
loadTimeMs: Math.round(result.loadTimeMs)
|
|
5870
6202
|
});
|
|
@@ -5970,7 +6302,7 @@ var SileroVADUnifiedAdapter = class {
|
|
|
5970
6302
|
sampleRate: this.config.sampleRate
|
|
5971
6303
|
});
|
|
5972
6304
|
this._isLoaded = true;
|
|
5973
|
-
|
|
6305
|
+
logger8.info("SileroVAD loaded via unified worker", {
|
|
5974
6306
|
backend: "wasm",
|
|
5975
6307
|
loadTimeMs: Math.round(result.loadTimeMs),
|
|
5976
6308
|
sampleRate: this.config.sampleRate,
|
|
@@ -6051,10 +6383,10 @@ var SileroVADUnifiedAdapter = class {
|
|
|
6051
6383
|
};
|
|
6052
6384
|
|
|
6053
6385
|
// src/inference/createSenseVoice.ts
|
|
6054
|
-
var
|
|
6386
|
+
var logger9 = createLogger("createSenseVoice");
|
|
6055
6387
|
function createSenseVoice(config) {
|
|
6056
6388
|
if (config.unifiedWorker) {
|
|
6057
|
-
|
|
6389
|
+
logger9.info("Creating SenseVoiceUnifiedAdapter (shared unified worker)");
|
|
6058
6390
|
return new SenseVoiceUnifiedAdapter(config.unifiedWorker, {
|
|
6059
6391
|
modelUrl: config.modelUrl,
|
|
6060
6392
|
tokensUrl: config.tokensUrl,
|
|
@@ -6067,7 +6399,7 @@ function createSenseVoice(config) {
|
|
|
6067
6399
|
if (!SenseVoiceWorker.isSupported()) {
|
|
6068
6400
|
throw new Error("Web Workers are not supported in this environment");
|
|
6069
6401
|
}
|
|
6070
|
-
|
|
6402
|
+
logger9.info("Creating SenseVoiceWorker (off-main-thread)");
|
|
6071
6403
|
return new SenseVoiceWorker({
|
|
6072
6404
|
modelUrl: config.modelUrl,
|
|
6073
6405
|
tokensUrl: config.tokensUrl,
|
|
@@ -6076,7 +6408,7 @@ function createSenseVoice(config) {
|
|
|
6076
6408
|
});
|
|
6077
6409
|
}
|
|
6078
6410
|
if (useWorker === false) {
|
|
6079
|
-
|
|
6411
|
+
logger9.info("Creating SenseVoiceInference (main thread)");
|
|
6080
6412
|
return new SenseVoiceInference({
|
|
6081
6413
|
modelUrl: config.modelUrl,
|
|
6082
6414
|
tokensUrl: config.tokensUrl,
|
|
@@ -6085,7 +6417,7 @@ function createSenseVoice(config) {
|
|
|
6085
6417
|
});
|
|
6086
6418
|
}
|
|
6087
6419
|
if (SenseVoiceWorker.isSupported() && !isIOS()) {
|
|
6088
|
-
|
|
6420
|
+
logger9.info("Auto-detected: creating SenseVoiceWorker (off-main-thread)");
|
|
6089
6421
|
return new SenseVoiceWorker({
|
|
6090
6422
|
modelUrl: config.modelUrl,
|
|
6091
6423
|
tokensUrl: config.tokensUrl,
|
|
@@ -6093,7 +6425,7 @@ function createSenseVoice(config) {
|
|
|
6093
6425
|
textNorm: config.textNorm
|
|
6094
6426
|
});
|
|
6095
6427
|
}
|
|
6096
|
-
|
|
6428
|
+
logger9.info("Auto-detected: creating SenseVoiceInference (main thread)", {
|
|
6097
6429
|
reason: isIOS() ? "iOS (shared ORT instance)" : "Worker unsupported"
|
|
6098
6430
|
});
|
|
6099
6431
|
return new SenseVoiceInference({
|
|
@@ -6105,7 +6437,7 @@ function createSenseVoice(config) {
|
|
|
6105
6437
|
}
|
|
6106
6438
|
|
|
6107
6439
|
// src/inference/Wav2ArkitCpuInference.ts
|
|
6108
|
-
var
|
|
6440
|
+
var logger10 = createLogger("Wav2ArkitCpu");
|
|
6109
6441
|
var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
6110
6442
|
constructor(config) {
|
|
6111
6443
|
this.modelId = "wav2arkit_cpu";
|
|
@@ -6147,16 +6479,16 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6147
6479
|
});
|
|
6148
6480
|
try {
|
|
6149
6481
|
const preference = this.config.backend || "wasm";
|
|
6150
|
-
|
|
6482
|
+
logger10.info("Loading ONNX Runtime...", { preference });
|
|
6151
6483
|
const { ort, backend } = await getOnnxRuntimeForPreference(preference);
|
|
6152
6484
|
this.ort = ort;
|
|
6153
6485
|
this._backend = backend;
|
|
6154
|
-
|
|
6486
|
+
logger10.info("ONNX Runtime loaded", { backend: this._backend });
|
|
6155
6487
|
const modelUrl = this.config.modelUrl;
|
|
6156
6488
|
const dataUrl = this.config.externalDataUrl !== false ? this.config.externalDataUrl || `${modelUrl}.data` : null;
|
|
6157
6489
|
const sessionOptions = getSessionOptions(this._backend);
|
|
6158
6490
|
if (isIOS()) {
|
|
6159
|
-
|
|
6491
|
+
logger10.info("iOS: passing model URLs directly to ORT (low-memory path)", {
|
|
6160
6492
|
modelUrl,
|
|
6161
6493
|
dataUrl
|
|
6162
6494
|
});
|
|
@@ -6178,15 +6510,15 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6178
6510
|
const isCached = await cache.has(modelUrl);
|
|
6179
6511
|
let modelBuffer;
|
|
6180
6512
|
if (isCached) {
|
|
6181
|
-
|
|
6513
|
+
logger10.debug("Loading model from cache", { modelUrl });
|
|
6182
6514
|
modelBuffer = await cache.get(modelUrl);
|
|
6183
6515
|
if (!modelBuffer) {
|
|
6184
|
-
|
|
6516
|
+
logger10.warn("Cache corruption detected, clearing and retrying", { modelUrl });
|
|
6185
6517
|
await cache.delete(modelUrl);
|
|
6186
6518
|
modelBuffer = await fetchWithCache(modelUrl);
|
|
6187
6519
|
}
|
|
6188
6520
|
} else {
|
|
6189
|
-
|
|
6521
|
+
logger10.debug("Fetching and caching model graph", { modelUrl });
|
|
6190
6522
|
modelBuffer = await fetchWithCache(modelUrl);
|
|
6191
6523
|
}
|
|
6192
6524
|
if (!modelBuffer) {
|
|
@@ -6197,31 +6529,31 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6197
6529
|
try {
|
|
6198
6530
|
const isDataCached = await cache.has(dataUrl);
|
|
6199
6531
|
if (isDataCached) {
|
|
6200
|
-
|
|
6532
|
+
logger10.debug("Loading external data from cache", { dataUrl });
|
|
6201
6533
|
externalDataBuffer = await cache.get(dataUrl);
|
|
6202
6534
|
if (!externalDataBuffer) {
|
|
6203
|
-
|
|
6535
|
+
logger10.warn("Cache corruption for external data, retrying", { dataUrl });
|
|
6204
6536
|
await cache.delete(dataUrl);
|
|
6205
6537
|
externalDataBuffer = await fetchWithCache(dataUrl);
|
|
6206
6538
|
}
|
|
6207
6539
|
} else {
|
|
6208
|
-
|
|
6540
|
+
logger10.info("Fetching external model data", {
|
|
6209
6541
|
dataUrl,
|
|
6210
6542
|
note: "This may be a large download (400MB+)"
|
|
6211
6543
|
});
|
|
6212
6544
|
externalDataBuffer = await fetchWithCache(dataUrl);
|
|
6213
6545
|
}
|
|
6214
|
-
|
|
6546
|
+
logger10.info("External data loaded", {
|
|
6215
6547
|
size: formatBytes(externalDataBuffer.byteLength)
|
|
6216
6548
|
});
|
|
6217
6549
|
} catch (err) {
|
|
6218
|
-
|
|
6550
|
+
logger10.debug("No external data file found (single-file model)", {
|
|
6219
6551
|
dataUrl,
|
|
6220
6552
|
error: err.message
|
|
6221
6553
|
});
|
|
6222
6554
|
}
|
|
6223
6555
|
}
|
|
6224
|
-
|
|
6556
|
+
logger10.debug("Creating ONNX session", {
|
|
6225
6557
|
graphSize: formatBytes(modelBuffer.byteLength),
|
|
6226
6558
|
externalDataSize: externalDataBuffer ? formatBytes(externalDataBuffer.byteLength) : "none",
|
|
6227
6559
|
backend: this._backend
|
|
@@ -6237,7 +6569,7 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6237
6569
|
this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
|
|
6238
6570
|
}
|
|
6239
6571
|
const loadTimeMs = performance.now() - startTime;
|
|
6240
|
-
|
|
6572
|
+
logger10.info("Model loaded successfully", {
|
|
6241
6573
|
backend: this._backend,
|
|
6242
6574
|
loadTimeMs: Math.round(loadTimeMs),
|
|
6243
6575
|
inputs: this.session.inputNames,
|
|
@@ -6253,12 +6585,12 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6253
6585
|
model: "wav2arkit_cpu",
|
|
6254
6586
|
backend: this._backend
|
|
6255
6587
|
});
|
|
6256
|
-
|
|
6588
|
+
logger10.debug("Running warmup inference");
|
|
6257
6589
|
const warmupStart = performance.now();
|
|
6258
6590
|
const silentAudio = new Float32Array(16e3);
|
|
6259
6591
|
await this.infer(silentAudio);
|
|
6260
6592
|
const warmupTimeMs = performance.now() - warmupStart;
|
|
6261
|
-
|
|
6593
|
+
logger10.info("Warmup inference complete", {
|
|
6262
6594
|
warmupTimeMs: Math.round(warmupTimeMs),
|
|
6263
6595
|
backend: this._backend
|
|
6264
6596
|
});
|
|
@@ -6345,7 +6677,7 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6345
6677
|
const symmetrized = symmetrizeBlendshapes(rawFrame);
|
|
6346
6678
|
blendshapes.push(symmetrized);
|
|
6347
6679
|
}
|
|
6348
|
-
|
|
6680
|
+
logger10.trace("Inference completed", {
|
|
6349
6681
|
inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
|
|
6350
6682
|
numFrames,
|
|
6351
6683
|
inputSamples
|
|
@@ -6373,7 +6705,7 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6373
6705
|
const errMsg = err instanceof Error ? err.message : String(err);
|
|
6374
6706
|
if (errMsg.includes("timed out")) {
|
|
6375
6707
|
this.poisoned = true;
|
|
6376
|
-
|
|
6708
|
+
logger10.error("CRITICAL: Inference session timed out \u2014 Wav2ArkitCpu is dead. Page reload required.", {
|
|
6377
6709
|
backend: this._backend,
|
|
6378
6710
|
timeoutMs: _Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS
|
|
6379
6711
|
});
|
|
@@ -6381,7 +6713,7 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6381
6713
|
const oomError = new Error(
|
|
6382
6714
|
`Wav2ArkitCpu inference failed with raw C++ exception pointer (0x${err.toString(16)}). This is likely an OOM crash in WASM. Try reloading the page.`
|
|
6383
6715
|
);
|
|
6384
|
-
|
|
6716
|
+
logger10.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
|
|
6385
6717
|
pointer: `0x${err.toString(16)}`,
|
|
6386
6718
|
backend: this._backend
|
|
6387
6719
|
});
|
|
@@ -6394,7 +6726,7 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6394
6726
|
reject(oomError);
|
|
6395
6727
|
return;
|
|
6396
6728
|
} else {
|
|
6397
|
-
|
|
6729
|
+
logger10.error("Inference failed", { error: errMsg, backend: this._backend });
|
|
6398
6730
|
}
|
|
6399
6731
|
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
6400
6732
|
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
@@ -6421,9 +6753,9 @@ _Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS = 5e3;
|
|
|
6421
6753
|
var Wav2ArkitCpuInference = _Wav2ArkitCpuInference;
|
|
6422
6754
|
|
|
6423
6755
|
// src/inference/Wav2ArkitCpuWorker.ts
|
|
6424
|
-
var
|
|
6756
|
+
var logger11 = createLogger("Wav2ArkitCpuWorker");
|
|
6425
6757
|
var WASM_CDN_PATH4 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
|
|
6426
|
-
var LOAD_TIMEOUT_MS2 =
|
|
6758
|
+
var LOAD_TIMEOUT_MS2 = 42e4;
|
|
6427
6759
|
var INFERENCE_TIMEOUT_MS2 = 5e3;
|
|
6428
6760
|
function resolveUrl3(url) {
|
|
6429
6761
|
if (/^https?:\/\//i.test(url) || /^blob:/i.test(url)) return url;
|
|
@@ -6708,7 +7040,7 @@ var Wav2ArkitCpuWorker = class {
|
|
|
6708
7040
|
this.handleWorkerMessage(event.data);
|
|
6709
7041
|
};
|
|
6710
7042
|
worker.onerror = (error) => {
|
|
6711
|
-
|
|
7043
|
+
logger11.error("Worker error", { error: error.message });
|
|
6712
7044
|
for (const [, resolver] of this.pendingResolvers) {
|
|
6713
7045
|
resolver.reject(new Error(`Worker error: ${error.message}`));
|
|
6714
7046
|
}
|
|
@@ -6784,10 +7116,10 @@ var Wav2ArkitCpuWorker = class {
|
|
|
6784
7116
|
"model.backend_requested": "wasm"
|
|
6785
7117
|
});
|
|
6786
7118
|
try {
|
|
6787
|
-
|
|
7119
|
+
logger11.info("Creating wav2arkit_cpu worker...");
|
|
6788
7120
|
this.worker = this.createWorker();
|
|
6789
7121
|
const externalDataUrl = this.config.externalDataUrl !== false ? this.config.externalDataUrl || `${this.config.modelUrl}.data` : null;
|
|
6790
|
-
|
|
7122
|
+
logger11.info("Loading model in worker...", {
|
|
6791
7123
|
modelUrl: this.config.modelUrl,
|
|
6792
7124
|
externalDataUrl,
|
|
6793
7125
|
isIOS: isIOS()
|
|
@@ -6805,7 +7137,7 @@ var Wav2ArkitCpuWorker = class {
|
|
|
6805
7137
|
);
|
|
6806
7138
|
this._isLoaded = true;
|
|
6807
7139
|
const loadTimeMs = performance.now() - startTime;
|
|
6808
|
-
|
|
7140
|
+
logger11.info("Wav2ArkitCpu worker loaded successfully", {
|
|
6809
7141
|
backend: "wasm",
|
|
6810
7142
|
loadTimeMs: Math.round(loadTimeMs),
|
|
6811
7143
|
workerLoadTimeMs: Math.round(result.loadTimeMs),
|
|
@@ -6890,7 +7222,7 @@ var Wav2ArkitCpuWorker = class {
|
|
|
6890
7222
|
for (let f = 0; f < numFrames; f++) {
|
|
6891
7223
|
blendshapes.push(flatBuffer.slice(f * numBlendshapes, (f + 1) * numBlendshapes));
|
|
6892
7224
|
}
|
|
6893
|
-
|
|
7225
|
+
logger11.trace("Worker inference completed", {
|
|
6894
7226
|
inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
|
|
6895
7227
|
workerTimeMs: Math.round(result.inferenceTimeMs * 100) / 100,
|
|
6896
7228
|
numFrames,
|
|
@@ -6920,12 +7252,12 @@ var Wav2ArkitCpuWorker = class {
|
|
|
6920
7252
|
const errMsg = err instanceof Error ? err.message : String(err);
|
|
6921
7253
|
if (errMsg.includes("timed out")) {
|
|
6922
7254
|
this.poisoned = true;
|
|
6923
|
-
|
|
7255
|
+
logger11.error("CRITICAL: Worker inference timed out \u2014 Wav2ArkitCpu worker is dead. Page reload required.", {
|
|
6924
7256
|
backend: "wasm",
|
|
6925
7257
|
timeoutMs: INFERENCE_TIMEOUT_MS2
|
|
6926
7258
|
});
|
|
6927
7259
|
} else {
|
|
6928
|
-
|
|
7260
|
+
logger11.error("Worker inference failed", { error: errMsg, backend: "wasm" });
|
|
6929
7261
|
}
|
|
6930
7262
|
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
6931
7263
|
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
@@ -6963,38 +7295,38 @@ var Wav2ArkitCpuWorker = class {
|
|
|
6963
7295
|
};
|
|
6964
7296
|
|
|
6965
7297
|
// src/inference/createA2E.ts
|
|
6966
|
-
var
|
|
7298
|
+
var logger12 = createLogger("createA2E");
|
|
6967
7299
|
function createA2E(config) {
|
|
6968
7300
|
const mode = config.mode ?? "auto";
|
|
6969
7301
|
const fallbackOnError = config.fallbackOnError ?? true;
|
|
6970
7302
|
let useCpu;
|
|
6971
7303
|
if (mode === "cpu") {
|
|
6972
7304
|
useCpu = true;
|
|
6973
|
-
|
|
7305
|
+
logger12.info("Forcing CPU A2E model (wav2arkit_cpu)");
|
|
6974
7306
|
} else if (mode === "gpu") {
|
|
6975
7307
|
useCpu = false;
|
|
6976
|
-
|
|
7308
|
+
logger12.info("Forcing GPU A2E model (Wav2Vec2)");
|
|
6977
7309
|
} else {
|
|
6978
7310
|
useCpu = shouldUseCpuA2E();
|
|
6979
|
-
|
|
7311
|
+
logger12.info("Auto-detected A2E model", {
|
|
6980
7312
|
useCpu,
|
|
6981
7313
|
isSafari: isSafari()
|
|
6982
7314
|
});
|
|
6983
7315
|
}
|
|
6984
7316
|
if (useCpu) {
|
|
6985
7317
|
if (config.unifiedWorker) {
|
|
6986
|
-
|
|
7318
|
+
logger12.info("Creating Wav2ArkitCpuUnifiedAdapter (404MB, WASM, shared unified worker)");
|
|
6987
7319
|
return new Wav2ArkitCpuUnifiedAdapter(config.unifiedWorker, {
|
|
6988
7320
|
modelUrl: config.cpuModelUrl
|
|
6989
7321
|
});
|
|
6990
7322
|
}
|
|
6991
7323
|
if (config.useWorker && Wav2ArkitCpuWorker.isSupported() && !isIOS()) {
|
|
6992
|
-
|
|
7324
|
+
logger12.info("Creating Wav2ArkitCpuWorker (404MB, WASM, off-main-thread)");
|
|
6993
7325
|
return new Wav2ArkitCpuWorker({
|
|
6994
7326
|
modelUrl: config.cpuModelUrl
|
|
6995
7327
|
});
|
|
6996
7328
|
}
|
|
6997
|
-
|
|
7329
|
+
logger12.info("Creating Wav2ArkitCpuInference (404MB, WASM)");
|
|
6998
7330
|
return new Wav2ArkitCpuInference({
|
|
6999
7331
|
modelUrl: config.cpuModelUrl
|
|
7000
7332
|
});
|
|
@@ -7006,10 +7338,10 @@ function createA2E(config) {
|
|
|
7006
7338
|
numIdentityClasses: config.numIdentityClasses
|
|
7007
7339
|
});
|
|
7008
7340
|
if (fallbackOnError) {
|
|
7009
|
-
|
|
7341
|
+
logger12.info("Creating Wav2Vec2Inference with CPU fallback");
|
|
7010
7342
|
return new A2EWithFallback(gpuInstance, config);
|
|
7011
7343
|
}
|
|
7012
|
-
|
|
7344
|
+
logger12.info("Creating Wav2Vec2Inference (no fallback)");
|
|
7013
7345
|
return gpuInstance;
|
|
7014
7346
|
}
|
|
7015
7347
|
var A2EWithFallback = class {
|
|
@@ -7038,7 +7370,7 @@ var A2EWithFallback = class {
|
|
|
7038
7370
|
}
|
|
7039
7371
|
}
|
|
7040
7372
|
async fallbackToCpu(reason) {
|
|
7041
|
-
|
|
7373
|
+
logger12.warn("GPU model load failed, falling back to CPU model", { reason });
|
|
7042
7374
|
try {
|
|
7043
7375
|
await this.implementation.dispose();
|
|
7044
7376
|
} catch {
|
|
@@ -7047,17 +7379,17 @@ var A2EWithFallback = class {
|
|
|
7047
7379
|
this.implementation = new Wav2ArkitCpuUnifiedAdapter(this.config.unifiedWorker, {
|
|
7048
7380
|
modelUrl: this.config.cpuModelUrl
|
|
7049
7381
|
});
|
|
7050
|
-
|
|
7382
|
+
logger12.info("Fallback to Wav2ArkitCpuUnifiedAdapter successful");
|
|
7051
7383
|
} else if (this.config.useWorker && Wav2ArkitCpuWorker.isSupported() && !isIOS()) {
|
|
7052
7384
|
this.implementation = new Wav2ArkitCpuWorker({
|
|
7053
7385
|
modelUrl: this.config.cpuModelUrl
|
|
7054
7386
|
});
|
|
7055
|
-
|
|
7387
|
+
logger12.info("Fallback to Wav2ArkitCpuWorker successful");
|
|
7056
7388
|
} else {
|
|
7057
7389
|
this.implementation = new Wav2ArkitCpuInference({
|
|
7058
7390
|
modelUrl: this.config.cpuModelUrl
|
|
7059
7391
|
});
|
|
7060
|
-
|
|
7392
|
+
logger12.info("Fallback to Wav2ArkitCpuInference successful");
|
|
7061
7393
|
}
|
|
7062
7394
|
this.hasFallenBack = true;
|
|
7063
7395
|
return await this.implementation.load();
|
|
@@ -7261,7 +7593,7 @@ var EmphasisDetector = class {
|
|
|
7261
7593
|
};
|
|
7262
7594
|
|
|
7263
7595
|
// src/inference/SileroVADInference.ts
|
|
7264
|
-
var
|
|
7596
|
+
var logger13 = createLogger("SileroVAD");
|
|
7265
7597
|
var SileroVADInference = class {
|
|
7266
7598
|
constructor(config) {
|
|
7267
7599
|
this.session = null;
|
|
@@ -7335,23 +7667,23 @@ var SileroVADInference = class {
|
|
|
7335
7667
|
"model.sample_rate": this.config.sampleRate
|
|
7336
7668
|
});
|
|
7337
7669
|
try {
|
|
7338
|
-
|
|
7670
|
+
logger13.info("Loading ONNX Runtime...", { preference: this.config.backend });
|
|
7339
7671
|
const { ort, backend } = await getOnnxRuntimeForPreference(this.config.backend);
|
|
7340
7672
|
this.ort = ort;
|
|
7341
7673
|
this._backend = backend;
|
|
7342
|
-
|
|
7674
|
+
logger13.info("ONNX Runtime loaded", { backend: this._backend });
|
|
7343
7675
|
const cache = getModelCache();
|
|
7344
7676
|
const modelUrl = this.config.modelUrl;
|
|
7345
7677
|
const isCached = await cache.has(modelUrl);
|
|
7346
7678
|
let modelBuffer;
|
|
7347
7679
|
if (isCached) {
|
|
7348
|
-
|
|
7680
|
+
logger13.debug("Loading model from cache", { modelUrl });
|
|
7349
7681
|
modelBuffer = await cache.get(modelUrl);
|
|
7350
7682
|
} else {
|
|
7351
|
-
|
|
7683
|
+
logger13.debug("Fetching and caching model", { modelUrl });
|
|
7352
7684
|
modelBuffer = await fetchWithCache(modelUrl);
|
|
7353
7685
|
}
|
|
7354
|
-
|
|
7686
|
+
logger13.debug("Creating ONNX session", {
|
|
7355
7687
|
size: formatBytes(modelBuffer.byteLength),
|
|
7356
7688
|
backend: this._backend
|
|
7357
7689
|
});
|
|
@@ -7360,7 +7692,7 @@ var SileroVADInference = class {
|
|
|
7360
7692
|
this.session = await ort.InferenceSession.create(modelData, sessionOptions);
|
|
7361
7693
|
this.reset();
|
|
7362
7694
|
const loadTimeMs = performance.now() - startTime;
|
|
7363
|
-
|
|
7695
|
+
logger13.info("Model loaded successfully", {
|
|
7364
7696
|
backend: this._backend,
|
|
7365
7697
|
loadTimeMs: Math.round(loadTimeMs),
|
|
7366
7698
|
sampleRate: this.config.sampleRate,
|
|
@@ -7415,7 +7747,7 @@ var SileroVADInference = class {
|
|
|
7415
7747
|
[]
|
|
7416
7748
|
);
|
|
7417
7749
|
} catch (e) {
|
|
7418
|
-
|
|
7750
|
+
logger13.warn("BigInt64Array not available, using bigint array fallback", {
|
|
7419
7751
|
error: e instanceof Error ? e.message : String(e)
|
|
7420
7752
|
});
|
|
7421
7753
|
this.srTensor = new this.ort.Tensor(
|
|
@@ -7521,7 +7853,7 @@ var SileroVADInference = class {
|
|
|
7521
7853
|
this.preSpeechBuffer.shift();
|
|
7522
7854
|
}
|
|
7523
7855
|
}
|
|
7524
|
-
|
|
7856
|
+
logger13.trace("Skipping VAD inference - audio too quiet", {
|
|
7525
7857
|
rms: Math.round(rms * 1e4) / 1e4,
|
|
7526
7858
|
threshold: MIN_ENERGY_THRESHOLD
|
|
7527
7859
|
});
|
|
@@ -7575,7 +7907,7 @@ var SileroVADInference = class {
|
|
|
7575
7907
|
if (isSpeech && !this.wasSpeaking) {
|
|
7576
7908
|
preSpeechChunks = [...this.preSpeechBuffer];
|
|
7577
7909
|
this.preSpeechBuffer = [];
|
|
7578
|
-
|
|
7910
|
+
logger13.debug("Speech started with pre-speech buffer", {
|
|
7579
7911
|
preSpeechChunks: preSpeechChunks.length,
|
|
7580
7912
|
durationMs: Math.round(preSpeechChunks.length * this.getChunkDurationMs())
|
|
7581
7913
|
});
|
|
@@ -7588,7 +7920,7 @@ var SileroVADInference = class {
|
|
|
7588
7920
|
this.preSpeechBuffer = [];
|
|
7589
7921
|
}
|
|
7590
7922
|
this.wasSpeaking = isSpeech;
|
|
7591
|
-
|
|
7923
|
+
logger13.trace("VAD inference completed", {
|
|
7592
7924
|
probability: Math.round(probability * 1e3) / 1e3,
|
|
7593
7925
|
isSpeech,
|
|
7594
7926
|
inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100
|
|
@@ -7619,7 +7951,7 @@ var SileroVADInference = class {
|
|
|
7619
7951
|
const oomError = new Error(
|
|
7620
7952
|
`SileroVAD inference failed with raw C++ exception pointer (0x${err.toString(16)}). This is likely an OOM crash in WASM. Try reducing concurrent model sessions or reloading the page.`
|
|
7621
7953
|
);
|
|
7622
|
-
|
|
7954
|
+
logger13.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
|
|
7623
7955
|
pointer: `0x${err.toString(16)}`,
|
|
7624
7956
|
backend: this._backend
|
|
7625
7957
|
});
|
|
@@ -7662,9 +7994,9 @@ var SileroVADInference = class {
|
|
|
7662
7994
|
SileroVADInference.isWebGPUAvailable = isWebGPUAvailable;
|
|
7663
7995
|
|
|
7664
7996
|
// src/inference/SileroVADWorker.ts
|
|
7665
|
-
var
|
|
7997
|
+
var logger14 = createLogger("SileroVADWorker");
|
|
7666
7998
|
var WASM_CDN_PATH5 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
|
|
7667
|
-
var LOAD_TIMEOUT_MS3 =
|
|
7999
|
+
var LOAD_TIMEOUT_MS3 = 12e4;
|
|
7668
8000
|
var INFERENCE_TIMEOUT_MS3 = 1e3;
|
|
7669
8001
|
function resolveUrl4(url) {
|
|
7670
8002
|
if (/^https?:\/\//i.test(url) || /^blob:/i.test(url)) return url;
|
|
@@ -7947,7 +8279,7 @@ var SileroVADWorker = class {
|
|
|
7947
8279
|
this.handleWorkerMessage(event.data);
|
|
7948
8280
|
};
|
|
7949
8281
|
worker.onerror = (error) => {
|
|
7950
|
-
|
|
8282
|
+
logger14.error("Worker error", { error: error.message });
|
|
7951
8283
|
for (const [, resolver] of this.pendingResolvers) {
|
|
7952
8284
|
resolver.reject(new Error(`Worker error: ${error.message}`));
|
|
7953
8285
|
}
|
|
@@ -8023,9 +8355,9 @@ var SileroVADWorker = class {
|
|
|
8023
8355
|
"model.sample_rate": this.config.sampleRate
|
|
8024
8356
|
});
|
|
8025
8357
|
try {
|
|
8026
|
-
|
|
8358
|
+
logger14.info("Creating VAD worker...");
|
|
8027
8359
|
this.worker = this.createWorker();
|
|
8028
|
-
|
|
8360
|
+
logger14.info("Loading model in worker...", {
|
|
8029
8361
|
modelUrl: this.config.modelUrl,
|
|
8030
8362
|
sampleRate: this.config.sampleRate
|
|
8031
8363
|
});
|
|
@@ -8041,7 +8373,7 @@ var SileroVADWorker = class {
|
|
|
8041
8373
|
);
|
|
8042
8374
|
this._isLoaded = true;
|
|
8043
8375
|
const loadTimeMs = performance.now() - startTime;
|
|
8044
|
-
|
|
8376
|
+
logger14.info("VAD worker loaded successfully", {
|
|
8045
8377
|
backend: "wasm",
|
|
8046
8378
|
loadTimeMs: Math.round(loadTimeMs),
|
|
8047
8379
|
workerLoadTimeMs: Math.round(result.loadTimeMs),
|
|
@@ -8148,7 +8480,7 @@ var SileroVADWorker = class {
|
|
|
8148
8480
|
if (isSpeech && !this.wasSpeaking) {
|
|
8149
8481
|
preSpeechChunks = [...this.preSpeechBuffer];
|
|
8150
8482
|
this.preSpeechBuffer = [];
|
|
8151
|
-
|
|
8483
|
+
logger14.debug("Speech started with pre-speech buffer", {
|
|
8152
8484
|
preSpeechChunks: preSpeechChunks.length,
|
|
8153
8485
|
durationMs: Math.round(preSpeechChunks.length * this.getChunkDurationMs())
|
|
8154
8486
|
});
|
|
@@ -8161,7 +8493,7 @@ var SileroVADWorker = class {
|
|
|
8161
8493
|
this.preSpeechBuffer = [];
|
|
8162
8494
|
}
|
|
8163
8495
|
this.wasSpeaking = isSpeech;
|
|
8164
|
-
|
|
8496
|
+
logger14.trace("VAD worker inference completed", {
|
|
8165
8497
|
probability: Math.round(result.probability * 1e3) / 1e3,
|
|
8166
8498
|
isSpeech,
|
|
8167
8499
|
inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
|
|
@@ -8229,44 +8561,44 @@ var SileroVADWorker = class {
|
|
|
8229
8561
|
};
|
|
8230
8562
|
|
|
8231
8563
|
// src/inference/createSileroVAD.ts
|
|
8232
|
-
var
|
|
8564
|
+
var logger15 = createLogger("createSileroVAD");
|
|
8233
8565
|
function supportsVADWorker() {
|
|
8234
8566
|
if (typeof Worker === "undefined") {
|
|
8235
|
-
|
|
8567
|
+
logger15.debug("Worker not supported: Worker constructor undefined");
|
|
8236
8568
|
return false;
|
|
8237
8569
|
}
|
|
8238
8570
|
if (typeof URL === "undefined" || typeof URL.createObjectURL === "undefined") {
|
|
8239
|
-
|
|
8571
|
+
logger15.debug("Worker not supported: URL.createObjectURL unavailable");
|
|
8240
8572
|
return false;
|
|
8241
8573
|
}
|
|
8242
8574
|
if (typeof Blob === "undefined") {
|
|
8243
|
-
|
|
8575
|
+
logger15.debug("Worker not supported: Blob constructor unavailable");
|
|
8244
8576
|
return false;
|
|
8245
8577
|
}
|
|
8246
8578
|
return true;
|
|
8247
8579
|
}
|
|
8248
8580
|
function createSileroVAD(config) {
|
|
8249
8581
|
if (config.unifiedWorker) {
|
|
8250
|
-
|
|
8582
|
+
logger15.info("Creating SileroVADUnifiedAdapter (shared unified worker)");
|
|
8251
8583
|
return new SileroVADUnifiedAdapter(config.unifiedWorker, config);
|
|
8252
8584
|
}
|
|
8253
8585
|
const fallbackOnError = config.fallbackOnError ?? true;
|
|
8254
8586
|
let useWorker;
|
|
8255
8587
|
if (config.useWorker !== void 0) {
|
|
8256
8588
|
useWorker = config.useWorker;
|
|
8257
|
-
|
|
8589
|
+
logger15.debug("Worker preference explicitly set", { useWorker });
|
|
8258
8590
|
} else {
|
|
8259
8591
|
const workerSupported = supportsVADWorker();
|
|
8260
8592
|
const onMobile = isMobile();
|
|
8261
8593
|
useWorker = workerSupported && !onMobile;
|
|
8262
|
-
|
|
8594
|
+
logger15.debug("Auto-detected Worker preference", {
|
|
8263
8595
|
useWorker,
|
|
8264
8596
|
workerSupported,
|
|
8265
8597
|
onMobile
|
|
8266
8598
|
});
|
|
8267
8599
|
}
|
|
8268
8600
|
if (useWorker) {
|
|
8269
|
-
|
|
8601
|
+
logger15.info("Creating SileroVADWorker (off-main-thread)");
|
|
8270
8602
|
const worker = new SileroVADWorker({
|
|
8271
8603
|
modelUrl: config.modelUrl,
|
|
8272
8604
|
sampleRate: config.sampleRate,
|
|
@@ -8278,7 +8610,7 @@ function createSileroVAD(config) {
|
|
|
8278
8610
|
}
|
|
8279
8611
|
return worker;
|
|
8280
8612
|
}
|
|
8281
|
-
|
|
8613
|
+
logger15.info("Creating SileroVADInference (main thread)");
|
|
8282
8614
|
return new SileroVADInference(config);
|
|
8283
8615
|
}
|
|
8284
8616
|
var VADWorkerWithFallback = class {
|
|
@@ -8304,7 +8636,7 @@ var VADWorkerWithFallback = class {
|
|
|
8304
8636
|
try {
|
|
8305
8637
|
return await this.implementation.load();
|
|
8306
8638
|
} catch (error) {
|
|
8307
|
-
|
|
8639
|
+
logger15.warn("Worker load failed, falling back to main thread", {
|
|
8308
8640
|
error: error instanceof Error ? error.message : String(error)
|
|
8309
8641
|
});
|
|
8310
8642
|
try {
|
|
@@ -8313,7 +8645,7 @@ var VADWorkerWithFallback = class {
|
|
|
8313
8645
|
}
|
|
8314
8646
|
this.implementation = new SileroVADInference(this.config);
|
|
8315
8647
|
this.hasFallenBack = true;
|
|
8316
|
-
|
|
8648
|
+
logger15.info("Fallback to SileroVADInference successful");
|
|
8317
8649
|
return await this.implementation.load();
|
|
8318
8650
|
}
|
|
8319
8651
|
}
|
|
@@ -8335,7 +8667,7 @@ var VADWorkerWithFallback = class {
|
|
|
8335
8667
|
};
|
|
8336
8668
|
|
|
8337
8669
|
// src/inference/A2EOrchestrator.ts
|
|
8338
|
-
var
|
|
8670
|
+
var logger16 = createLogger("A2EOrchestrator");
|
|
8339
8671
|
var A2EOrchestrator = class {
|
|
8340
8672
|
constructor(config) {
|
|
8341
8673
|
this.a2e = null;
|
|
@@ -8376,7 +8708,7 @@ var A2EOrchestrator = class {
|
|
|
8376
8708
|
*/
|
|
8377
8709
|
async load() {
|
|
8378
8710
|
if (this.disposed) throw new Error("A2EOrchestrator has been disposed");
|
|
8379
|
-
|
|
8711
|
+
logger16.info("Loading A2E model...");
|
|
8380
8712
|
this.a2e = createA2E({
|
|
8381
8713
|
gpuModelUrl: this.config.gpuModelUrl,
|
|
8382
8714
|
gpuExternalDataUrl: this.config.gpuExternalDataUrl,
|
|
@@ -8393,7 +8725,7 @@ var A2EOrchestrator = class {
|
|
|
8393
8725
|
onError: this.config.onError
|
|
8394
8726
|
});
|
|
8395
8727
|
this._isReady = true;
|
|
8396
|
-
|
|
8728
|
+
logger16.info("A2E model loaded", {
|
|
8397
8729
|
backend: info.backend,
|
|
8398
8730
|
loadTimeMs: info.loadTimeMs,
|
|
8399
8731
|
modelId: this.a2e.modelId
|
|
@@ -8448,10 +8780,10 @@ var A2EOrchestrator = class {
|
|
|
8448
8780
|
this.scriptProcessor.connect(this.audioContext.destination);
|
|
8449
8781
|
this._isStreaming = true;
|
|
8450
8782
|
this.processor.startDrip();
|
|
8451
|
-
|
|
8783
|
+
logger16.info("Mic capture started", { sampleRate: this.nativeSampleRate });
|
|
8452
8784
|
} catch (err) {
|
|
8453
8785
|
const error = err instanceof Error ? err : new Error(String(err));
|
|
8454
|
-
|
|
8786
|
+
logger16.error("Failed to start mic capture", { error: error.message });
|
|
8455
8787
|
this.config.onError?.(error);
|
|
8456
8788
|
throw error;
|
|
8457
8789
|
}
|
|
@@ -8479,7 +8811,7 @@ var A2EOrchestrator = class {
|
|
|
8479
8811
|
});
|
|
8480
8812
|
this.audioContext = null;
|
|
8481
8813
|
}
|
|
8482
|
-
|
|
8814
|
+
logger16.info("Mic capture stopped");
|
|
8483
8815
|
}
|
|
8484
8816
|
/**
|
|
8485
8817
|
* Dispose of all resources
|
|
@@ -8502,7 +8834,7 @@ var A2EOrchestrator = class {
|
|
|
8502
8834
|
};
|
|
8503
8835
|
|
|
8504
8836
|
// src/inference/SafariSpeechRecognition.ts
|
|
8505
|
-
var
|
|
8837
|
+
var logger17 = createLogger("SafariSpeech");
|
|
8506
8838
|
var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
8507
8839
|
constructor(config = {}) {
|
|
8508
8840
|
this.recognition = null;
|
|
@@ -8521,7 +8853,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
8521
8853
|
interimResults: config.interimResults ?? true,
|
|
8522
8854
|
maxAlternatives: config.maxAlternatives ?? 1
|
|
8523
8855
|
};
|
|
8524
|
-
|
|
8856
|
+
logger17.debug("SafariSpeechRecognition created", {
|
|
8525
8857
|
language: this.config.language,
|
|
8526
8858
|
continuous: this.config.continuous
|
|
8527
8859
|
});
|
|
@@ -8582,7 +8914,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
8582
8914
|
*/
|
|
8583
8915
|
async start() {
|
|
8584
8916
|
if (this.isListening) {
|
|
8585
|
-
|
|
8917
|
+
logger17.warn("Already listening");
|
|
8586
8918
|
return;
|
|
8587
8919
|
}
|
|
8588
8920
|
if (!_SafariSpeechRecognition.isAvailable()) {
|
|
@@ -8612,7 +8944,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
8612
8944
|
this.isListening = true;
|
|
8613
8945
|
this.startTime = performance.now();
|
|
8614
8946
|
this.accumulatedText = "";
|
|
8615
|
-
|
|
8947
|
+
logger17.info("Speech recognition started", {
|
|
8616
8948
|
language: this.config.language
|
|
8617
8949
|
});
|
|
8618
8950
|
span?.end();
|
|
@@ -8627,7 +8959,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
8627
8959
|
*/
|
|
8628
8960
|
async stop() {
|
|
8629
8961
|
if (!this.isListening || !this.recognition) {
|
|
8630
|
-
|
|
8962
|
+
logger17.warn("Not currently listening");
|
|
8631
8963
|
return {
|
|
8632
8964
|
text: this.accumulatedText,
|
|
8633
8965
|
language: this.config.language,
|
|
@@ -8656,7 +8988,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
8656
8988
|
if (this.recognition && this.isListening) {
|
|
8657
8989
|
this.recognition.abort();
|
|
8658
8990
|
this.isListening = false;
|
|
8659
|
-
|
|
8991
|
+
logger17.info("Speech recognition aborted");
|
|
8660
8992
|
}
|
|
8661
8993
|
}
|
|
8662
8994
|
/**
|
|
@@ -8687,7 +9019,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
8687
9019
|
this.isListening = false;
|
|
8688
9020
|
this.resultCallbacks = [];
|
|
8689
9021
|
this.errorCallbacks = [];
|
|
8690
|
-
|
|
9022
|
+
logger17.debug("SafariSpeechRecognition disposed");
|
|
8691
9023
|
}
|
|
8692
9024
|
/**
|
|
8693
9025
|
* Set up event handlers for the recognition instance
|
|
@@ -8715,7 +9047,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
8715
9047
|
confidence: alternative.confidence
|
|
8716
9048
|
};
|
|
8717
9049
|
this.emitResult(speechResult);
|
|
8718
|
-
|
|
9050
|
+
logger17.trace("Speech result", {
|
|
8719
9051
|
text: text.substring(0, 50),
|
|
8720
9052
|
isFinal,
|
|
8721
9053
|
confidence: alternative.confidence
|
|
@@ -8725,12 +9057,12 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
8725
9057
|
span?.end();
|
|
8726
9058
|
} catch (error) {
|
|
8727
9059
|
span?.endWithError(error instanceof Error ? error : new Error(String(error)));
|
|
8728
|
-
|
|
9060
|
+
logger17.error("Error processing speech result", { error });
|
|
8729
9061
|
}
|
|
8730
9062
|
};
|
|
8731
9063
|
this.recognition.onerror = (event) => {
|
|
8732
9064
|
const error = new Error(`Speech recognition error: ${event.error} - ${event.message}`);
|
|
8733
|
-
|
|
9065
|
+
logger17.error("Speech recognition error", { error: event.error, message: event.message });
|
|
8734
9066
|
this.emitError(error);
|
|
8735
9067
|
if (this.stopRejecter) {
|
|
8736
9068
|
this.stopRejecter(error);
|
|
@@ -8740,7 +9072,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
8740
9072
|
};
|
|
8741
9073
|
this.recognition.onend = () => {
|
|
8742
9074
|
this.isListening = false;
|
|
8743
|
-
|
|
9075
|
+
logger17.info("Speech recognition ended", {
|
|
8744
9076
|
totalText: this.accumulatedText.length,
|
|
8745
9077
|
durationMs: performance.now() - this.startTime
|
|
8746
9078
|
});
|
|
@@ -8757,13 +9089,13 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
8757
9089
|
}
|
|
8758
9090
|
};
|
|
8759
9091
|
this.recognition.onstart = () => {
|
|
8760
|
-
|
|
9092
|
+
logger17.debug("Speech recognition started by browser");
|
|
8761
9093
|
};
|
|
8762
9094
|
this.recognition.onspeechstart = () => {
|
|
8763
|
-
|
|
9095
|
+
logger17.debug("Speech detected");
|
|
8764
9096
|
};
|
|
8765
9097
|
this.recognition.onspeechend = () => {
|
|
8766
|
-
|
|
9098
|
+
logger17.debug("Speech ended");
|
|
8767
9099
|
};
|
|
8768
9100
|
}
|
|
8769
9101
|
/**
|
|
@@ -8774,7 +9106,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
8774
9106
|
try {
|
|
8775
9107
|
callback(result);
|
|
8776
9108
|
} catch (error) {
|
|
8777
|
-
|
|
9109
|
+
logger17.error("Error in result callback", { error });
|
|
8778
9110
|
}
|
|
8779
9111
|
}
|
|
8780
9112
|
}
|
|
@@ -8786,7 +9118,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
8786
9118
|
try {
|
|
8787
9119
|
callback(error);
|
|
8788
9120
|
} catch (callbackError) {
|
|
8789
|
-
|
|
9121
|
+
logger17.error("Error in error callback", { error: callbackError });
|
|
8790
9122
|
}
|
|
8791
9123
|
}
|
|
8792
9124
|
}
|
|
@@ -9356,327 +9688,9 @@ var AnimationGraph = class extends EventEmitter {
|
|
|
9356
9688
|
}
|
|
9357
9689
|
};
|
|
9358
9690
|
|
|
9359
|
-
// src/animation/simplex2d.ts
|
|
9360
|
-
var perm = new Uint8Array(512);
|
|
9361
|
-
var grad2 = [
|
|
9362
|
-
[1, 1],
|
|
9363
|
-
[-1, 1],
|
|
9364
|
-
[1, -1],
|
|
9365
|
-
[-1, -1],
|
|
9366
|
-
[1, 0],
|
|
9367
|
-
[-1, 0],
|
|
9368
|
-
[0, 1],
|
|
9369
|
-
[0, -1]
|
|
9370
|
-
];
|
|
9371
|
-
var p = [
|
|
9372
|
-
151,
|
|
9373
|
-
160,
|
|
9374
|
-
137,
|
|
9375
|
-
91,
|
|
9376
|
-
90,
|
|
9377
|
-
15,
|
|
9378
|
-
131,
|
|
9379
|
-
13,
|
|
9380
|
-
201,
|
|
9381
|
-
95,
|
|
9382
|
-
96,
|
|
9383
|
-
53,
|
|
9384
|
-
194,
|
|
9385
|
-
233,
|
|
9386
|
-
7,
|
|
9387
|
-
225,
|
|
9388
|
-
140,
|
|
9389
|
-
36,
|
|
9390
|
-
103,
|
|
9391
|
-
30,
|
|
9392
|
-
69,
|
|
9393
|
-
142,
|
|
9394
|
-
8,
|
|
9395
|
-
99,
|
|
9396
|
-
37,
|
|
9397
|
-
240,
|
|
9398
|
-
21,
|
|
9399
|
-
10,
|
|
9400
|
-
23,
|
|
9401
|
-
190,
|
|
9402
|
-
6,
|
|
9403
|
-
148,
|
|
9404
|
-
247,
|
|
9405
|
-
120,
|
|
9406
|
-
234,
|
|
9407
|
-
75,
|
|
9408
|
-
0,
|
|
9409
|
-
26,
|
|
9410
|
-
197,
|
|
9411
|
-
62,
|
|
9412
|
-
94,
|
|
9413
|
-
252,
|
|
9414
|
-
219,
|
|
9415
|
-
203,
|
|
9416
|
-
117,
|
|
9417
|
-
35,
|
|
9418
|
-
11,
|
|
9419
|
-
32,
|
|
9420
|
-
57,
|
|
9421
|
-
177,
|
|
9422
|
-
33,
|
|
9423
|
-
88,
|
|
9424
|
-
237,
|
|
9425
|
-
149,
|
|
9426
|
-
56,
|
|
9427
|
-
87,
|
|
9428
|
-
174,
|
|
9429
|
-
20,
|
|
9430
|
-
125,
|
|
9431
|
-
136,
|
|
9432
|
-
171,
|
|
9433
|
-
168,
|
|
9434
|
-
68,
|
|
9435
|
-
175,
|
|
9436
|
-
74,
|
|
9437
|
-
165,
|
|
9438
|
-
71,
|
|
9439
|
-
134,
|
|
9440
|
-
139,
|
|
9441
|
-
48,
|
|
9442
|
-
27,
|
|
9443
|
-
166,
|
|
9444
|
-
77,
|
|
9445
|
-
146,
|
|
9446
|
-
158,
|
|
9447
|
-
231,
|
|
9448
|
-
83,
|
|
9449
|
-
111,
|
|
9450
|
-
229,
|
|
9451
|
-
122,
|
|
9452
|
-
60,
|
|
9453
|
-
211,
|
|
9454
|
-
133,
|
|
9455
|
-
230,
|
|
9456
|
-
220,
|
|
9457
|
-
105,
|
|
9458
|
-
92,
|
|
9459
|
-
41,
|
|
9460
|
-
55,
|
|
9461
|
-
46,
|
|
9462
|
-
245,
|
|
9463
|
-
40,
|
|
9464
|
-
244,
|
|
9465
|
-
102,
|
|
9466
|
-
143,
|
|
9467
|
-
54,
|
|
9468
|
-
65,
|
|
9469
|
-
25,
|
|
9470
|
-
63,
|
|
9471
|
-
161,
|
|
9472
|
-
1,
|
|
9473
|
-
216,
|
|
9474
|
-
80,
|
|
9475
|
-
73,
|
|
9476
|
-
209,
|
|
9477
|
-
76,
|
|
9478
|
-
132,
|
|
9479
|
-
187,
|
|
9480
|
-
208,
|
|
9481
|
-
89,
|
|
9482
|
-
18,
|
|
9483
|
-
169,
|
|
9484
|
-
200,
|
|
9485
|
-
196,
|
|
9486
|
-
135,
|
|
9487
|
-
130,
|
|
9488
|
-
116,
|
|
9489
|
-
188,
|
|
9490
|
-
159,
|
|
9491
|
-
86,
|
|
9492
|
-
164,
|
|
9493
|
-
100,
|
|
9494
|
-
109,
|
|
9495
|
-
198,
|
|
9496
|
-
173,
|
|
9497
|
-
186,
|
|
9498
|
-
3,
|
|
9499
|
-
64,
|
|
9500
|
-
52,
|
|
9501
|
-
217,
|
|
9502
|
-
226,
|
|
9503
|
-
250,
|
|
9504
|
-
124,
|
|
9505
|
-
123,
|
|
9506
|
-
5,
|
|
9507
|
-
202,
|
|
9508
|
-
38,
|
|
9509
|
-
147,
|
|
9510
|
-
118,
|
|
9511
|
-
126,
|
|
9512
|
-
255,
|
|
9513
|
-
82,
|
|
9514
|
-
85,
|
|
9515
|
-
212,
|
|
9516
|
-
207,
|
|
9517
|
-
206,
|
|
9518
|
-
59,
|
|
9519
|
-
227,
|
|
9520
|
-
47,
|
|
9521
|
-
16,
|
|
9522
|
-
58,
|
|
9523
|
-
17,
|
|
9524
|
-
182,
|
|
9525
|
-
189,
|
|
9526
|
-
28,
|
|
9527
|
-
42,
|
|
9528
|
-
223,
|
|
9529
|
-
183,
|
|
9530
|
-
170,
|
|
9531
|
-
213,
|
|
9532
|
-
119,
|
|
9533
|
-
248,
|
|
9534
|
-
152,
|
|
9535
|
-
2,
|
|
9536
|
-
44,
|
|
9537
|
-
154,
|
|
9538
|
-
163,
|
|
9539
|
-
70,
|
|
9540
|
-
221,
|
|
9541
|
-
153,
|
|
9542
|
-
101,
|
|
9543
|
-
155,
|
|
9544
|
-
167,
|
|
9545
|
-
43,
|
|
9546
|
-
172,
|
|
9547
|
-
9,
|
|
9548
|
-
129,
|
|
9549
|
-
22,
|
|
9550
|
-
39,
|
|
9551
|
-
253,
|
|
9552
|
-
19,
|
|
9553
|
-
98,
|
|
9554
|
-
108,
|
|
9555
|
-
110,
|
|
9556
|
-
79,
|
|
9557
|
-
113,
|
|
9558
|
-
224,
|
|
9559
|
-
232,
|
|
9560
|
-
178,
|
|
9561
|
-
185,
|
|
9562
|
-
112,
|
|
9563
|
-
104,
|
|
9564
|
-
218,
|
|
9565
|
-
246,
|
|
9566
|
-
97,
|
|
9567
|
-
228,
|
|
9568
|
-
251,
|
|
9569
|
-
34,
|
|
9570
|
-
242,
|
|
9571
|
-
193,
|
|
9572
|
-
238,
|
|
9573
|
-
210,
|
|
9574
|
-
144,
|
|
9575
|
-
12,
|
|
9576
|
-
191,
|
|
9577
|
-
179,
|
|
9578
|
-
162,
|
|
9579
|
-
241,
|
|
9580
|
-
81,
|
|
9581
|
-
51,
|
|
9582
|
-
145,
|
|
9583
|
-
235,
|
|
9584
|
-
249,
|
|
9585
|
-
14,
|
|
9586
|
-
239,
|
|
9587
|
-
107,
|
|
9588
|
-
49,
|
|
9589
|
-
192,
|
|
9590
|
-
214,
|
|
9591
|
-
31,
|
|
9592
|
-
181,
|
|
9593
|
-
199,
|
|
9594
|
-
106,
|
|
9595
|
-
157,
|
|
9596
|
-
184,
|
|
9597
|
-
84,
|
|
9598
|
-
204,
|
|
9599
|
-
176,
|
|
9600
|
-
115,
|
|
9601
|
-
121,
|
|
9602
|
-
50,
|
|
9603
|
-
45,
|
|
9604
|
-
127,
|
|
9605
|
-
4,
|
|
9606
|
-
150,
|
|
9607
|
-
254,
|
|
9608
|
-
138,
|
|
9609
|
-
236,
|
|
9610
|
-
205,
|
|
9611
|
-
93,
|
|
9612
|
-
222,
|
|
9613
|
-
114,
|
|
9614
|
-
67,
|
|
9615
|
-
29,
|
|
9616
|
-
24,
|
|
9617
|
-
72,
|
|
9618
|
-
243,
|
|
9619
|
-
141,
|
|
9620
|
-
128,
|
|
9621
|
-
195,
|
|
9622
|
-
78,
|
|
9623
|
-
66,
|
|
9624
|
-
215,
|
|
9625
|
-
61,
|
|
9626
|
-
156,
|
|
9627
|
-
180
|
|
9628
|
-
];
|
|
9629
|
-
for (let i = 0; i < 256; i++) {
|
|
9630
|
-
perm[i] = p[i];
|
|
9631
|
-
perm[i + 256] = p[i];
|
|
9632
|
-
}
|
|
9633
|
-
var F2 = 0.5 * (Math.sqrt(3) - 1);
|
|
9634
|
-
var G2 = (3 - Math.sqrt(3)) / 6;
|
|
9635
|
-
function dot2(g, x, y) {
|
|
9636
|
-
return g[0] * x + g[1] * y;
|
|
9637
|
-
}
|
|
9638
|
-
function simplex2d(x, y) {
|
|
9639
|
-
const s = (x + y) * F2;
|
|
9640
|
-
const i = Math.floor(x + s);
|
|
9641
|
-
const j = Math.floor(y + s);
|
|
9642
|
-
const t = (i + j) * G2;
|
|
9643
|
-
const X0 = i - t;
|
|
9644
|
-
const Y0 = j - t;
|
|
9645
|
-
const x0 = x - X0;
|
|
9646
|
-
const y0 = y - Y0;
|
|
9647
|
-
const i1 = x0 > y0 ? 1 : 0;
|
|
9648
|
-
const j1 = x0 > y0 ? 0 : 1;
|
|
9649
|
-
const x1 = x0 - i1 + G2;
|
|
9650
|
-
const y1 = y0 - j1 + G2;
|
|
9651
|
-
const x2 = x0 - 1 + 2 * G2;
|
|
9652
|
-
const y2 = y0 - 1 + 2 * G2;
|
|
9653
|
-
const ii = i & 255;
|
|
9654
|
-
const jj = j & 255;
|
|
9655
|
-
const gi0 = perm[ii + perm[jj]] % 8;
|
|
9656
|
-
const gi1 = perm[ii + i1 + perm[jj + j1]] % 8;
|
|
9657
|
-
const gi2 = perm[ii + 1 + perm[jj + 1]] % 8;
|
|
9658
|
-
let n0 = 0;
|
|
9659
|
-
let t0 = 0.5 - x0 * x0 - y0 * y0;
|
|
9660
|
-
if (t0 >= 0) {
|
|
9661
|
-
t0 *= t0;
|
|
9662
|
-
n0 = t0 * t0 * dot2(grad2[gi0], x0, y0);
|
|
9663
|
-
}
|
|
9664
|
-
let n1 = 0;
|
|
9665
|
-
let t1 = 0.5 - x1 * x1 - y1 * y1;
|
|
9666
|
-
if (t1 >= 0) {
|
|
9667
|
-
t1 *= t1;
|
|
9668
|
-
n1 = t1 * t1 * dot2(grad2[gi1], x1, y1);
|
|
9669
|
-
}
|
|
9670
|
-
let n2 = 0;
|
|
9671
|
-
let t2 = 0.5 - x2 * x2 - y2 * y2;
|
|
9672
|
-
if (t2 >= 0) {
|
|
9673
|
-
t2 *= t2;
|
|
9674
|
-
n2 = t2 * t2 * dot2(grad2[gi2], x2, y2);
|
|
9675
|
-
}
|
|
9676
|
-
return 70 * (n0 + n1 + n2);
|
|
9677
|
-
}
|
|
9678
|
-
|
|
9679
9691
|
// src/animation/ProceduralLifeLayer.ts
|
|
9692
|
+
import { createNoise2D } from "simplex-noise";
|
|
9693
|
+
var simplex2d = createNoise2D();
|
|
9680
9694
|
var PHASE_OPEN = 0;
|
|
9681
9695
|
var PHASE_CLOSING = 1;
|
|
9682
9696
|
var PHASE_CLOSED = 2;
|
|
@@ -9984,6 +9998,684 @@ var ProceduralLifeLayer = class {
|
|
|
9984
9998
|
}
|
|
9985
9999
|
};
|
|
9986
10000
|
|
|
10001
|
+
// src/orchestration/MicLipSync.ts
|
|
10002
|
+
var logger18 = createLogger("MicLipSync");
|
|
10003
|
+
/**
 * Real-time lip sync driven by the local microphone.
 *
 * Microphone PCM chunks are converted to float samples and pushed into an
 * A2EProcessor. Every frame the processor yields is scaled with the current
 * expression profile and re-emitted as a "frame" event. When a VAD instance is
 * supplied, the audio is additionally re-chunked to the VAD's chunk size and
 * "speech:start" / "speech:end" are emitted on transitions.
 *
 * Events emitted: "frame", "error", "audio:level", "mic:start", "mic:stop",
 * "speech:start", "speech:end", "state".
 */
var MicLipSync = class extends EventEmitter {
  /**
   * @param {object} config
   * @param {object} config.lam - A2E backend handed to the A2EProcessor.
   * @param {object} [config.vad] - Optional VAD exposing getChunkSize() and process().
   * @param {object} [config.profile] - Expression profile applied to every frame (default {}).
   * @param {number} [config.sampleRate] - Sample rate for mic and processor (default 16000).
   * @param {number} [config.micChunkSize] - Microphone chunk size (default 512).
   * @param {number} [config.identityIndex] - Forwarded to the A2EProcessor.
   */
  constructor(config) {
    super();
    // Private bus the MicrophoneCapture publishes "audio.chunk" / "audio.level" on.
    this.omoteEvents = new EventEmitter();
    this._state = "idle";
    this._isSpeaking = false;
    this._currentFrame = null;
    this._currentRawFrame = null;
    // VAD state
    this.speechStartTime = 0;
    this.vadChunkSize = 0;
    this.vadBuffer = null;
    this.vadBufferOffset = 0;
    // Tail of the serialized VAD inference chain (see processVAD).
    this.vadQueue = Promise.resolve();
    this.profile = config.profile ?? {};
    this.vad = config.vad;
    this.mic = new MicrophoneCapture(this.omoteEvents, {
      sampleRate: config.sampleRate ?? 16e3,
      chunkSize: config.micChunkSize ?? 512
    });
    this.processor = new A2EProcessor({
      backend: config.lam,
      sampleRate: config.sampleRate ?? 16e3,
      identityIndex: config.identityIndex,
      onFrame: (raw) => {
        const scaled = applyProfile(raw, this.profile);
        this._currentFrame = scaled;
        this._currentRawFrame = raw;
        this.emit("frame", { blendshapes: scaled, rawBlendshapes: raw });
      },
      onError: (error) => {
        logger18.error("A2E inference error", { message: error.message });
        this.emit("error", error);
      }
    });
    this.omoteEvents.on("audio.chunk", ({ pcm }) => {
      const float32 = int16ToFloat32(pcm);
      this.processor.pushAudio(float32);
      if (this.vad) {
        // Not awaited on purpose: VAD failures are caught and logged inside the chain.
        this.processVAD(float32);
      }
    });
    this.omoteEvents.on("audio.level", (level) => {
      this.emit("audio:level", level);
    });
    if (this.vad) {
      this.vadChunkSize = this.vad.getChunkSize();
      this.vadBuffer = new Float32Array(this.vadChunkSize);
      this.vadBufferOffset = 0;
    }
  }
  /** Current state */
  get state() {
    return this._state;
  }
  /** Latest blendshape frame (null before first inference) */
  get currentFrame() {
    return this._currentFrame;
  }
  /** Whether speech is currently detected (requires VAD) */
  get isSpeaking() {
    return this._isSpeaking;
  }
  /** Returns "active" while a processor instance exists, otherwise null */
  get backend() {
    return this.processor ? "active" : null;
  }
  // ---------------------------------------------------------------------------
  // Public API
  // ---------------------------------------------------------------------------
  /** Start microphone capture and inference loop (no-op when already active) */
  async start() {
    if (this._state === "active") return;
    await this.mic.start();
    this.processor.startDrip();
    this.emit("mic:start", void 0);
    this.setState("active");
  }
  /** Stop microphone and inference (no-op when idle) */
  stop() {
    if (this._state === "idle") return;
    this.processor.stopDrip();
    this.mic.stop();
    this._isSpeaking = false;
    this.emit("mic:stop", void 0);
    this.setState("idle");
  }
  /** Pause inference (mic stays open for faster resume) */
  pause() {
    if (this._state !== "active") return;
    this.processor.stopDrip();
    this.setState("paused");
  }
  /** Resume inference after pause */
  resume() {
    if (this._state !== "paused") return;
    this.processor.startDrip();
    this.setState("active");
  }
  /** Update ExpressionProfile at runtime */
  setProfile(profile) {
    this.profile = profile;
  }
  /** Dispose of all resources */
  async dispose() {
    this.stop();
    this.processor.dispose();
  }
  // ---------------------------------------------------------------------------
  // Internal: VAD processing
  // ---------------------------------------------------------------------------
  /**
   * Re-chunks incoming samples to the VAD chunk size and schedules inference.
   *
   * Buffering happens synchronously and each full chunk is copied before it is
   * queued, so overlapping calls (one per mic chunk, none awaited by the
   * caller) can no longer interleave writes into the shared buffer or mutate a
   * chunk the VAD is still reading. Inference runs strictly in arrival order.
   *
   * @param {Float32Array} samples - Float samples from one microphone chunk.
   * @returns {Promise<void>} Settles once every chunk queued so far has been processed.
   */
  async processVAD(samples) {
    if (!this.vad || !this.vadBuffer) return;
    for (let i = 0; i < samples.length; i++) {
      this.vadBuffer[this.vadBufferOffset++] = samples[i];
      if (this.vadBufferOffset >= this.vadChunkSize) {
        const chunk = this.vadBuffer.slice();
        this.vadBufferOffset = 0;
        this.vadQueue = this.vadQueue.then(() => this.runVADChunk(chunk));
      }
    }
    return this.vadQueue;
  }
  /**
   * Runs the VAD on one complete chunk and emits speech transition events.
   * Never rejects: failures are logged so the inference chain keeps going.
   *
   * @param {Float32Array} chunk - Private copy of exactly one VAD-sized chunk.
   */
  async runVADChunk(chunk) {
    try {
      const result = await this.vad.process(chunk);
      const wasSpeaking = this._isSpeaking;
      this._isSpeaking = result.isSpeech;
      if (!wasSpeaking && result.isSpeech) {
        this.speechStartTime = performance.now();
        this.emit("speech:start", void 0);
      } else if (wasSpeaking && !result.isSpeech) {
        const durationMs = performance.now() - this.speechStartTime;
        this.emit("speech:end", { durationMs });
      }
    } catch (err) {
      logger18.warn("VAD process error", { error: String(err) });
    }
  }
  // ---------------------------------------------------------------------------
  // Internal: State management
  // ---------------------------------------------------------------------------
  /** Sets the state and emits "state" only when the value actually changes */
  setState(state) {
    if (this._state === state) return;
    this._state = state;
    this.emit("state", state);
  }
};
|
|
10145
|
+
|
|
10146
|
+
// src/orchestration/VoicePipeline.ts
|
|
10147
|
+
var logger19 = createLogger("VoicePipeline");
|
|
10148
|
+
var VoicePipeline = class extends EventEmitter {
|
|
10149
|
+
constructor(config) {
|
|
10150
|
+
super();
|
|
10151
|
+
// State
|
|
10152
|
+
this._state = "idle";
|
|
10153
|
+
this.stopped = false;
|
|
10154
|
+
this.epoch = 0;
|
|
10155
|
+
this._sessionId = null;
|
|
10156
|
+
// Models
|
|
10157
|
+
this.asr = null;
|
|
10158
|
+
this.lam = null;
|
|
10159
|
+
this.vad = null;
|
|
10160
|
+
this.unifiedWorker = null;
|
|
10161
|
+
// Pipelines
|
|
10162
|
+
this.playback = null;
|
|
10163
|
+
this.interruption = null;
|
|
10164
|
+
this.omoteEvents = new EventEmitter();
|
|
10165
|
+
this.mic = null;
|
|
10166
|
+
// Audio accumulation
|
|
10167
|
+
this.audioBuffer = [];
|
|
10168
|
+
this.audioBufferSamples = 0;
|
|
10169
|
+
this.speechStartTime = 0;
|
|
10170
|
+
this.silenceTimer = null;
|
|
10171
|
+
this.isSpeaking = false;
|
|
10172
|
+
// Progressive transcription
|
|
10173
|
+
this.progressiveTimer = null;
|
|
10174
|
+
this.progressivePromise = null;
|
|
10175
|
+
this.lastProgressiveResult = null;
|
|
10176
|
+
this.lastProgressiveSamples = 0;
|
|
10177
|
+
// ASR error recovery
|
|
10178
|
+
this.asrErrorCount = 0;
|
|
10179
|
+
// Response abort
|
|
10180
|
+
this.responseAbortController = null;
|
|
10181
|
+
// Frame refs
|
|
10182
|
+
this._currentFrame = null;
|
|
10183
|
+
this.config = config;
|
|
10184
|
+
}
|
|
10185
|
+
/** Current pipeline state (the value of the private `_state` field) */
get state() {
  return this._state;
}
/** Latest blendshape frame stored in `_currentFrame`, or null when none is held */
get currentFrame() {
  return this._currentFrame;
}
/** Whether user is currently speaking (mirrors the internal `isSpeaking` flag) */
get isSpeechActive() {
  return this.isSpeaking;
}
/** Session ID (generated on start(), null before) */
get sessionId() {
  return this._sessionId;
}
|
|
10201
|
+
// ---------------------------------------------------------------------------
|
|
10202
|
+
// Model loading
|
|
10203
|
+
// ---------------------------------------------------------------------------
|
|
10204
|
+
async loadModels() {
|
|
10205
|
+
this.setState("loading");
|
|
10206
|
+
const timeoutMs = this.config.lamLoadTimeoutMs ?? 3e4;
|
|
10207
|
+
try {
|
|
10208
|
+
if (isIOS()) {
|
|
10209
|
+
this.unifiedWorker = new UnifiedInferenceWorker();
|
|
10210
|
+
await this.unifiedWorker.init();
|
|
10211
|
+
}
|
|
10212
|
+
this.emitProgress("Speech recognition", 0, 3, 0);
|
|
10213
|
+
this.asr = createSenseVoice({
|
|
10214
|
+
modelUrl: this.config.models.senseVoice.modelUrl,
|
|
10215
|
+
tokensUrl: this.config.models.senseVoice.tokensUrl,
|
|
10216
|
+
language: this.config.models.senseVoice.language,
|
|
10217
|
+
unifiedWorker: this.unifiedWorker ?? void 0
|
|
10218
|
+
});
|
|
10219
|
+
await this.asr.load();
|
|
10220
|
+
this.emitProgress("Speech recognition", 45, 3, 1);
|
|
10221
|
+
this.emitProgress("Lip sync", 45, 3, 1);
|
|
10222
|
+
let lam = createA2E({
|
|
10223
|
+
gpuModelUrl: this.config.models.lam.gpuModelUrl,
|
|
10224
|
+
gpuExternalDataUrl: this.config.models.lam.gpuExternalDataUrl,
|
|
10225
|
+
cpuModelUrl: this.config.models.lam.cpuModelUrl,
|
|
10226
|
+
mode: this.config.models.lam.mode,
|
|
10227
|
+
unifiedWorker: this.unifiedWorker ?? void 0
|
|
10228
|
+
});
|
|
10229
|
+
let lamProgress = 45;
|
|
10230
|
+
const lamTickInterval = setInterval(() => {
|
|
10231
|
+
const remaining = 85 - lamProgress;
|
|
10232
|
+
lamProgress += Math.max(0.5, remaining * 0.08);
|
|
10233
|
+
this.emitProgress("Lip sync", Math.round(lamProgress), 3, 1);
|
|
10234
|
+
}, 300);
|
|
10235
|
+
try {
|
|
10236
|
+
const lamLoadResult = await Promise.race([
|
|
10237
|
+
lam.load().then(() => "ok"),
|
|
10238
|
+
new Promise((r) => setTimeout(() => r("timeout"), timeoutMs))
|
|
10239
|
+
]);
|
|
10240
|
+
if (lamLoadResult === "timeout") {
|
|
10241
|
+
logger19.warn(`LAM GPU load timed out after ${timeoutMs}ms, falling back to CPU`);
|
|
10242
|
+
await lam.dispose();
|
|
10243
|
+
lam = createA2E({
|
|
10244
|
+
gpuModelUrl: this.config.models.lam.gpuModelUrl,
|
|
10245
|
+
cpuModelUrl: this.config.models.lam.cpuModelUrl,
|
|
10246
|
+
mode: "cpu",
|
|
10247
|
+
unifiedWorker: this.unifiedWorker ?? void 0
|
|
10248
|
+
});
|
|
10249
|
+
await lam.load();
|
|
10250
|
+
}
|
|
10251
|
+
} finally {
|
|
10252
|
+
clearInterval(lamTickInterval);
|
|
10253
|
+
}
|
|
10254
|
+
this.lam = lam;
|
|
10255
|
+
this.emitProgress("Lip sync", 85, 3, 2);
|
|
10256
|
+
this.emitProgress("Voice detection", 85, 3, 2);
|
|
10257
|
+
this.vad = createSileroVAD({
|
|
10258
|
+
modelUrl: this.config.models.vad.modelUrl,
|
|
10259
|
+
threshold: this.config.models.vad.threshold,
|
|
10260
|
+
unifiedWorker: this.unifiedWorker ?? void 0
|
|
10261
|
+
});
|
|
10262
|
+
await this.vad.load();
|
|
10263
|
+
this.emitProgress("Voice detection", 100, 3, 3);
|
|
10264
|
+
this.playback = new PlaybackPipeline({
|
|
10265
|
+
lam: this.lam,
|
|
10266
|
+
profile: this.config.profile,
|
|
10267
|
+
identityIndex: this.config.identityIndex,
|
|
10268
|
+
neutralTransitionEnabled: this.config.neutralTransitionEnabled ?? true,
|
|
10269
|
+
neutralTransitionMs: this.config.neutralTransitionMs,
|
|
10270
|
+
audioDelayMs: this.config.audioDelayMs,
|
|
10271
|
+
chunkTargetMs: this.config.chunkTargetMs
|
|
10272
|
+
});
|
|
10273
|
+
await this.playback.initialize();
|
|
10274
|
+
this.playback.on("frame", (f) => {
|
|
10275
|
+
this._currentFrame = f.blendshapes;
|
|
10276
|
+
this.emit("frame", f);
|
|
10277
|
+
});
|
|
10278
|
+
this.playback.on("frame:raw", (f) => this.emit("frame:raw", f));
|
|
10279
|
+
this.playback.on("playback:start", (t) => this.emit("playback:start", t));
|
|
10280
|
+
this.playback.on("playback:complete", () => {
|
|
10281
|
+
if (this.stopped) return;
|
|
10282
|
+
this.emit("playback:complete", void 0);
|
|
10283
|
+
this.vad?.reset();
|
|
10284
|
+
this.epoch++;
|
|
10285
|
+
this.setState("listening");
|
|
10286
|
+
});
|
|
10287
|
+
this.playback.on("error", (e) => this.emit("error", e));
|
|
10288
|
+
this.interruption = new InterruptionHandler({
|
|
10289
|
+
enabled: this.config.interruptionEnabled ?? true,
|
|
10290
|
+
minSpeechDurationMs: this.config.interruptionMinSpeechMs ?? 200
|
|
10291
|
+
});
|
|
10292
|
+
this.interruption.on("interruption.triggered", () => {
|
|
10293
|
+
this.handleInterruption();
|
|
10294
|
+
});
|
|
10295
|
+
this.setState("ready");
|
|
10296
|
+
} catch (error) {
|
|
10297
|
+
const err = error instanceof Error ? error : new Error(String(error));
|
|
10298
|
+
logger19.error("Model loading failed", { message: err.message });
|
|
10299
|
+
this.emit("error", err);
|
|
10300
|
+
this.setState("error");
|
|
10301
|
+
throw err;
|
|
10302
|
+
}
|
|
10303
|
+
}
|
|
10304
|
+
// ---------------------------------------------------------------------------
|
|
10305
|
+
// Conversation lifecycle
|
|
10306
|
+
// ---------------------------------------------------------------------------
|
|
10307
|
+
async start() {
|
|
10308
|
+
if (this._state !== "ready") {
|
|
10309
|
+
throw new Error(`Cannot start: state is '${this._state}', expected 'ready'`);
|
|
10310
|
+
}
|
|
10311
|
+
this.stopped = false;
|
|
10312
|
+
this.epoch++;
|
|
10313
|
+
this._sessionId = crypto.randomUUID();
|
|
10314
|
+
this.asrErrorCount = 0;
|
|
10315
|
+
this.mic = new MicrophoneCapture(this.omoteEvents, {
|
|
10316
|
+
sampleRate: 16e3,
|
|
10317
|
+
chunkSize: 512
|
|
10318
|
+
});
|
|
10319
|
+
this.omoteEvents.on("audio.chunk", ({ pcm }) => {
|
|
10320
|
+
const float32 = int16ToFloat32(pcm);
|
|
10321
|
+
this.processAudioChunk(float32);
|
|
10322
|
+
});
|
|
10323
|
+
this.omoteEvents.on("audio.level", (level) => {
|
|
10324
|
+
this.emit("audio:level", level);
|
|
10325
|
+
});
|
|
10326
|
+
await this.mic.start();
|
|
10327
|
+
this.setState("listening");
|
|
10328
|
+
}
|
|
10329
|
+
stop() {
|
|
10330
|
+
this.stopped = true;
|
|
10331
|
+
this.epoch++;
|
|
10332
|
+
this.clearSilenceTimer();
|
|
10333
|
+
this.stopProgressiveTranscription();
|
|
10334
|
+
this.responseAbortController?.abort();
|
|
10335
|
+
this.responseAbortController = null;
|
|
10336
|
+
this.vad?.reset();
|
|
10337
|
+
this.playback?.stop();
|
|
10338
|
+
this.mic?.stop();
|
|
10339
|
+
this.mic = null;
|
|
10340
|
+
this.isSpeaking = false;
|
|
10341
|
+
this.audioBuffer = [];
|
|
10342
|
+
this.audioBufferSamples = 0;
|
|
10343
|
+
this._currentFrame = null;
|
|
10344
|
+
this.interruption?.setAISpeaking(false);
|
|
10345
|
+
if (this._state !== "idle") {
|
|
10346
|
+
this.setState("ready");
|
|
10347
|
+
}
|
|
10348
|
+
}
|
|
10349
|
+
setProfile(profile) {
|
|
10350
|
+
this.config.profile = profile;
|
|
10351
|
+
this.playback?.setProfile(profile);
|
|
10352
|
+
}
|
|
10353
|
+
async dispose() {
|
|
10354
|
+
this.stop();
|
|
10355
|
+
this.epoch++;
|
|
10356
|
+
await this.playback?.dispose();
|
|
10357
|
+
await this.asr?.dispose();
|
|
10358
|
+
await this.lam?.dispose();
|
|
10359
|
+
await this.vad?.dispose();
|
|
10360
|
+
this.playback = null;
|
|
10361
|
+
this.asr = null;
|
|
10362
|
+
this.lam = null;
|
|
10363
|
+
this.vad = null;
|
|
10364
|
+
this._state = "idle";
|
|
10365
|
+
}
|
|
10366
|
+
// ---------------------------------------------------------------------------
|
|
10367
|
+
// Audio processing
|
|
10368
|
+
// ---------------------------------------------------------------------------
|
|
10369
|
+
async processAudioChunk(samples) {
|
|
10370
|
+
if (!this.vad) return;
|
|
10371
|
+
try {
|
|
10372
|
+
const result = await this.vad.process(samples);
|
|
10373
|
+
if (this._state === "speaking" && this.interruption) {
|
|
10374
|
+
this.interruption.processVADResult(result.probability);
|
|
10375
|
+
return;
|
|
10376
|
+
}
|
|
10377
|
+
if (this._state !== "listening" && this._state !== "thinking") return;
|
|
10378
|
+
const wasSpeaking = this.isSpeaking;
|
|
10379
|
+
if (result.isSpeech) {
|
|
10380
|
+
if (!wasSpeaking) {
|
|
10381
|
+
this.isSpeaking = true;
|
|
10382
|
+
this.speechStartTime = performance.now();
|
|
10383
|
+
this.audioBuffer = [];
|
|
10384
|
+
this.audioBufferSamples = 0;
|
|
10385
|
+
this.lastProgressiveResult = null;
|
|
10386
|
+
this.lastProgressiveSamples = 0;
|
|
10387
|
+
this.emit("speech:start", void 0);
|
|
10388
|
+
this.startProgressiveTranscription();
|
|
10389
|
+
}
|
|
10390
|
+
this.audioBuffer.push(new Float32Array(samples));
|
|
10391
|
+
this.audioBufferSamples += samples.length;
|
|
10392
|
+
this.clearSilenceTimer();
|
|
10393
|
+
} else if (wasSpeaking) {
|
|
10394
|
+
this.audioBuffer.push(new Float32Array(samples));
|
|
10395
|
+
this.audioBufferSamples += samples.length;
|
|
10396
|
+
if (!this.silenceTimer) {
|
|
10397
|
+
const timeoutMs = this.getSilenceTimeout();
|
|
10398
|
+
this.silenceTimer = setTimeout(() => {
|
|
10399
|
+
this.onSilenceDetected();
|
|
10400
|
+
}, timeoutMs);
|
|
10401
|
+
}
|
|
10402
|
+
}
|
|
10403
|
+
} catch (err) {
|
|
10404
|
+
logger19.warn("VAD error", { error: String(err) });
|
|
10405
|
+
}
|
|
10406
|
+
}
|
|
10407
|
+
// ---------------------------------------------------------------------------
|
|
10408
|
+
// Silence detection
|
|
10409
|
+
// ---------------------------------------------------------------------------
|
|
10410
|
+
getSilenceTimeout() {
|
|
10411
|
+
const base = this.config.silenceTimeoutMs ?? 500;
|
|
10412
|
+
const extended = this.config.silenceTimeoutExtendedMs ?? 700;
|
|
10413
|
+
const adaptive = this.config.adaptiveTimeout ?? true;
|
|
10414
|
+
if (!adaptive) return base;
|
|
10415
|
+
const speechDurationMs = performance.now() - this.speechStartTime;
|
|
10416
|
+
return speechDurationMs > 3e3 ? extended : base;
|
|
10417
|
+
}
|
|
10418
|
+
onSilenceDetected() {
|
|
10419
|
+
const capturedEpoch = this.epoch;
|
|
10420
|
+
this.isSpeaking = false;
|
|
10421
|
+
const durationMs = performance.now() - this.speechStartTime;
|
|
10422
|
+
this.emit("speech:end", { durationMs });
|
|
10423
|
+
this.clearSilenceTimer();
|
|
10424
|
+
this.processEndOfSpeech(capturedEpoch).catch((err) => {
|
|
10425
|
+
logger19.error("End of speech processing failed", { error: String(err) });
|
|
10426
|
+
if (this.epoch === capturedEpoch && !this.stopped) {
|
|
10427
|
+
this.emit("error", err instanceof Error ? err : new Error(String(err)));
|
|
10428
|
+
this.setState("listening");
|
|
10429
|
+
}
|
|
10430
|
+
});
|
|
10431
|
+
}
|
|
10432
|
+
// ---------------------------------------------------------------------------
|
|
10433
|
+
// End of speech → transcription → response
|
|
10434
|
+
// ---------------------------------------------------------------------------
|
|
10435
|
+
async processEndOfSpeech(capturedEpoch) {
|
|
10436
|
+
if (this.progressivePromise) {
|
|
10437
|
+
try {
|
|
10438
|
+
await this.progressivePromise;
|
|
10439
|
+
} catch {
|
|
10440
|
+
}
|
|
10441
|
+
}
|
|
10442
|
+
this.stopProgressiveTranscription();
|
|
10443
|
+
if (this.epoch !== capturedEpoch || this.stopped) return;
|
|
10444
|
+
const totalSamples = this.audioBufferSamples;
|
|
10445
|
+
const fullAudio = new Float32Array(totalSamples);
|
|
10446
|
+
let offset = 0;
|
|
10447
|
+
for (const chunk of this.audioBuffer) {
|
|
10448
|
+
fullAudio.set(chunk, offset);
|
|
10449
|
+
offset += chunk.length;
|
|
10450
|
+
}
|
|
10451
|
+
this.audioBuffer = [];
|
|
10452
|
+
this.audioBufferSamples = 0;
|
|
10453
|
+
const minDuration = this.config.minAudioDurationSec ?? 0.3;
|
|
10454
|
+
const minEnergy = this.config.minAudioEnergy ?? 0.02;
|
|
10455
|
+
const durationSec = totalSamples / 16e3;
|
|
10456
|
+
if (durationSec < minDuration) {
|
|
10457
|
+
logger19.info("Audio too short, discarding", { durationSec });
|
|
10458
|
+
this.setState("listening");
|
|
10459
|
+
return;
|
|
10460
|
+
}
|
|
10461
|
+
let maxAbs = 0;
|
|
10462
|
+
for (let i = 0; i < fullAudio.length; i++) {
|
|
10463
|
+
const abs = Math.abs(fullAudio[i]);
|
|
10464
|
+
if (abs > maxAbs) maxAbs = abs;
|
|
10465
|
+
}
|
|
10466
|
+
let rms = 0;
|
|
10467
|
+
for (let i = 0; i < fullAudio.length; i++) {
|
|
10468
|
+
rms += fullAudio[i] * fullAudio[i];
|
|
10469
|
+
}
|
|
10470
|
+
rms = Math.sqrt(rms / fullAudio.length);
|
|
10471
|
+
if (rms < minEnergy) {
|
|
10472
|
+
logger19.info("Audio too quiet, discarding", { rms });
|
|
10473
|
+
this.setState("listening");
|
|
10474
|
+
return;
|
|
10475
|
+
}
|
|
10476
|
+
const normalizedAudio = this.normalizeAudio(fullAudio);
|
|
10477
|
+
this.setState("thinking");
|
|
10478
|
+
let transcript = null;
|
|
10479
|
+
const coverageThreshold = this.config.progressiveCoverageThreshold ?? 0.8;
|
|
10480
|
+
if (this.lastProgressiveResult && this.lastProgressiveResult.text.trim().length > 0 && this.lastProgressiveSamples >= totalSamples * coverageThreshold) {
|
|
10481
|
+
transcript = { ...this.lastProgressiveResult, isFinal: true };
|
|
10482
|
+
logger19.info("Using progressive result", {
|
|
10483
|
+
coverage: (this.lastProgressiveSamples / totalSamples).toFixed(2),
|
|
10484
|
+
text: transcript.text
|
|
10485
|
+
});
|
|
10486
|
+
} else {
|
|
10487
|
+
this.lastProgressiveResult = null;
|
|
10488
|
+
transcript = await this.transcribeWithTimeout(normalizedAudio);
|
|
10489
|
+
if (transcript) {
|
|
10490
|
+
transcript.isFinal = true;
|
|
10491
|
+
}
|
|
10492
|
+
}
|
|
10493
|
+
if (this.epoch !== capturedEpoch || this.stopped) return;
|
|
10494
|
+
if (!transcript || !transcript.text.trim()) {
|
|
10495
|
+
logger19.info("No transcript, resuming listening");
|
|
10496
|
+
this.setState("listening");
|
|
10497
|
+
return;
|
|
10498
|
+
}
|
|
10499
|
+
this.emit("transcript", transcript);
|
|
10500
|
+
await this.callResponseHandler(transcript, capturedEpoch);
|
|
10501
|
+
}
|
|
10502
|
+
// ---------------------------------------------------------------------------
|
|
10503
|
+
// Response handler
|
|
10504
|
+
// ---------------------------------------------------------------------------
|
|
10505
|
+
async callResponseHandler(transcript, capturedEpoch) {
|
|
10506
|
+
if (this.epoch !== capturedEpoch || this.stopped) return;
|
|
10507
|
+
this.setState("speaking");
|
|
10508
|
+
this.interruption?.setAISpeaking(true);
|
|
10509
|
+
const abortController = new AbortController();
|
|
10510
|
+
this.responseAbortController = abortController;
|
|
10511
|
+
try {
|
|
10512
|
+
this.playback.start();
|
|
10513
|
+
await this.config.onResponse({
|
|
10514
|
+
text: transcript.text,
|
|
10515
|
+
emotion: transcript.emotion,
|
|
10516
|
+
event: transcript.event,
|
|
10517
|
+
send: async (chunk) => {
|
|
10518
|
+
if (abortController.signal.aborted) return;
|
|
10519
|
+
await this.playback.onAudioChunk(chunk);
|
|
10520
|
+
},
|
|
10521
|
+
done: async () => {
|
|
10522
|
+
if (abortController.signal.aborted) return;
|
|
10523
|
+
await this.playback.end();
|
|
10524
|
+
},
|
|
10525
|
+
signal: abortController.signal,
|
|
10526
|
+
sessionId: this._sessionId
|
|
10527
|
+
});
|
|
10528
|
+
} catch (error) {
|
|
10529
|
+
if (abortController.signal.aborted) return;
|
|
10530
|
+
const err = error instanceof Error ? error : new Error(String(error));
|
|
10531
|
+
logger19.error("Response handler error", { message: err.message });
|
|
10532
|
+
this.emit("error", err);
|
|
10533
|
+
if (this.epoch === capturedEpoch && !this.stopped) {
|
|
10534
|
+
this.interruption?.setAISpeaking(false);
|
|
10535
|
+
this.setState("listening");
|
|
10536
|
+
}
|
|
10537
|
+
} finally {
|
|
10538
|
+
this.responseAbortController = null;
|
|
10539
|
+
}
|
|
10540
|
+
}
|
|
10541
|
+
// ---------------------------------------------------------------------------
|
|
10542
|
+
// Interruption handling
|
|
10543
|
+
// ---------------------------------------------------------------------------
|
|
10544
|
+
handleInterruption() {
|
|
10545
|
+
if (this._state !== "speaking") return;
|
|
10546
|
+
logger19.info("Interruption triggered");
|
|
10547
|
+
this.epoch++;
|
|
10548
|
+
this.responseAbortController?.abort();
|
|
10549
|
+
this.playback?.stop();
|
|
10550
|
+
this.interruption?.setAISpeaking(false);
|
|
10551
|
+
this.emit("interruption", void 0);
|
|
10552
|
+
if (!this.stopped) {
|
|
10553
|
+
this.setState("listening");
|
|
10554
|
+
}
|
|
10555
|
+
}
|
|
10556
|
+
// ---------------------------------------------------------------------------
|
|
10557
|
+
// Progressive transcription
|
|
10558
|
+
// ---------------------------------------------------------------------------
|
|
10559
|
+
startProgressiveTranscription() {
|
|
10560
|
+
this.stopProgressiveTranscription();
|
|
10561
|
+
const intervalMs = isIOS() ? this.config.progressiveIntervalIosMs ?? 800 : this.config.progressiveIntervalMs ?? 500;
|
|
10562
|
+
const minSamples = this.config.progressiveMinSamples ?? 8e3;
|
|
10563
|
+
this.progressiveTimer = setInterval(() => {
|
|
10564
|
+
if (this.audioBufferSamples < minSamples) return;
|
|
10565
|
+
if (!this.asr) return;
|
|
10566
|
+
const capturedEpoch = this.epoch;
|
|
10567
|
+
const snapshot = new Float32Array(this.audioBufferSamples);
|
|
10568
|
+
let offset = 0;
|
|
10569
|
+
for (const chunk of this.audioBuffer) {
|
|
10570
|
+
snapshot.set(chunk, offset);
|
|
10571
|
+
offset += chunk.length;
|
|
10572
|
+
}
|
|
10573
|
+
const snapshotSamples = this.audioBufferSamples;
|
|
10574
|
+
this.progressivePromise = (async () => {
|
|
10575
|
+
try {
|
|
10576
|
+
const result = await this.transcribeWithTimeout(snapshot);
|
|
10577
|
+
if (this.epoch !== capturedEpoch) return;
|
|
10578
|
+
if (result && result.text.trim()) {
|
|
10579
|
+
this.lastProgressiveResult = result;
|
|
10580
|
+
this.lastProgressiveSamples = snapshotSamples;
|
|
10581
|
+
this.emit("transcript", { ...result, isFinal: false });
|
|
10582
|
+
}
|
|
10583
|
+
} catch {
|
|
10584
|
+
}
|
|
10585
|
+
})();
|
|
10586
|
+
}, intervalMs);
|
|
10587
|
+
}
|
|
10588
|
+
stopProgressiveTranscription() {
|
|
10589
|
+
if (this.progressiveTimer) {
|
|
10590
|
+
clearInterval(this.progressiveTimer);
|
|
10591
|
+
this.progressiveTimer = null;
|
|
10592
|
+
}
|
|
10593
|
+
}
|
|
10594
|
+
// ---------------------------------------------------------------------------
|
|
10595
|
+
// Transcription with timeout + ASR error recovery
|
|
10596
|
+
// ---------------------------------------------------------------------------
|
|
10597
|
+
async transcribeWithTimeout(audio) {
|
|
10598
|
+
if (!this.asr) return null;
|
|
10599
|
+
const timeoutMs = this.config.transcriptionTimeoutMs ?? 1e4;
|
|
10600
|
+
const startTime = performance.now();
|
|
10601
|
+
try {
|
|
10602
|
+
const result = await Promise.race([
|
|
10603
|
+
this.asr.transcribe(audio),
|
|
10604
|
+
new Promise(
|
|
10605
|
+
(_, reject) => setTimeout(() => reject(new Error(`Transcription timed out after ${timeoutMs}ms`)), timeoutMs)
|
|
10606
|
+
)
|
|
10607
|
+
]);
|
|
10608
|
+
this.asrErrorCount = 0;
|
|
10609
|
+
return {
|
|
10610
|
+
text: result.text,
|
|
10611
|
+
emotion: result.emotion,
|
|
10612
|
+
language: result.language,
|
|
10613
|
+
isFinal: false,
|
|
10614
|
+
inferenceTimeMs: performance.now() - startTime
|
|
10615
|
+
};
|
|
10616
|
+
} catch (error) {
|
|
10617
|
+
this.asrErrorCount++;
|
|
10618
|
+
logger19.warn("Transcription failed", {
|
|
10619
|
+
attempt: this.asrErrorCount,
|
|
10620
|
+
error: String(error)
|
|
10621
|
+
});
|
|
10622
|
+
if (this.asrErrorCount >= 3) {
|
|
10623
|
+
logger19.warn("3 consecutive ASR errors, recreating session");
|
|
10624
|
+
try {
|
|
10625
|
+
await this.asr.dispose();
|
|
10626
|
+
this.asr = createSenseVoice({
|
|
10627
|
+
modelUrl: this.config.models.senseVoice.modelUrl,
|
|
10628
|
+
tokensUrl: this.config.models.senseVoice.tokensUrl,
|
|
10629
|
+
language: this.config.models.senseVoice.language,
|
|
10630
|
+
unifiedWorker: this.unifiedWorker ?? void 0
|
|
10631
|
+
});
|
|
10632
|
+
await this.asr.load();
|
|
10633
|
+
this.asrErrorCount = 0;
|
|
10634
|
+
} catch (recreateErr) {
|
|
10635
|
+
logger19.error("ASR session recreation failed", { error: String(recreateErr) });
|
|
10636
|
+
}
|
|
10637
|
+
}
|
|
10638
|
+
return null;
|
|
10639
|
+
}
|
|
10640
|
+
}
|
|
10641
|
+
// ---------------------------------------------------------------------------
|
|
10642
|
+
// Audio normalization
|
|
10643
|
+
// ---------------------------------------------------------------------------
|
|
10644
|
+
normalizeAudio(audio) {
|
|
10645
|
+
if (!(this.config.normalizeAudio ?? true)) return audio;
|
|
10646
|
+
let maxAbs = 0;
|
|
10647
|
+
for (let i = 0; i < audio.length; i++) {
|
|
10648
|
+
const abs = Math.abs(audio[i]);
|
|
10649
|
+
if (abs > maxAbs) maxAbs = abs;
|
|
10650
|
+
}
|
|
10651
|
+
if (maxAbs >= 0.1 || maxAbs === 0) return audio;
|
|
10652
|
+
const gain = 0.5 / maxAbs;
|
|
10653
|
+
const normalized = new Float32Array(audio.length);
|
|
10654
|
+
for (let i = 0; i < audio.length; i++) {
|
|
10655
|
+
normalized[i] = audio[i] * gain;
|
|
10656
|
+
}
|
|
10657
|
+
return normalized;
|
|
10658
|
+
}
|
|
10659
|
+
// ---------------------------------------------------------------------------
|
|
10660
|
+
// Helpers
|
|
10661
|
+
// ---------------------------------------------------------------------------
|
|
10662
|
+
setState(state) {
|
|
10663
|
+
if (this._state === state) return;
|
|
10664
|
+
logger19.info("State transition", { from: this._state, to: state });
|
|
10665
|
+
this._state = state;
|
|
10666
|
+
this.emit("state", state);
|
|
10667
|
+
}
|
|
10668
|
+
emitProgress(currentModel, progress, totalModels, modelsLoaded) {
|
|
10669
|
+
this.emit("loading:progress", { currentModel, progress, totalModels, modelsLoaded });
|
|
10670
|
+
}
|
|
10671
|
+
clearSilenceTimer() {
|
|
10672
|
+
if (this.silenceTimer) {
|
|
10673
|
+
clearTimeout(this.silenceTimer);
|
|
10674
|
+
this.silenceTimer = null;
|
|
10675
|
+
}
|
|
10676
|
+
}
|
|
10677
|
+
};
|
|
10678
|
+
|
|
9987
10679
|
// ../types/dist/index.mjs
|
|
9988
10680
|
var PROTOCOL_VERSION = 1;
|
|
9989
10681
|
function isProtocolEvent(obj) {
|
|
@@ -10016,11 +10708,13 @@ export {
|
|
|
10016
10708
|
LOG_LEVEL_PRIORITY,
|
|
10017
10709
|
MODEL_LOAD_TIME_BUCKETS,
|
|
10018
10710
|
MetricNames,
|
|
10711
|
+
MicLipSync,
|
|
10019
10712
|
MicrophoneCapture,
|
|
10020
10713
|
ModelCache,
|
|
10021
10714
|
OTLPExporter,
|
|
10022
10715
|
OmoteTelemetry,
|
|
10023
10716
|
PROTOCOL_VERSION,
|
|
10717
|
+
PlaybackPipeline,
|
|
10024
10718
|
ProceduralLifeLayer,
|
|
10025
10719
|
RingBuffer,
|
|
10026
10720
|
SafariSpeechRecognition,
|
|
@@ -10031,10 +10725,12 @@ export {
|
|
|
10031
10725
|
SileroVADUnifiedAdapter,
|
|
10032
10726
|
SileroVADWorker,
|
|
10033
10727
|
UnifiedInferenceWorker,
|
|
10728
|
+
VoicePipeline,
|
|
10034
10729
|
Wav2ArkitCpuInference,
|
|
10035
10730
|
Wav2ArkitCpuUnifiedAdapter,
|
|
10036
10731
|
Wav2ArkitCpuWorker,
|
|
10037
10732
|
Wav2Vec2Inference,
|
|
10733
|
+
applyProfile,
|
|
10038
10734
|
blendEmotions,
|
|
10039
10735
|
calculatePeak,
|
|
10040
10736
|
calculateRMS,
|