@omote/core 0.5.6 → 0.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +484 -35
- package/dist/index.d.ts +484 -35
- package/dist/index.js +1191 -495
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +1186 -490
- package/dist/index.mjs.map +1 -1
- package/package.json +3 -2
package/dist/index.js
CHANGED
|
@@ -56,11 +56,13 @@ __export(index_exports, {
|
|
|
56
56
|
LOG_LEVEL_PRIORITY: () => LOG_LEVEL_PRIORITY,
|
|
57
57
|
MODEL_LOAD_TIME_BUCKETS: () => MODEL_LOAD_TIME_BUCKETS,
|
|
58
58
|
MetricNames: () => MetricNames,
|
|
59
|
+
MicLipSync: () => MicLipSync,
|
|
59
60
|
MicrophoneCapture: () => MicrophoneCapture,
|
|
60
61
|
ModelCache: () => ModelCache,
|
|
61
62
|
OTLPExporter: () => OTLPExporter,
|
|
62
63
|
OmoteTelemetry: () => OmoteTelemetry,
|
|
63
64
|
PROTOCOL_VERSION: () => PROTOCOL_VERSION,
|
|
65
|
+
PlaybackPipeline: () => PlaybackPipeline,
|
|
64
66
|
ProceduralLifeLayer: () => ProceduralLifeLayer,
|
|
65
67
|
RingBuffer: () => RingBuffer,
|
|
66
68
|
SafariSpeechRecognition: () => SafariSpeechRecognition,
|
|
@@ -71,10 +73,12 @@ __export(index_exports, {
|
|
|
71
73
|
SileroVADUnifiedAdapter: () => SileroVADUnifiedAdapter,
|
|
72
74
|
SileroVADWorker: () => SileroVADWorker,
|
|
73
75
|
UnifiedInferenceWorker: () => UnifiedInferenceWorker,
|
|
76
|
+
VoicePipeline: () => VoicePipeline,
|
|
74
77
|
Wav2ArkitCpuInference: () => Wav2ArkitCpuInference,
|
|
75
78
|
Wav2ArkitCpuUnifiedAdapter: () => Wav2ArkitCpuUnifiedAdapter,
|
|
76
79
|
Wav2ArkitCpuWorker: () => Wav2ArkitCpuWorker,
|
|
77
80
|
Wav2Vec2Inference: () => Wav2Vec2Inference,
|
|
81
|
+
applyProfile: () => applyProfile,
|
|
78
82
|
blendEmotions: () => blendEmotions,
|
|
79
83
|
calculatePeak: () => calculatePeak,
|
|
80
84
|
calculateRMS: () => calculateRMS,
|
|
@@ -867,12 +871,12 @@ var Logger = class _Logger {
|
|
|
867
871
|
};
|
|
868
872
|
var loggerCache = /* @__PURE__ */ new Map();
|
|
869
873
|
function createLogger(module2) {
|
|
870
|
-
let
|
|
871
|
-
if (!
|
|
872
|
-
|
|
873
|
-
loggerCache.set(module2,
|
|
874
|
+
let logger20 = loggerCache.get(module2);
|
|
875
|
+
if (!logger20) {
|
|
876
|
+
logger20 = new Logger(module2);
|
|
877
|
+
loggerCache.set(module2, logger20);
|
|
874
878
|
}
|
|
875
|
-
return
|
|
879
|
+
return logger20;
|
|
876
880
|
}
|
|
877
881
|
var noopLogger = {
|
|
878
882
|
module: "noop",
|
|
@@ -1168,6 +1172,24 @@ var A2EProcessor = class {
|
|
|
1168
1172
|
}
|
|
1169
1173
|
};
|
|
1170
1174
|
|
|
1175
|
+
// src/audio/audioUtils.ts
|
|
1176
|
+
function pcm16ToFloat32(buffer) {
|
|
1177
|
+
const byteLen = buffer.byteLength & ~1;
|
|
1178
|
+
const int16 = byteLen === buffer.byteLength ? new Int16Array(buffer) : new Int16Array(buffer, 0, byteLen / 2);
|
|
1179
|
+
const float32 = new Float32Array(int16.length);
|
|
1180
|
+
for (let i = 0; i < int16.length; i++) {
|
|
1181
|
+
float32[i] = int16[i] / 32768;
|
|
1182
|
+
}
|
|
1183
|
+
return float32;
|
|
1184
|
+
}
|
|
1185
|
+
function int16ToFloat32(int16) {
|
|
1186
|
+
const float32 = new Float32Array(int16.length);
|
|
1187
|
+
for (let i = 0; i < int16.length; i++) {
|
|
1188
|
+
float32[i] = int16[i] / 32768;
|
|
1189
|
+
}
|
|
1190
|
+
return float32;
|
|
1191
|
+
}
|
|
1192
|
+
|
|
1171
1193
|
// src/telemetry/exporters/console.ts
|
|
1172
1194
|
var ConsoleExporter = class {
|
|
1173
1195
|
constructor(options = {}) {
|
|
@@ -3221,19 +3243,7 @@ _Wav2Vec2Inference.INFERENCE_TIMEOUT_MS = 5e3;
|
|
|
3221
3243
|
_Wav2Vec2Inference.isWebGPUAvailable = isWebGPUAvailable;
|
|
3222
3244
|
var Wav2Vec2Inference = _Wav2Vec2Inference;
|
|
3223
3245
|
|
|
3224
|
-
// src/audio/
|
|
3225
|
-
function pcm16ToFloat32(buffer) {
|
|
3226
|
-
const byteLen = buffer.byteLength & ~1;
|
|
3227
|
-
const int16 = byteLen === buffer.byteLength ? new Int16Array(buffer) : new Int16Array(buffer, 0, byteLen / 2);
|
|
3228
|
-
const float32 = new Float32Array(int16.length);
|
|
3229
|
-
for (let i = 0; i < int16.length; i++) {
|
|
3230
|
-
float32[i] = int16[i] / 32768;
|
|
3231
|
-
}
|
|
3232
|
-
return float32;
|
|
3233
|
-
}
|
|
3234
|
-
|
|
3235
|
-
// src/audio/FullFacePipeline.ts
|
|
3236
|
-
var logger4 = createLogger("FullFacePipeline");
|
|
3246
|
+
// src/audio/expressionProfile.ts
|
|
3237
3247
|
var BLENDSHAPE_TO_GROUP = /* @__PURE__ */ new Map();
|
|
3238
3248
|
for (const name of LAM_BLENDSHAPES) {
|
|
3239
3249
|
if (name.startsWith("eye")) {
|
|
@@ -3252,6 +3262,24 @@ for (const name of LAM_BLENDSHAPES) {
|
|
|
3252
3262
|
BLENDSHAPE_TO_GROUP.set(name, "tongue");
|
|
3253
3263
|
}
|
|
3254
3264
|
}
|
|
3265
|
+
function applyProfile(raw, profile) {
|
|
3266
|
+
const scaled = new Float32Array(52);
|
|
3267
|
+
for (let i = 0; i < 52; i++) {
|
|
3268
|
+
const name = LAM_BLENDSHAPES[i];
|
|
3269
|
+
let scaler;
|
|
3270
|
+
if (profile.overrides && profile.overrides[name] !== void 0) {
|
|
3271
|
+
scaler = profile.overrides[name];
|
|
3272
|
+
} else {
|
|
3273
|
+
const group = BLENDSHAPE_TO_GROUP.get(name);
|
|
3274
|
+
scaler = group ? profile[group] ?? 1 : 1;
|
|
3275
|
+
}
|
|
3276
|
+
scaled[i] = Math.min(1, Math.max(0, raw[i] * scaler));
|
|
3277
|
+
}
|
|
3278
|
+
return scaled;
|
|
3279
|
+
}
|
|
3280
|
+
|
|
3281
|
+
// src/audio/FullFacePipeline.ts
|
|
3282
|
+
var logger4 = createLogger("FullFacePipeline");
|
|
3255
3283
|
var FullFacePipeline = class extends EventEmitter {
|
|
3256
3284
|
constructor(options) {
|
|
3257
3285
|
super();
|
|
@@ -3316,25 +3344,10 @@ var FullFacePipeline = class extends EventEmitter {
|
|
|
3316
3344
|
/**
|
|
3317
3345
|
* Apply ExpressionProfile scaling to raw A2E blendshapes.
|
|
3318
3346
|
*
|
|
3319
|
-
*
|
|
3320
|
-
* 1. If an override exists for the blendshape name, use override as scaler
|
|
3321
|
-
* 2. Otherwise, use the group scaler (default 1.0)
|
|
3322
|
-
* 3. Clamp result to [0, 1]
|
|
3347
|
+
* Delegates to the standalone applyProfile() utility from expressionProfile.ts.
|
|
3323
3348
|
*/
|
|
3324
3349
|
applyProfile(raw) {
|
|
3325
|
-
|
|
3326
|
-
for (let i = 0; i < 52; i++) {
|
|
3327
|
-
const name = LAM_BLENDSHAPES[i];
|
|
3328
|
-
let scaler;
|
|
3329
|
-
if (this.profile.overrides && this.profile.overrides[name] !== void 0) {
|
|
3330
|
-
scaler = this.profile.overrides[name];
|
|
3331
|
-
} else {
|
|
3332
|
-
const group = BLENDSHAPE_TO_GROUP.get(name);
|
|
3333
|
-
scaler = group ? this.profile[group] ?? 1 : 1;
|
|
3334
|
-
}
|
|
3335
|
-
scaled[i] = Math.min(1, Math.max(0, raw[i] * scaler));
|
|
3336
|
-
}
|
|
3337
|
-
return scaled;
|
|
3350
|
+
return applyProfile(raw, this.profile);
|
|
3338
3351
|
}
|
|
3339
3352
|
/**
|
|
3340
3353
|
* Start a new playback session
|
|
@@ -3519,6 +3532,329 @@ var FullFacePipeline = class extends EventEmitter {
|
|
|
3519
3532
|
}
|
|
3520
3533
|
};
|
|
3521
3534
|
|
|
3535
|
+
// src/audio/PlaybackPipeline.ts
|
|
3536
|
+
var logger5 = createLogger("PlaybackPipeline");
|
|
3537
|
+
var PlaybackPipeline = class extends EventEmitter {
|
|
3538
|
+
constructor(config) {
|
|
3539
|
+
super();
|
|
3540
|
+
this.config = config;
|
|
3541
|
+
this._state = "idle";
|
|
3542
|
+
this.playbackStarted = false;
|
|
3543
|
+
this.monitorInterval = null;
|
|
3544
|
+
this.frameAnimationId = null;
|
|
3545
|
+
// Stale frame detection
|
|
3546
|
+
this.lastNewFrameTime = 0;
|
|
3547
|
+
this.lastKnownLamFrame = null;
|
|
3548
|
+
this.staleWarningEmitted = false;
|
|
3549
|
+
// Diagnostic counter
|
|
3550
|
+
this.frameLoopCount = 0;
|
|
3551
|
+
this.neutralTransitionFrame = null;
|
|
3552
|
+
this.neutralTransitionStart = 0;
|
|
3553
|
+
this.neutralAnimationId = null;
|
|
3554
|
+
// Current frame refs
|
|
3555
|
+
this._currentFrame = null;
|
|
3556
|
+
this._currentRawFrame = null;
|
|
3557
|
+
this.sampleRate = config.sampleRate ?? 16e3;
|
|
3558
|
+
this.profile = config.profile ?? {};
|
|
3559
|
+
this.staleThresholdMs = config.staleThresholdMs ?? 2e3;
|
|
3560
|
+
this.neutralTransitionEnabled = config.neutralTransitionEnabled ?? false;
|
|
3561
|
+
this.neutralTransitionMs = config.neutralTransitionMs ?? 250;
|
|
3562
|
+
const isCpuModel = config.lam.modelId === "wav2arkit_cpu";
|
|
3563
|
+
const chunkSize = config.chunkSize ?? config.lam.chunkSize ?? 16e3;
|
|
3564
|
+
const chunkAccumulationMs = chunkSize / this.sampleRate * 1e3;
|
|
3565
|
+
const inferenceEstimateMs = isCpuModel ? 300 : config.lam.backend === "wasm" ? 250 : 80;
|
|
3566
|
+
const marginMs = 100;
|
|
3567
|
+
const autoDelay = Math.ceil(chunkAccumulationMs + inferenceEstimateMs + marginMs);
|
|
3568
|
+
const audioDelayMs = config.audioDelayMs ?? autoDelay;
|
|
3569
|
+
logger5.info("PlaybackPipeline config", {
|
|
3570
|
+
chunkSize,
|
|
3571
|
+
audioDelayMs,
|
|
3572
|
+
autoDelay,
|
|
3573
|
+
backend: config.lam.backend,
|
|
3574
|
+
modelId: config.lam.modelId,
|
|
3575
|
+
neutralTransitionEnabled: this.neutralTransitionEnabled
|
|
3576
|
+
});
|
|
3577
|
+
this.scheduler = new AudioScheduler({
|
|
3578
|
+
sampleRate: this.sampleRate,
|
|
3579
|
+
initialLookaheadSec: audioDelayMs / 1e3
|
|
3580
|
+
});
|
|
3581
|
+
this.coalescer = new AudioChunkCoalescer({
|
|
3582
|
+
sampleRate: this.sampleRate,
|
|
3583
|
+
targetDurationMs: config.chunkTargetMs ?? 200
|
|
3584
|
+
});
|
|
3585
|
+
this.processor = new A2EProcessor({
|
|
3586
|
+
backend: config.lam,
|
|
3587
|
+
sampleRate: this.sampleRate,
|
|
3588
|
+
chunkSize,
|
|
3589
|
+
identityIndex: config.identityIndex,
|
|
3590
|
+
onError: (error) => {
|
|
3591
|
+
logger5.error("A2E inference error", { message: error.message, stack: error.stack });
|
|
3592
|
+
this.emit("error", error);
|
|
3593
|
+
}
|
|
3594
|
+
});
|
|
3595
|
+
}
|
|
3596
|
+
/** Current pipeline state */
|
|
3597
|
+
get state() {
|
|
3598
|
+
return this._state;
|
|
3599
|
+
}
|
|
3600
|
+
/** Current scaled blendshapes (updated in-place for perf) */
|
|
3601
|
+
get currentFrame() {
|
|
3602
|
+
return this._currentFrame;
|
|
3603
|
+
}
|
|
3604
|
+
/** Raw A2E blendshapes (before profile scaling) */
|
|
3605
|
+
get currentRawFrame() {
|
|
3606
|
+
return this._currentRawFrame;
|
|
3607
|
+
}
|
|
3608
|
+
// ---------------------------------------------------------------------------
|
|
3609
|
+
// Lifecycle
|
|
3610
|
+
// ---------------------------------------------------------------------------
|
|
3611
|
+
/** Initialize AudioContext (lazy, call after user gesture) */
|
|
3612
|
+
async initialize() {
|
|
3613
|
+
await this.scheduler.initialize();
|
|
3614
|
+
}
|
|
3615
|
+
/** Update ExpressionProfile at runtime */
|
|
3616
|
+
setProfile(profile) {
|
|
3617
|
+
this.profile = profile;
|
|
3618
|
+
}
|
|
3619
|
+
// ---------------------------------------------------------------------------
|
|
3620
|
+
// Async mode (streaming TTS)
|
|
3621
|
+
// ---------------------------------------------------------------------------
|
|
3622
|
+
/**
|
|
3623
|
+
* Start a new playback session.
|
|
3624
|
+
* Idempotent — calling during playback resets cleanly without emitting
|
|
3625
|
+
* spurious playback:complete.
|
|
3626
|
+
*/
|
|
3627
|
+
start() {
|
|
3628
|
+
this.stopInternal(false);
|
|
3629
|
+
this.scheduler.reset();
|
|
3630
|
+
this.coalescer.reset();
|
|
3631
|
+
this.processor.reset();
|
|
3632
|
+
this.playbackStarted = false;
|
|
3633
|
+
this.lastNewFrameTime = 0;
|
|
3634
|
+
this.lastKnownLamFrame = null;
|
|
3635
|
+
this.staleWarningEmitted = false;
|
|
3636
|
+
this.frameLoopCount = 0;
|
|
3637
|
+
this._currentFrame = null;
|
|
3638
|
+
this._currentRawFrame = null;
|
|
3639
|
+
this.cancelNeutralTransition();
|
|
3640
|
+
this.scheduler.warmup();
|
|
3641
|
+
this.startFrameLoop();
|
|
3642
|
+
this.startMonitoring();
|
|
3643
|
+
this.setState("playing");
|
|
3644
|
+
}
|
|
3645
|
+
/** Feed a streaming audio chunk (PCM16 Uint8Array) */
|
|
3646
|
+
async onAudioChunk(chunk) {
|
|
3647
|
+
const combined = this.coalescer.add(chunk);
|
|
3648
|
+
if (!combined) return;
|
|
3649
|
+
const float32 = pcm16ToFloat32(combined);
|
|
3650
|
+
const scheduleTime = await this.scheduler.schedule(float32);
|
|
3651
|
+
if (!this.playbackStarted) {
|
|
3652
|
+
this.playbackStarted = true;
|
|
3653
|
+
this.emit("playback:start", { time: scheduleTime });
|
|
3654
|
+
this.emit("playback_start", scheduleTime);
|
|
3655
|
+
}
|
|
3656
|
+
this.processor.pushAudio(float32, scheduleTime);
|
|
3657
|
+
}
|
|
3658
|
+
/** Signal end of audio stream (flushes remaining audio) */
|
|
3659
|
+
async end() {
|
|
3660
|
+
const remaining = this.coalescer.flush();
|
|
3661
|
+
if (remaining) {
|
|
3662
|
+
const chunk = new Uint8Array(remaining);
|
|
3663
|
+
await this.onAudioChunk(chunk);
|
|
3664
|
+
}
|
|
3665
|
+
await this.processor.flush();
|
|
3666
|
+
}
|
|
3667
|
+
// ---------------------------------------------------------------------------
|
|
3668
|
+
// Sync mode (full buffer)
|
|
3669
|
+
// ---------------------------------------------------------------------------
|
|
3670
|
+
/**
|
|
3671
|
+
* Feed a complete audio buffer. Chunks into 200ms pieces, schedules each
|
|
3672
|
+
* for playback, runs A2E inference, then waits for completion.
|
|
3673
|
+
*/
|
|
3674
|
+
async feedBuffer(audio) {
|
|
3675
|
+
const float32 = audio instanceof Float32Array ? audio : pcm16ToFloat32(audio);
|
|
3676
|
+
this.start();
|
|
3677
|
+
const chunkSamples = Math.floor(this.sampleRate * 0.2);
|
|
3678
|
+
for (let i = 0; i < float32.length; i += chunkSamples) {
|
|
3679
|
+
const chunk = float32.subarray(i, Math.min(i + chunkSamples, float32.length));
|
|
3680
|
+
const scheduleTime = await this.scheduler.schedule(chunk);
|
|
3681
|
+
this.processor.pushAudio(chunk, scheduleTime);
|
|
3682
|
+
if (!this.playbackStarted) {
|
|
3683
|
+
this.playbackStarted = true;
|
|
3684
|
+
this.emit("playback:start", { time: scheduleTime });
|
|
3685
|
+
this.emit("playback_start", scheduleTime);
|
|
3686
|
+
}
|
|
3687
|
+
}
|
|
3688
|
+
await this.processor.flush();
|
|
3689
|
+
return new Promise((resolve) => {
|
|
3690
|
+
const unsub = this.on("playback:complete", () => {
|
|
3691
|
+
unsub();
|
|
3692
|
+
resolve();
|
|
3693
|
+
});
|
|
3694
|
+
});
|
|
3695
|
+
}
|
|
3696
|
+
// ---------------------------------------------------------------------------
|
|
3697
|
+
// Control
|
|
3698
|
+
// ---------------------------------------------------------------------------
|
|
3699
|
+
/** Stop playback immediately with fade-out */
|
|
3700
|
+
async stop(fadeOutMs = 50) {
|
|
3701
|
+
this.setState("stopping");
|
|
3702
|
+
this.stopInternal(true);
|
|
3703
|
+
await this.scheduler.cancelAll(fadeOutMs);
|
|
3704
|
+
this.coalescer.reset();
|
|
3705
|
+
this.processor.reset();
|
|
3706
|
+
this.playbackStarted = false;
|
|
3707
|
+
this._currentFrame = null;
|
|
3708
|
+
this._currentRawFrame = null;
|
|
3709
|
+
this.emit("playback:stop", void 0);
|
|
3710
|
+
this.setState("idle");
|
|
3711
|
+
}
|
|
3712
|
+
/** Cleanup all resources */
|
|
3713
|
+
dispose() {
|
|
3714
|
+
this.stopInternal(true);
|
|
3715
|
+
this.cancelNeutralTransition();
|
|
3716
|
+
this.scheduler.dispose();
|
|
3717
|
+
this.coalescer.reset();
|
|
3718
|
+
this.processor.dispose();
|
|
3719
|
+
this._state = "idle";
|
|
3720
|
+
}
|
|
3721
|
+
/** Get pipeline debug state */
|
|
3722
|
+
getDebugState() {
|
|
3723
|
+
return {
|
|
3724
|
+
state: this._state,
|
|
3725
|
+
playbackStarted: this.playbackStarted,
|
|
3726
|
+
coalescerFill: this.coalescer.fillLevel,
|
|
3727
|
+
processorFill: this.processor.fillLevel,
|
|
3728
|
+
queuedFrames: this.processor.queuedFrameCount,
|
|
3729
|
+
currentTime: this.scheduler.getCurrentTime(),
|
|
3730
|
+
playbackEndTime: this.scheduler.getPlaybackEndTime()
|
|
3731
|
+
};
|
|
3732
|
+
}
|
|
3733
|
+
// ---------------------------------------------------------------------------
|
|
3734
|
+
// Internal: Frame loop
|
|
3735
|
+
// ---------------------------------------------------------------------------
|
|
3736
|
+
startFrameLoop() {
|
|
3737
|
+
const updateFrame = () => {
|
|
3738
|
+
this.frameLoopCount++;
|
|
3739
|
+
const currentTime = this.scheduler.getCurrentTime();
|
|
3740
|
+
const lamFrame = this.processor.getFrameForTime(currentTime);
|
|
3741
|
+
if (lamFrame && lamFrame !== this.lastKnownLamFrame) {
|
|
3742
|
+
this.lastNewFrameTime = performance.now();
|
|
3743
|
+
this.lastKnownLamFrame = lamFrame;
|
|
3744
|
+
this.staleWarningEmitted = false;
|
|
3745
|
+
}
|
|
3746
|
+
if (this.playbackStarted && this.lastNewFrameTime > 0 && performance.now() - this.lastNewFrameTime > this.staleThresholdMs) {
|
|
3747
|
+
if (!this.staleWarningEmitted) {
|
|
3748
|
+
this.staleWarningEmitted = true;
|
|
3749
|
+
logger5.warn("A2E stalled \u2014 no new inference frames", {
|
|
3750
|
+
staleDurationMs: Math.round(performance.now() - this.lastNewFrameTime),
|
|
3751
|
+
queuedFrames: this.processor.queuedFrameCount
|
|
3752
|
+
});
|
|
3753
|
+
}
|
|
3754
|
+
}
|
|
3755
|
+
if (lamFrame) {
|
|
3756
|
+
const scaled = applyProfile(lamFrame, this.profile);
|
|
3757
|
+
this._currentFrame = scaled;
|
|
3758
|
+
this._currentRawFrame = lamFrame;
|
|
3759
|
+
const fullFrame = {
|
|
3760
|
+
blendshapes: scaled,
|
|
3761
|
+
rawBlendshapes: lamFrame,
|
|
3762
|
+
timestamp: currentTime
|
|
3763
|
+
};
|
|
3764
|
+
this.emit("frame", fullFrame);
|
|
3765
|
+
this.emit("frame:raw", lamFrame);
|
|
3766
|
+
this.emit("full_frame_ready", fullFrame);
|
|
3767
|
+
this.emit("lam_frame_ready", lamFrame);
|
|
3768
|
+
}
|
|
3769
|
+
this.frameAnimationId = requestAnimationFrame(updateFrame);
|
|
3770
|
+
};
|
|
3771
|
+
this.frameAnimationId = requestAnimationFrame(updateFrame);
|
|
3772
|
+
}
|
|
3773
|
+
// ---------------------------------------------------------------------------
|
|
3774
|
+
// Internal: Playback monitoring
|
|
3775
|
+
// ---------------------------------------------------------------------------
|
|
3776
|
+
startMonitoring() {
|
|
3777
|
+
if (this.monitorInterval) {
|
|
3778
|
+
clearInterval(this.monitorInterval);
|
|
3779
|
+
}
|
|
3780
|
+
this.monitorInterval = setInterval(() => {
|
|
3781
|
+
if (this.scheduler.isComplete() && this.processor.queuedFrameCount === 0) {
|
|
3782
|
+
this.onPlaybackComplete();
|
|
3783
|
+
}
|
|
3784
|
+
}, 100);
|
|
3785
|
+
}
|
|
3786
|
+
onPlaybackComplete() {
|
|
3787
|
+
this.stopInternal(false);
|
|
3788
|
+
this.playbackStarted = false;
|
|
3789
|
+
this.emit("playback:complete", void 0);
|
|
3790
|
+
this.emit("playback_complete", void 0);
|
|
3791
|
+
if (this.neutralTransitionEnabled && this._currentFrame) {
|
|
3792
|
+
this.startNeutralTransition(this._currentFrame);
|
|
3793
|
+
} else {
|
|
3794
|
+
this.setState("idle");
|
|
3795
|
+
}
|
|
3796
|
+
}
|
|
3797
|
+
// ---------------------------------------------------------------------------
|
|
3798
|
+
// Internal: Neutral transition (opt-in)
|
|
3799
|
+
// ---------------------------------------------------------------------------
|
|
3800
|
+
startNeutralTransition(fromFrame) {
|
|
3801
|
+
this.neutralTransitionFrame = new Float32Array(fromFrame);
|
|
3802
|
+
this.neutralTransitionStart = performance.now();
|
|
3803
|
+
const animate = () => {
|
|
3804
|
+
const elapsed = performance.now() - this.neutralTransitionStart;
|
|
3805
|
+
const t = Math.min(1, elapsed / this.neutralTransitionMs);
|
|
3806
|
+
const eased = 1 - Math.pow(1 - t, 3);
|
|
3807
|
+
const blendshapes = new Float32Array(52);
|
|
3808
|
+
for (let i = 0; i < 52; i++) {
|
|
3809
|
+
blendshapes[i] = this.neutralTransitionFrame[i] * (1 - eased);
|
|
3810
|
+
}
|
|
3811
|
+
this._currentFrame = blendshapes;
|
|
3812
|
+
const frame = {
|
|
3813
|
+
blendshapes,
|
|
3814
|
+
rawBlendshapes: blendshapes,
|
|
3815
|
+
// raw = scaled during transition
|
|
3816
|
+
timestamp: performance.now() / 1e3
|
|
3817
|
+
};
|
|
3818
|
+
this.emit("frame", frame);
|
|
3819
|
+
this.emit("full_frame_ready", frame);
|
|
3820
|
+
if (t >= 1) {
|
|
3821
|
+
this.neutralTransitionFrame = null;
|
|
3822
|
+
this._currentFrame = null;
|
|
3823
|
+
this._currentRawFrame = null;
|
|
3824
|
+
this.setState("idle");
|
|
3825
|
+
return;
|
|
3826
|
+
}
|
|
3827
|
+
this.neutralAnimationId = requestAnimationFrame(animate);
|
|
3828
|
+
};
|
|
3829
|
+
this.neutralAnimationId = requestAnimationFrame(animate);
|
|
3830
|
+
}
|
|
3831
|
+
cancelNeutralTransition() {
|
|
3832
|
+
if (this.neutralAnimationId) {
|
|
3833
|
+
cancelAnimationFrame(this.neutralAnimationId);
|
|
3834
|
+
this.neutralAnimationId = null;
|
|
3835
|
+
}
|
|
3836
|
+
this.neutralTransitionFrame = null;
|
|
3837
|
+
}
|
|
3838
|
+
// ---------------------------------------------------------------------------
|
|
3839
|
+
// Internal: Helpers
|
|
3840
|
+
// ---------------------------------------------------------------------------
|
|
3841
|
+
stopInternal(emitEvents) {
|
|
3842
|
+
if (this.monitorInterval) {
|
|
3843
|
+
clearInterval(this.monitorInterval);
|
|
3844
|
+
this.monitorInterval = null;
|
|
3845
|
+
}
|
|
3846
|
+
if (this.frameAnimationId) {
|
|
3847
|
+
cancelAnimationFrame(this.frameAnimationId);
|
|
3848
|
+
this.frameAnimationId = null;
|
|
3849
|
+
}
|
|
3850
|
+
}
|
|
3851
|
+
setState(state) {
|
|
3852
|
+
if (this._state === state) return;
|
|
3853
|
+
this._state = state;
|
|
3854
|
+
this.emit("state", state);
|
|
3855
|
+
}
|
|
3856
|
+
};
|
|
3857
|
+
|
|
3522
3858
|
// src/audio/InterruptionHandler.ts
|
|
3523
3859
|
var InterruptionHandler = class extends EventEmitter {
|
|
3524
3860
|
constructor(config = {}) {
|
|
@@ -3906,7 +4242,7 @@ function ctcGreedyDecode(logits, seqLen, vocabSize, tokenMap) {
|
|
|
3906
4242
|
}
|
|
3907
4243
|
|
|
3908
4244
|
// src/inference/SenseVoiceInference.ts
|
|
3909
|
-
var
|
|
4245
|
+
var logger6 = createLogger("SenseVoice");
|
|
3910
4246
|
var _SenseVoiceInference = class _SenseVoiceInference {
|
|
3911
4247
|
constructor(config) {
|
|
3912
4248
|
this.session = null;
|
|
@@ -3959,26 +4295,26 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
3959
4295
|
"model.backend_requested": this.config.backend
|
|
3960
4296
|
});
|
|
3961
4297
|
try {
|
|
3962
|
-
|
|
4298
|
+
logger6.info("Loading ONNX Runtime...", { preference: this.config.backend });
|
|
3963
4299
|
const { ort, backend } = await getOnnxRuntimeForPreference(this.config.backend);
|
|
3964
4300
|
this.ort = ort;
|
|
3965
4301
|
this._backend = backend;
|
|
3966
|
-
|
|
3967
|
-
|
|
4302
|
+
logger6.info("ONNX Runtime loaded", { backend: this._backend });
|
|
4303
|
+
logger6.debug("Fetching tokens vocabulary", { tokensUrl: this.config.tokensUrl });
|
|
3968
4304
|
const tokensResponse = await fetch(this.config.tokensUrl);
|
|
3969
4305
|
if (!tokensResponse.ok) {
|
|
3970
4306
|
throw new Error(`Failed to fetch tokens.txt: ${tokensResponse.status} ${tokensResponse.statusText}`);
|
|
3971
4307
|
}
|
|
3972
4308
|
const tokensText = await tokensResponse.text();
|
|
3973
4309
|
this.tokenMap = parseTokensFile(tokensText);
|
|
3974
|
-
|
|
4310
|
+
logger6.debug("Tokens loaded", { vocabSize: this.tokenMap.size });
|
|
3975
4311
|
const sessionOptions = getSessionOptions(this._backend);
|
|
3976
4312
|
if (this._backend === "webgpu") {
|
|
3977
4313
|
sessionOptions.graphOptimizationLevel = "basic";
|
|
3978
4314
|
}
|
|
3979
4315
|
let isCached = false;
|
|
3980
4316
|
if (isIOS()) {
|
|
3981
|
-
|
|
4317
|
+
logger6.info("iOS: passing model URL directly to ORT (low-memory path)", {
|
|
3982
4318
|
modelUrl: this.config.modelUrl
|
|
3983
4319
|
});
|
|
3984
4320
|
this.session = await withTimeout(
|
|
@@ -3991,14 +4327,14 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
3991
4327
|
isCached = await cache.has(this.config.modelUrl);
|
|
3992
4328
|
let modelBuffer;
|
|
3993
4329
|
if (isCached) {
|
|
3994
|
-
|
|
4330
|
+
logger6.debug("Loading model from cache", { modelUrl: this.config.modelUrl });
|
|
3995
4331
|
modelBuffer = await cache.get(this.config.modelUrl);
|
|
3996
4332
|
onProgress?.(modelBuffer.byteLength, modelBuffer.byteLength);
|
|
3997
4333
|
} else {
|
|
3998
|
-
|
|
4334
|
+
logger6.debug("Fetching and caching model", { modelUrl: this.config.modelUrl });
|
|
3999
4335
|
modelBuffer = await fetchWithCache(this.config.modelUrl, onProgress);
|
|
4000
4336
|
}
|
|
4001
|
-
|
|
4337
|
+
logger6.debug("Creating ONNX session", {
|
|
4002
4338
|
size: formatBytes(modelBuffer.byteLength),
|
|
4003
4339
|
backend: this._backend
|
|
4004
4340
|
});
|
|
@@ -4011,15 +4347,15 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
4011
4347
|
const cmvn = parseCMVNFromMetadata(metadata.neg_mean, metadata.inv_stddev);
|
|
4012
4348
|
this.negMean = cmvn.negMean;
|
|
4013
4349
|
this.invStddev = cmvn.invStddev;
|
|
4014
|
-
|
|
4350
|
+
logger6.debug("CMVN loaded from model metadata", { dim: this.negMean.length });
|
|
4015
4351
|
} else {
|
|
4016
|
-
|
|
4352
|
+
logger6.warn("CMVN not found in model metadata \u2014 features will not be normalized");
|
|
4017
4353
|
}
|
|
4018
4354
|
} catch (cmvnErr) {
|
|
4019
|
-
|
|
4355
|
+
logger6.warn("Failed to read CMVN from model metadata", { error: cmvnErr });
|
|
4020
4356
|
}
|
|
4021
4357
|
const loadTimeMs = performance.now() - startTime;
|
|
4022
|
-
|
|
4358
|
+
logger6.info("SenseVoice model loaded", {
|
|
4023
4359
|
backend: this._backend,
|
|
4024
4360
|
loadTimeMs: Math.round(loadTimeMs),
|
|
4025
4361
|
vocabSize: this.tokenMap.size,
|
|
@@ -4130,7 +4466,7 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
4130
4466
|
const vocabSize = logitsDims[2];
|
|
4131
4467
|
const decoded = ctcGreedyDecode(logitsData, seqLen, vocabSize, this.tokenMap);
|
|
4132
4468
|
const inferenceTimeMs = performance.now() - startTime;
|
|
4133
|
-
|
|
4469
|
+
logger6.trace("Transcription complete", {
|
|
4134
4470
|
text: decoded.text.substring(0, 50),
|
|
4135
4471
|
language: decoded.language,
|
|
4136
4472
|
emotion: decoded.emotion,
|
|
@@ -4168,7 +4504,7 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
4168
4504
|
const errMsg = err instanceof Error ? err.message : String(err);
|
|
4169
4505
|
if (errMsg.includes("timed out")) {
|
|
4170
4506
|
this.poisoned = true;
|
|
4171
|
-
|
|
4507
|
+
logger6.error("CRITICAL: Inference session timed out \u2014 SenseVoice is dead. Page reload required.", {
|
|
4172
4508
|
backend: this._backend,
|
|
4173
4509
|
timeoutMs: _SenseVoiceInference.INFERENCE_TIMEOUT_MS
|
|
4174
4510
|
});
|
|
@@ -4176,7 +4512,7 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
4176
4512
|
const oomError = new Error(
|
|
4177
4513
|
`SenseVoice inference failed with raw C++ exception pointer (0x${err.toString(16)}). This is likely an OOM crash in WASM. Try reloading the page.`
|
|
4178
4514
|
);
|
|
4179
|
-
|
|
4515
|
+
logger6.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
|
|
4180
4516
|
pointer: `0x${err.toString(16)}`,
|
|
4181
4517
|
backend: this._backend
|
|
4182
4518
|
});
|
|
@@ -4189,7 +4525,7 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
4189
4525
|
reject(oomError);
|
|
4190
4526
|
return;
|
|
4191
4527
|
} else {
|
|
4192
|
-
|
|
4528
|
+
logger6.error("Inference failed", { error: errMsg, backend: this._backend });
|
|
4193
4529
|
}
|
|
4194
4530
|
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
4195
4531
|
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
@@ -4218,9 +4554,9 @@ _SenseVoiceInference.INFERENCE_TIMEOUT_MS = 1e4;
|
|
|
4218
4554
|
var SenseVoiceInference = _SenseVoiceInference;
|
|
4219
4555
|
|
|
4220
4556
|
// src/inference/SenseVoiceWorker.ts
|
|
4221
|
-
var
|
|
4557
|
+
var logger7 = createLogger("SenseVoiceWorker");
|
|
4222
4558
|
var WASM_CDN_PATH2 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
|
|
4223
|
-
var LOAD_TIMEOUT_MS =
|
|
4559
|
+
var LOAD_TIMEOUT_MS = 3e5;
|
|
4224
4560
|
var INFERENCE_TIMEOUT_MS = 1e4;
|
|
4225
4561
|
function resolveUrl(url) {
|
|
4226
4562
|
if (/^https?:\/\//i.test(url) || /^blob:/i.test(url)) return url;
|
|
@@ -4957,7 +5293,7 @@ var SenseVoiceWorker = class {
|
|
|
4957
5293
|
this.handleWorkerMessage(event.data);
|
|
4958
5294
|
};
|
|
4959
5295
|
worker.onerror = (error) => {
|
|
4960
|
-
|
|
5296
|
+
logger7.error("Worker error", { error: error.message });
|
|
4961
5297
|
for (const [, resolver] of this.pendingResolvers) {
|
|
4962
5298
|
resolver.reject(new Error(`Worker error: ${error.message}`));
|
|
4963
5299
|
}
|
|
@@ -5037,9 +5373,9 @@ var SenseVoiceWorker = class {
|
|
|
5037
5373
|
"model.language": this.config.language
|
|
5038
5374
|
});
|
|
5039
5375
|
try {
|
|
5040
|
-
|
|
5376
|
+
logger7.info("Creating SenseVoice worker...");
|
|
5041
5377
|
this.worker = this.createWorker();
|
|
5042
|
-
|
|
5378
|
+
logger7.info("Loading model in worker...", {
|
|
5043
5379
|
modelUrl: this.config.modelUrl,
|
|
5044
5380
|
tokensUrl: this.config.tokensUrl,
|
|
5045
5381
|
language: this.config.language,
|
|
@@ -5061,7 +5397,7 @@ var SenseVoiceWorker = class {
|
|
|
5061
5397
|
this._isLoaded = true;
|
|
5062
5398
|
const loadTimeMs = performance.now() - startTime;
|
|
5063
5399
|
onProgress?.(1, 1);
|
|
5064
|
-
|
|
5400
|
+
logger7.info("SenseVoice worker loaded successfully", {
|
|
5065
5401
|
backend: "wasm",
|
|
5066
5402
|
loadTimeMs: Math.round(loadTimeMs),
|
|
5067
5403
|
workerLoadTimeMs: Math.round(result.loadTimeMs),
|
|
@@ -5140,7 +5476,7 @@ var SenseVoiceWorker = class {
|
|
|
5140
5476
|
INFERENCE_TIMEOUT_MS
|
|
5141
5477
|
);
|
|
5142
5478
|
const totalTimeMs = performance.now() - startTime;
|
|
5143
|
-
|
|
5479
|
+
logger7.trace("Worker transcription complete", {
|
|
5144
5480
|
text: result.text.substring(0, 50),
|
|
5145
5481
|
language: result.language,
|
|
5146
5482
|
emotion: result.emotion,
|
|
@@ -5176,11 +5512,11 @@ var SenseVoiceWorker = class {
|
|
|
5176
5512
|
} catch (err) {
|
|
5177
5513
|
const errMsg = err instanceof Error ? err.message : String(err);
|
|
5178
5514
|
if (errMsg.includes("timed out")) {
|
|
5179
|
-
|
|
5515
|
+
logger7.error("CRITICAL: Worker inference timed out \u2014 SenseVoice worker is dead. Page reload required.", {
|
|
5180
5516
|
timeoutMs: INFERENCE_TIMEOUT_MS
|
|
5181
5517
|
});
|
|
5182
5518
|
} else {
|
|
5183
|
-
|
|
5519
|
+
logger7.error("Worker inference failed", { error: errMsg });
|
|
5184
5520
|
}
|
|
5185
5521
|
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
5186
5522
|
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
@@ -5218,14 +5554,14 @@ var SenseVoiceWorker = class {
|
|
|
5218
5554
|
};
|
|
5219
5555
|
|
|
5220
5556
|
// src/inference/UnifiedInferenceWorker.ts
|
|
5221
|
-
var
|
|
5557
|
+
var logger8 = createLogger("UnifiedInferenceWorker");
|
|
5222
5558
|
var WASM_CDN_PATH3 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
|
|
5223
|
-
var INIT_TIMEOUT_MS =
|
|
5224
|
-
var SV_LOAD_TIMEOUT_MS =
|
|
5559
|
+
var INIT_TIMEOUT_MS = 6e4;
|
|
5560
|
+
var SV_LOAD_TIMEOUT_MS = 3e5;
|
|
5225
5561
|
var SV_INFER_TIMEOUT_MS = 1e4;
|
|
5226
|
-
var CPU_LOAD_TIMEOUT_MS =
|
|
5562
|
+
var CPU_LOAD_TIMEOUT_MS = 42e4;
|
|
5227
5563
|
var CPU_INFER_TIMEOUT_MS = 5e3;
|
|
5228
|
-
var VAD_LOAD_TIMEOUT_MS =
|
|
5564
|
+
var VAD_LOAD_TIMEOUT_MS = 12e4;
|
|
5229
5565
|
var VAD_INFER_TIMEOUT_MS = 1e3;
|
|
5230
5566
|
var DISPOSE_TIMEOUT_MS = 5e3;
|
|
5231
5567
|
function resolveUrl2(url) {
|
|
@@ -5920,7 +6256,7 @@ var UnifiedInferenceWorker = class {
|
|
|
5920
6256
|
const telemetry = getTelemetry();
|
|
5921
6257
|
const span = telemetry?.startSpan("UnifiedInferenceWorker.init");
|
|
5922
6258
|
try {
|
|
5923
|
-
|
|
6259
|
+
logger8.info("Creating unified inference worker...");
|
|
5924
6260
|
this.worker = this.createWorker();
|
|
5925
6261
|
await this.sendMessage(
|
|
5926
6262
|
{ type: "init", wasmPaths: WASM_CDN_PATH3, isIOS: isIOS() },
|
|
@@ -5929,7 +6265,7 @@ var UnifiedInferenceWorker = class {
|
|
|
5929
6265
|
);
|
|
5930
6266
|
this.initialized = true;
|
|
5931
6267
|
const loadTimeMs = performance.now() - startTime;
|
|
5932
|
-
|
|
6268
|
+
logger8.info("Unified worker initialized", { loadTimeMs: Math.round(loadTimeMs) });
|
|
5933
6269
|
span?.setAttributes({ "worker.init_time_ms": loadTimeMs });
|
|
5934
6270
|
span?.end();
|
|
5935
6271
|
} catch (error) {
|
|
@@ -6103,7 +6439,7 @@ var UnifiedInferenceWorker = class {
|
|
|
6103
6439
|
this.handleWorkerMessage(event.data);
|
|
6104
6440
|
};
|
|
6105
6441
|
worker.onerror = (error) => {
|
|
6106
|
-
|
|
6442
|
+
logger8.error("Unified worker error", { error: error.message });
|
|
6107
6443
|
this.rejectAllPending(`Worker error: ${error.message}`);
|
|
6108
6444
|
};
|
|
6109
6445
|
return worker;
|
|
@@ -6117,7 +6453,7 @@ var UnifiedInferenceWorker = class {
|
|
|
6117
6453
|
this.pendingRequests.delete(requestId);
|
|
6118
6454
|
pending.reject(new Error(data.error));
|
|
6119
6455
|
} else {
|
|
6120
|
-
|
|
6456
|
+
logger8.error("Worker broadcast error", { error: data.error });
|
|
6121
6457
|
this.rejectAllPending(data.error);
|
|
6122
6458
|
}
|
|
6123
6459
|
return;
|
|
@@ -6139,7 +6475,7 @@ var UnifiedInferenceWorker = class {
|
|
|
6139
6475
|
const timeout = setTimeout(() => {
|
|
6140
6476
|
this.pendingRequests.delete(requestId);
|
|
6141
6477
|
this.poisoned = true;
|
|
6142
|
-
|
|
6478
|
+
logger8.error("CRITICAL: Worker operation timed out \u2014 worker is dead", {
|
|
6143
6479
|
type: message.type,
|
|
6144
6480
|
timeoutMs
|
|
6145
6481
|
});
|
|
@@ -6205,7 +6541,7 @@ var SenseVoiceUnifiedAdapter = class {
|
|
|
6205
6541
|
});
|
|
6206
6542
|
this._isLoaded = true;
|
|
6207
6543
|
onProgress?.(1, 1);
|
|
6208
|
-
|
|
6544
|
+
logger8.info("SenseVoice loaded via unified worker", {
|
|
6209
6545
|
backend: "wasm",
|
|
6210
6546
|
loadTimeMs: Math.round(result.loadTimeMs),
|
|
6211
6547
|
vocabSize: result.vocabSize
|
|
@@ -6270,7 +6606,7 @@ var Wav2ArkitCpuUnifiedAdapter = class {
|
|
|
6270
6606
|
externalDataUrl: externalDataUrl || null
|
|
6271
6607
|
});
|
|
6272
6608
|
this._isLoaded = true;
|
|
6273
|
-
|
|
6609
|
+
logger8.info("Wav2ArkitCpu loaded via unified worker", {
|
|
6274
6610
|
backend: "wasm",
|
|
6275
6611
|
loadTimeMs: Math.round(result.loadTimeMs)
|
|
6276
6612
|
});
|
|
@@ -6376,7 +6712,7 @@ var SileroVADUnifiedAdapter = class {
|
|
|
6376
6712
|
sampleRate: this.config.sampleRate
|
|
6377
6713
|
});
|
|
6378
6714
|
this._isLoaded = true;
|
|
6379
|
-
|
|
6715
|
+
logger8.info("SileroVAD loaded via unified worker", {
|
|
6380
6716
|
backend: "wasm",
|
|
6381
6717
|
loadTimeMs: Math.round(result.loadTimeMs),
|
|
6382
6718
|
sampleRate: this.config.sampleRate,
|
|
@@ -6457,10 +6793,10 @@ var SileroVADUnifiedAdapter = class {
|
|
|
6457
6793
|
};
|
|
6458
6794
|
|
|
6459
6795
|
// src/inference/createSenseVoice.ts
|
|
6460
|
-
var
|
|
6796
|
+
var logger9 = createLogger("createSenseVoice");
|
|
6461
6797
|
function createSenseVoice(config) {
|
|
6462
6798
|
if (config.unifiedWorker) {
|
|
6463
|
-
|
|
6799
|
+
logger9.info("Creating SenseVoiceUnifiedAdapter (shared unified worker)");
|
|
6464
6800
|
return new SenseVoiceUnifiedAdapter(config.unifiedWorker, {
|
|
6465
6801
|
modelUrl: config.modelUrl,
|
|
6466
6802
|
tokensUrl: config.tokensUrl,
|
|
@@ -6473,7 +6809,7 @@ function createSenseVoice(config) {
|
|
|
6473
6809
|
if (!SenseVoiceWorker.isSupported()) {
|
|
6474
6810
|
throw new Error("Web Workers are not supported in this environment");
|
|
6475
6811
|
}
|
|
6476
|
-
|
|
6812
|
+
logger9.info("Creating SenseVoiceWorker (off-main-thread)");
|
|
6477
6813
|
return new SenseVoiceWorker({
|
|
6478
6814
|
modelUrl: config.modelUrl,
|
|
6479
6815
|
tokensUrl: config.tokensUrl,
|
|
@@ -6482,7 +6818,7 @@ function createSenseVoice(config) {
|
|
|
6482
6818
|
});
|
|
6483
6819
|
}
|
|
6484
6820
|
if (useWorker === false) {
|
|
6485
|
-
|
|
6821
|
+
logger9.info("Creating SenseVoiceInference (main thread)");
|
|
6486
6822
|
return new SenseVoiceInference({
|
|
6487
6823
|
modelUrl: config.modelUrl,
|
|
6488
6824
|
tokensUrl: config.tokensUrl,
|
|
@@ -6491,7 +6827,7 @@ function createSenseVoice(config) {
|
|
|
6491
6827
|
});
|
|
6492
6828
|
}
|
|
6493
6829
|
if (SenseVoiceWorker.isSupported() && !isIOS()) {
|
|
6494
|
-
|
|
6830
|
+
logger9.info("Auto-detected: creating SenseVoiceWorker (off-main-thread)");
|
|
6495
6831
|
return new SenseVoiceWorker({
|
|
6496
6832
|
modelUrl: config.modelUrl,
|
|
6497
6833
|
tokensUrl: config.tokensUrl,
|
|
@@ -6499,7 +6835,7 @@ function createSenseVoice(config) {
|
|
|
6499
6835
|
textNorm: config.textNorm
|
|
6500
6836
|
});
|
|
6501
6837
|
}
|
|
6502
|
-
|
|
6838
|
+
logger9.info("Auto-detected: creating SenseVoiceInference (main thread)", {
|
|
6503
6839
|
reason: isIOS() ? "iOS (shared ORT instance)" : "Worker unsupported"
|
|
6504
6840
|
});
|
|
6505
6841
|
return new SenseVoiceInference({
|
|
@@ -6511,7 +6847,7 @@ function createSenseVoice(config) {
|
|
|
6511
6847
|
}
|
|
6512
6848
|
|
|
6513
6849
|
// src/inference/Wav2ArkitCpuInference.ts
|
|
6514
|
-
var
|
|
6850
|
+
var logger10 = createLogger("Wav2ArkitCpu");
|
|
6515
6851
|
var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
6516
6852
|
constructor(config) {
|
|
6517
6853
|
this.modelId = "wav2arkit_cpu";
|
|
@@ -6553,16 +6889,16 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6553
6889
|
});
|
|
6554
6890
|
try {
|
|
6555
6891
|
const preference = this.config.backend || "wasm";
|
|
6556
|
-
|
|
6892
|
+
logger10.info("Loading ONNX Runtime...", { preference });
|
|
6557
6893
|
const { ort, backend } = await getOnnxRuntimeForPreference(preference);
|
|
6558
6894
|
this.ort = ort;
|
|
6559
6895
|
this._backend = backend;
|
|
6560
|
-
|
|
6896
|
+
logger10.info("ONNX Runtime loaded", { backend: this._backend });
|
|
6561
6897
|
const modelUrl = this.config.modelUrl;
|
|
6562
6898
|
const dataUrl = this.config.externalDataUrl !== false ? this.config.externalDataUrl || `${modelUrl}.data` : null;
|
|
6563
6899
|
const sessionOptions = getSessionOptions(this._backend);
|
|
6564
6900
|
if (isIOS()) {
|
|
6565
|
-
|
|
6901
|
+
logger10.info("iOS: passing model URLs directly to ORT (low-memory path)", {
|
|
6566
6902
|
modelUrl,
|
|
6567
6903
|
dataUrl
|
|
6568
6904
|
});
|
|
@@ -6584,15 +6920,15 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6584
6920
|
const isCached = await cache.has(modelUrl);
|
|
6585
6921
|
let modelBuffer;
|
|
6586
6922
|
if (isCached) {
|
|
6587
|
-
|
|
6923
|
+
logger10.debug("Loading model from cache", { modelUrl });
|
|
6588
6924
|
modelBuffer = await cache.get(modelUrl);
|
|
6589
6925
|
if (!modelBuffer) {
|
|
6590
|
-
|
|
6926
|
+
logger10.warn("Cache corruption detected, clearing and retrying", { modelUrl });
|
|
6591
6927
|
await cache.delete(modelUrl);
|
|
6592
6928
|
modelBuffer = await fetchWithCache(modelUrl);
|
|
6593
6929
|
}
|
|
6594
6930
|
} else {
|
|
6595
|
-
|
|
6931
|
+
logger10.debug("Fetching and caching model graph", { modelUrl });
|
|
6596
6932
|
modelBuffer = await fetchWithCache(modelUrl);
|
|
6597
6933
|
}
|
|
6598
6934
|
if (!modelBuffer) {
|
|
@@ -6603,31 +6939,31 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6603
6939
|
try {
|
|
6604
6940
|
const isDataCached = await cache.has(dataUrl);
|
|
6605
6941
|
if (isDataCached) {
|
|
6606
|
-
|
|
6942
|
+
logger10.debug("Loading external data from cache", { dataUrl });
|
|
6607
6943
|
externalDataBuffer = await cache.get(dataUrl);
|
|
6608
6944
|
if (!externalDataBuffer) {
|
|
6609
|
-
|
|
6945
|
+
logger10.warn("Cache corruption for external data, retrying", { dataUrl });
|
|
6610
6946
|
await cache.delete(dataUrl);
|
|
6611
6947
|
externalDataBuffer = await fetchWithCache(dataUrl);
|
|
6612
6948
|
}
|
|
6613
6949
|
} else {
|
|
6614
|
-
|
|
6950
|
+
logger10.info("Fetching external model data", {
|
|
6615
6951
|
dataUrl,
|
|
6616
6952
|
note: "This may be a large download (400MB+)"
|
|
6617
6953
|
});
|
|
6618
6954
|
externalDataBuffer = await fetchWithCache(dataUrl);
|
|
6619
6955
|
}
|
|
6620
|
-
|
|
6956
|
+
logger10.info("External data loaded", {
|
|
6621
6957
|
size: formatBytes(externalDataBuffer.byteLength)
|
|
6622
6958
|
});
|
|
6623
6959
|
} catch (err) {
|
|
6624
|
-
|
|
6960
|
+
logger10.debug("No external data file found (single-file model)", {
|
|
6625
6961
|
dataUrl,
|
|
6626
6962
|
error: err.message
|
|
6627
6963
|
});
|
|
6628
6964
|
}
|
|
6629
6965
|
}
|
|
6630
|
-
|
|
6966
|
+
logger10.debug("Creating ONNX session", {
|
|
6631
6967
|
graphSize: formatBytes(modelBuffer.byteLength),
|
|
6632
6968
|
externalDataSize: externalDataBuffer ? formatBytes(externalDataBuffer.byteLength) : "none",
|
|
6633
6969
|
backend: this._backend
|
|
@@ -6643,7 +6979,7 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6643
6979
|
this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
|
|
6644
6980
|
}
|
|
6645
6981
|
const loadTimeMs = performance.now() - startTime;
|
|
6646
|
-
|
|
6982
|
+
logger10.info("Model loaded successfully", {
|
|
6647
6983
|
backend: this._backend,
|
|
6648
6984
|
loadTimeMs: Math.round(loadTimeMs),
|
|
6649
6985
|
inputs: this.session.inputNames,
|
|
@@ -6659,12 +6995,12 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6659
6995
|
model: "wav2arkit_cpu",
|
|
6660
6996
|
backend: this._backend
|
|
6661
6997
|
});
|
|
6662
|
-
|
|
6998
|
+
logger10.debug("Running warmup inference");
|
|
6663
6999
|
const warmupStart = performance.now();
|
|
6664
7000
|
const silentAudio = new Float32Array(16e3);
|
|
6665
7001
|
await this.infer(silentAudio);
|
|
6666
7002
|
const warmupTimeMs = performance.now() - warmupStart;
|
|
6667
|
-
|
|
7003
|
+
logger10.info("Warmup inference complete", {
|
|
6668
7004
|
warmupTimeMs: Math.round(warmupTimeMs),
|
|
6669
7005
|
backend: this._backend
|
|
6670
7006
|
});
|
|
@@ -6751,7 +7087,7 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6751
7087
|
const symmetrized = symmetrizeBlendshapes(rawFrame);
|
|
6752
7088
|
blendshapes.push(symmetrized);
|
|
6753
7089
|
}
|
|
6754
|
-
|
|
7090
|
+
logger10.trace("Inference completed", {
|
|
6755
7091
|
inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
|
|
6756
7092
|
numFrames,
|
|
6757
7093
|
inputSamples
|
|
@@ -6779,7 +7115,7 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6779
7115
|
const errMsg = err instanceof Error ? err.message : String(err);
|
|
6780
7116
|
if (errMsg.includes("timed out")) {
|
|
6781
7117
|
this.poisoned = true;
|
|
6782
|
-
|
|
7118
|
+
logger10.error("CRITICAL: Inference session timed out \u2014 Wav2ArkitCpu is dead. Page reload required.", {
|
|
6783
7119
|
backend: this._backend,
|
|
6784
7120
|
timeoutMs: _Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS
|
|
6785
7121
|
});
|
|
@@ -6787,7 +7123,7 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6787
7123
|
const oomError = new Error(
|
|
6788
7124
|
`Wav2ArkitCpu inference failed with raw C++ exception pointer (0x${err.toString(16)}). This is likely an OOM crash in WASM. Try reloading the page.`
|
|
6789
7125
|
);
|
|
6790
|
-
|
|
7126
|
+
logger10.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
|
|
6791
7127
|
pointer: `0x${err.toString(16)}`,
|
|
6792
7128
|
backend: this._backend
|
|
6793
7129
|
});
|
|
@@ -6800,7 +7136,7 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6800
7136
|
reject(oomError);
|
|
6801
7137
|
return;
|
|
6802
7138
|
} else {
|
|
6803
|
-
|
|
7139
|
+
logger10.error("Inference failed", { error: errMsg, backend: this._backend });
|
|
6804
7140
|
}
|
|
6805
7141
|
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
6806
7142
|
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
@@ -6827,9 +7163,9 @@ _Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS = 5e3;
|
|
|
6827
7163
|
var Wav2ArkitCpuInference = _Wav2ArkitCpuInference;
|
|
6828
7164
|
|
|
6829
7165
|
// src/inference/Wav2ArkitCpuWorker.ts
|
|
6830
|
-
var
|
|
7166
|
+
var logger11 = createLogger("Wav2ArkitCpuWorker");
|
|
6831
7167
|
var WASM_CDN_PATH4 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
|
|
6832
|
-
var LOAD_TIMEOUT_MS2 =
|
|
7168
|
+
var LOAD_TIMEOUT_MS2 = 42e4;
|
|
6833
7169
|
var INFERENCE_TIMEOUT_MS2 = 5e3;
|
|
6834
7170
|
function resolveUrl3(url) {
|
|
6835
7171
|
if (/^https?:\/\//i.test(url) || /^blob:/i.test(url)) return url;
|
|
@@ -7114,7 +7450,7 @@ var Wav2ArkitCpuWorker = class {
|
|
|
7114
7450
|
this.handleWorkerMessage(event.data);
|
|
7115
7451
|
};
|
|
7116
7452
|
worker.onerror = (error) => {
|
|
7117
|
-
|
|
7453
|
+
logger11.error("Worker error", { error: error.message });
|
|
7118
7454
|
for (const [, resolver] of this.pendingResolvers) {
|
|
7119
7455
|
resolver.reject(new Error(`Worker error: ${error.message}`));
|
|
7120
7456
|
}
|
|
@@ -7190,10 +7526,10 @@ var Wav2ArkitCpuWorker = class {
|
|
|
7190
7526
|
"model.backend_requested": "wasm"
|
|
7191
7527
|
});
|
|
7192
7528
|
try {
|
|
7193
|
-
|
|
7529
|
+
logger11.info("Creating wav2arkit_cpu worker...");
|
|
7194
7530
|
this.worker = this.createWorker();
|
|
7195
7531
|
const externalDataUrl = this.config.externalDataUrl !== false ? this.config.externalDataUrl || `${this.config.modelUrl}.data` : null;
|
|
7196
|
-
|
|
7532
|
+
logger11.info("Loading model in worker...", {
|
|
7197
7533
|
modelUrl: this.config.modelUrl,
|
|
7198
7534
|
externalDataUrl,
|
|
7199
7535
|
isIOS: isIOS()
|
|
@@ -7211,7 +7547,7 @@ var Wav2ArkitCpuWorker = class {
|
|
|
7211
7547
|
);
|
|
7212
7548
|
this._isLoaded = true;
|
|
7213
7549
|
const loadTimeMs = performance.now() - startTime;
|
|
7214
|
-
|
|
7550
|
+
logger11.info("Wav2ArkitCpu worker loaded successfully", {
|
|
7215
7551
|
backend: "wasm",
|
|
7216
7552
|
loadTimeMs: Math.round(loadTimeMs),
|
|
7217
7553
|
workerLoadTimeMs: Math.round(result.loadTimeMs),
|
|
@@ -7296,7 +7632,7 @@ var Wav2ArkitCpuWorker = class {
|
|
|
7296
7632
|
for (let f = 0; f < numFrames; f++) {
|
|
7297
7633
|
blendshapes.push(flatBuffer.slice(f * numBlendshapes, (f + 1) * numBlendshapes));
|
|
7298
7634
|
}
|
|
7299
|
-
|
|
7635
|
+
logger11.trace("Worker inference completed", {
|
|
7300
7636
|
inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
|
|
7301
7637
|
workerTimeMs: Math.round(result.inferenceTimeMs * 100) / 100,
|
|
7302
7638
|
numFrames,
|
|
@@ -7326,12 +7662,12 @@ var Wav2ArkitCpuWorker = class {
|
|
|
7326
7662
|
const errMsg = err instanceof Error ? err.message : String(err);
|
|
7327
7663
|
if (errMsg.includes("timed out")) {
|
|
7328
7664
|
this.poisoned = true;
|
|
7329
|
-
|
|
7665
|
+
logger11.error("CRITICAL: Worker inference timed out \u2014 Wav2ArkitCpu worker is dead. Page reload required.", {
|
|
7330
7666
|
backend: "wasm",
|
|
7331
7667
|
timeoutMs: INFERENCE_TIMEOUT_MS2
|
|
7332
7668
|
});
|
|
7333
7669
|
} else {
|
|
7334
|
-
|
|
7670
|
+
logger11.error("Worker inference failed", { error: errMsg, backend: "wasm" });
|
|
7335
7671
|
}
|
|
7336
7672
|
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
7337
7673
|
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
@@ -7369,38 +7705,38 @@ var Wav2ArkitCpuWorker = class {
|
|
|
7369
7705
|
};
|
|
7370
7706
|
|
|
7371
7707
|
// src/inference/createA2E.ts
|
|
7372
|
-
var
|
|
7708
|
+
var logger12 = createLogger("createA2E");
|
|
7373
7709
|
function createA2E(config) {
|
|
7374
7710
|
const mode = config.mode ?? "auto";
|
|
7375
7711
|
const fallbackOnError = config.fallbackOnError ?? true;
|
|
7376
7712
|
let useCpu;
|
|
7377
7713
|
if (mode === "cpu") {
|
|
7378
7714
|
useCpu = true;
|
|
7379
|
-
|
|
7715
|
+
logger12.info("Forcing CPU A2E model (wav2arkit_cpu)");
|
|
7380
7716
|
} else if (mode === "gpu") {
|
|
7381
7717
|
useCpu = false;
|
|
7382
|
-
|
|
7718
|
+
logger12.info("Forcing GPU A2E model (Wav2Vec2)");
|
|
7383
7719
|
} else {
|
|
7384
7720
|
useCpu = shouldUseCpuA2E();
|
|
7385
|
-
|
|
7721
|
+
logger12.info("Auto-detected A2E model", {
|
|
7386
7722
|
useCpu,
|
|
7387
7723
|
isSafari: isSafari()
|
|
7388
7724
|
});
|
|
7389
7725
|
}
|
|
7390
7726
|
if (useCpu) {
|
|
7391
7727
|
if (config.unifiedWorker) {
|
|
7392
|
-
|
|
7728
|
+
logger12.info("Creating Wav2ArkitCpuUnifiedAdapter (404MB, WASM, shared unified worker)");
|
|
7393
7729
|
return new Wav2ArkitCpuUnifiedAdapter(config.unifiedWorker, {
|
|
7394
7730
|
modelUrl: config.cpuModelUrl
|
|
7395
7731
|
});
|
|
7396
7732
|
}
|
|
7397
7733
|
if (config.useWorker && Wav2ArkitCpuWorker.isSupported() && !isIOS()) {
|
|
7398
|
-
|
|
7734
|
+
logger12.info("Creating Wav2ArkitCpuWorker (404MB, WASM, off-main-thread)");
|
|
7399
7735
|
return new Wav2ArkitCpuWorker({
|
|
7400
7736
|
modelUrl: config.cpuModelUrl
|
|
7401
7737
|
});
|
|
7402
7738
|
}
|
|
7403
|
-
|
|
7739
|
+
logger12.info("Creating Wav2ArkitCpuInference (404MB, WASM)");
|
|
7404
7740
|
return new Wav2ArkitCpuInference({
|
|
7405
7741
|
modelUrl: config.cpuModelUrl
|
|
7406
7742
|
});
|
|
@@ -7412,10 +7748,10 @@ function createA2E(config) {
|
|
|
7412
7748
|
numIdentityClasses: config.numIdentityClasses
|
|
7413
7749
|
});
|
|
7414
7750
|
if (fallbackOnError) {
|
|
7415
|
-
|
|
7751
|
+
logger12.info("Creating Wav2Vec2Inference with CPU fallback");
|
|
7416
7752
|
return new A2EWithFallback(gpuInstance, config);
|
|
7417
7753
|
}
|
|
7418
|
-
|
|
7754
|
+
logger12.info("Creating Wav2Vec2Inference (no fallback)");
|
|
7419
7755
|
return gpuInstance;
|
|
7420
7756
|
}
|
|
7421
7757
|
var A2EWithFallback = class {
|
|
@@ -7444,7 +7780,7 @@ var A2EWithFallback = class {
|
|
|
7444
7780
|
}
|
|
7445
7781
|
}
|
|
7446
7782
|
async fallbackToCpu(reason) {
|
|
7447
|
-
|
|
7783
|
+
logger12.warn("GPU model load failed, falling back to CPU model", { reason });
|
|
7448
7784
|
try {
|
|
7449
7785
|
await this.implementation.dispose();
|
|
7450
7786
|
} catch {
|
|
@@ -7453,17 +7789,17 @@ var A2EWithFallback = class {
|
|
|
7453
7789
|
this.implementation = new Wav2ArkitCpuUnifiedAdapter(this.config.unifiedWorker, {
|
|
7454
7790
|
modelUrl: this.config.cpuModelUrl
|
|
7455
7791
|
});
|
|
7456
|
-
|
|
7792
|
+
logger12.info("Fallback to Wav2ArkitCpuUnifiedAdapter successful");
|
|
7457
7793
|
} else if (this.config.useWorker && Wav2ArkitCpuWorker.isSupported() && !isIOS()) {
|
|
7458
7794
|
this.implementation = new Wav2ArkitCpuWorker({
|
|
7459
7795
|
modelUrl: this.config.cpuModelUrl
|
|
7460
7796
|
});
|
|
7461
|
-
|
|
7797
|
+
logger12.info("Fallback to Wav2ArkitCpuWorker successful");
|
|
7462
7798
|
} else {
|
|
7463
7799
|
this.implementation = new Wav2ArkitCpuInference({
|
|
7464
7800
|
modelUrl: this.config.cpuModelUrl
|
|
7465
7801
|
});
|
|
7466
|
-
|
|
7802
|
+
logger12.info("Fallback to Wav2ArkitCpuInference successful");
|
|
7467
7803
|
}
|
|
7468
7804
|
this.hasFallenBack = true;
|
|
7469
7805
|
return await this.implementation.load();
|
|
@@ -7667,7 +8003,7 @@ var EmphasisDetector = class {
|
|
|
7667
8003
|
};
|
|
7668
8004
|
|
|
7669
8005
|
// src/inference/SileroVADInference.ts
|
|
7670
|
-
var
|
|
8006
|
+
var logger13 = createLogger("SileroVAD");
|
|
7671
8007
|
var SileroVADInference = class {
|
|
7672
8008
|
constructor(config) {
|
|
7673
8009
|
this.session = null;
|
|
@@ -7741,23 +8077,23 @@ var SileroVADInference = class {
|
|
|
7741
8077
|
"model.sample_rate": this.config.sampleRate
|
|
7742
8078
|
});
|
|
7743
8079
|
try {
|
|
7744
|
-
|
|
8080
|
+
logger13.info("Loading ONNX Runtime...", { preference: this.config.backend });
|
|
7745
8081
|
const { ort, backend } = await getOnnxRuntimeForPreference(this.config.backend);
|
|
7746
8082
|
this.ort = ort;
|
|
7747
8083
|
this._backend = backend;
|
|
7748
|
-
|
|
8084
|
+
logger13.info("ONNX Runtime loaded", { backend: this._backend });
|
|
7749
8085
|
const cache = getModelCache();
|
|
7750
8086
|
const modelUrl = this.config.modelUrl;
|
|
7751
8087
|
const isCached = await cache.has(modelUrl);
|
|
7752
8088
|
let modelBuffer;
|
|
7753
8089
|
if (isCached) {
|
|
7754
|
-
|
|
8090
|
+
logger13.debug("Loading model from cache", { modelUrl });
|
|
7755
8091
|
modelBuffer = await cache.get(modelUrl);
|
|
7756
8092
|
} else {
|
|
7757
|
-
|
|
8093
|
+
logger13.debug("Fetching and caching model", { modelUrl });
|
|
7758
8094
|
modelBuffer = await fetchWithCache(modelUrl);
|
|
7759
8095
|
}
|
|
7760
|
-
|
|
8096
|
+
logger13.debug("Creating ONNX session", {
|
|
7761
8097
|
size: formatBytes(modelBuffer.byteLength),
|
|
7762
8098
|
backend: this._backend
|
|
7763
8099
|
});
|
|
@@ -7766,7 +8102,7 @@ var SileroVADInference = class {
|
|
|
7766
8102
|
this.session = await ort.InferenceSession.create(modelData, sessionOptions);
|
|
7767
8103
|
this.reset();
|
|
7768
8104
|
const loadTimeMs = performance.now() - startTime;
|
|
7769
|
-
|
|
8105
|
+
logger13.info("Model loaded successfully", {
|
|
7770
8106
|
backend: this._backend,
|
|
7771
8107
|
loadTimeMs: Math.round(loadTimeMs),
|
|
7772
8108
|
sampleRate: this.config.sampleRate,
|
|
@@ -7821,7 +8157,7 @@ var SileroVADInference = class {
|
|
|
7821
8157
|
[]
|
|
7822
8158
|
);
|
|
7823
8159
|
} catch (e) {
|
|
7824
|
-
|
|
8160
|
+
logger13.warn("BigInt64Array not available, using bigint array fallback", {
|
|
7825
8161
|
error: e instanceof Error ? e.message : String(e)
|
|
7826
8162
|
});
|
|
7827
8163
|
this.srTensor = new this.ort.Tensor(
|
|
@@ -7927,7 +8263,7 @@ var SileroVADInference = class {
|
|
|
7927
8263
|
this.preSpeechBuffer.shift();
|
|
7928
8264
|
}
|
|
7929
8265
|
}
|
|
7930
|
-
|
|
8266
|
+
logger13.trace("Skipping VAD inference - audio too quiet", {
|
|
7931
8267
|
rms: Math.round(rms * 1e4) / 1e4,
|
|
7932
8268
|
threshold: MIN_ENERGY_THRESHOLD
|
|
7933
8269
|
});
|
|
@@ -7981,7 +8317,7 @@ var SileroVADInference = class {
|
|
|
7981
8317
|
if (isSpeech && !this.wasSpeaking) {
|
|
7982
8318
|
preSpeechChunks = [...this.preSpeechBuffer];
|
|
7983
8319
|
this.preSpeechBuffer = [];
|
|
7984
|
-
|
|
8320
|
+
logger13.debug("Speech started with pre-speech buffer", {
|
|
7985
8321
|
preSpeechChunks: preSpeechChunks.length,
|
|
7986
8322
|
durationMs: Math.round(preSpeechChunks.length * this.getChunkDurationMs())
|
|
7987
8323
|
});
|
|
@@ -7994,7 +8330,7 @@ var SileroVADInference = class {
|
|
|
7994
8330
|
this.preSpeechBuffer = [];
|
|
7995
8331
|
}
|
|
7996
8332
|
this.wasSpeaking = isSpeech;
|
|
7997
|
-
|
|
8333
|
+
logger13.trace("VAD inference completed", {
|
|
7998
8334
|
probability: Math.round(probability * 1e3) / 1e3,
|
|
7999
8335
|
isSpeech,
|
|
8000
8336
|
inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100
|
|
@@ -8025,7 +8361,7 @@ var SileroVADInference = class {
|
|
|
8025
8361
|
const oomError = new Error(
|
|
8026
8362
|
`SileroVAD inference failed with raw C++ exception pointer (0x${err.toString(16)}). This is likely an OOM crash in WASM. Try reducing concurrent model sessions or reloading the page.`
|
|
8027
8363
|
);
|
|
8028
|
-
|
|
8364
|
+
logger13.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
|
|
8029
8365
|
pointer: `0x${err.toString(16)}`,
|
|
8030
8366
|
backend: this._backend
|
|
8031
8367
|
});
|
|
@@ -8068,9 +8404,9 @@ var SileroVADInference = class {
|
|
|
8068
8404
|
SileroVADInference.isWebGPUAvailable = isWebGPUAvailable;
|
|
8069
8405
|
|
|
8070
8406
|
// src/inference/SileroVADWorker.ts
|
|
8071
|
-
var
|
|
8407
|
+
var logger14 = createLogger("SileroVADWorker");
|
|
8072
8408
|
var WASM_CDN_PATH5 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
|
|
8073
|
-
var LOAD_TIMEOUT_MS3 =
|
|
8409
|
+
var LOAD_TIMEOUT_MS3 = 12e4;
|
|
8074
8410
|
var INFERENCE_TIMEOUT_MS3 = 1e3;
|
|
8075
8411
|
function resolveUrl4(url) {
|
|
8076
8412
|
if (/^https?:\/\//i.test(url) || /^blob:/i.test(url)) return url;
|
|
@@ -8353,7 +8689,7 @@ var SileroVADWorker = class {
|
|
|
8353
8689
|
this.handleWorkerMessage(event.data);
|
|
8354
8690
|
};
|
|
8355
8691
|
worker.onerror = (error) => {
|
|
8356
|
-
|
|
8692
|
+
logger14.error("Worker error", { error: error.message });
|
|
8357
8693
|
for (const [, resolver] of this.pendingResolvers) {
|
|
8358
8694
|
resolver.reject(new Error(`Worker error: ${error.message}`));
|
|
8359
8695
|
}
|
|
@@ -8429,9 +8765,9 @@ var SileroVADWorker = class {
|
|
|
8429
8765
|
"model.sample_rate": this.config.sampleRate
|
|
8430
8766
|
});
|
|
8431
8767
|
try {
|
|
8432
|
-
|
|
8768
|
+
logger14.info("Creating VAD worker...");
|
|
8433
8769
|
this.worker = this.createWorker();
|
|
8434
|
-
|
|
8770
|
+
logger14.info("Loading model in worker...", {
|
|
8435
8771
|
modelUrl: this.config.modelUrl,
|
|
8436
8772
|
sampleRate: this.config.sampleRate
|
|
8437
8773
|
});
|
|
@@ -8447,7 +8783,7 @@ var SileroVADWorker = class {
|
|
|
8447
8783
|
);
|
|
8448
8784
|
this._isLoaded = true;
|
|
8449
8785
|
const loadTimeMs = performance.now() - startTime;
|
|
8450
|
-
|
|
8786
|
+
logger14.info("VAD worker loaded successfully", {
|
|
8451
8787
|
backend: "wasm",
|
|
8452
8788
|
loadTimeMs: Math.round(loadTimeMs),
|
|
8453
8789
|
workerLoadTimeMs: Math.round(result.loadTimeMs),
|
|
@@ -8554,7 +8890,7 @@ var SileroVADWorker = class {
|
|
|
8554
8890
|
if (isSpeech && !this.wasSpeaking) {
|
|
8555
8891
|
preSpeechChunks = [...this.preSpeechBuffer];
|
|
8556
8892
|
this.preSpeechBuffer = [];
|
|
8557
|
-
|
|
8893
|
+
logger14.debug("Speech started with pre-speech buffer", {
|
|
8558
8894
|
preSpeechChunks: preSpeechChunks.length,
|
|
8559
8895
|
durationMs: Math.round(preSpeechChunks.length * this.getChunkDurationMs())
|
|
8560
8896
|
});
|
|
@@ -8567,7 +8903,7 @@ var SileroVADWorker = class {
|
|
|
8567
8903
|
this.preSpeechBuffer = [];
|
|
8568
8904
|
}
|
|
8569
8905
|
this.wasSpeaking = isSpeech;
|
|
8570
|
-
|
|
8906
|
+
logger14.trace("VAD worker inference completed", {
|
|
8571
8907
|
probability: Math.round(result.probability * 1e3) / 1e3,
|
|
8572
8908
|
isSpeech,
|
|
8573
8909
|
inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
|
|
@@ -8635,44 +8971,44 @@ var SileroVADWorker = class {
|
|
|
8635
8971
|
};
|
|
8636
8972
|
|
|
8637
8973
|
// src/inference/createSileroVAD.ts
|
|
8638
|
-
var
|
|
8974
|
+
var logger15 = createLogger("createSileroVAD");
|
|
8639
8975
|
function supportsVADWorker() {
|
|
8640
8976
|
if (typeof Worker === "undefined") {
|
|
8641
|
-
|
|
8977
|
+
logger15.debug("Worker not supported: Worker constructor undefined");
|
|
8642
8978
|
return false;
|
|
8643
8979
|
}
|
|
8644
8980
|
if (typeof URL === "undefined" || typeof URL.createObjectURL === "undefined") {
|
|
8645
|
-
|
|
8981
|
+
logger15.debug("Worker not supported: URL.createObjectURL unavailable");
|
|
8646
8982
|
return false;
|
|
8647
8983
|
}
|
|
8648
8984
|
if (typeof Blob === "undefined") {
|
|
8649
|
-
|
|
8985
|
+
logger15.debug("Worker not supported: Blob constructor unavailable");
|
|
8650
8986
|
return false;
|
|
8651
8987
|
}
|
|
8652
8988
|
return true;
|
|
8653
8989
|
}
|
|
8654
8990
|
function createSileroVAD(config) {
|
|
8655
8991
|
if (config.unifiedWorker) {
|
|
8656
|
-
|
|
8992
|
+
logger15.info("Creating SileroVADUnifiedAdapter (shared unified worker)");
|
|
8657
8993
|
return new SileroVADUnifiedAdapter(config.unifiedWorker, config);
|
|
8658
8994
|
}
|
|
8659
8995
|
const fallbackOnError = config.fallbackOnError ?? true;
|
|
8660
8996
|
let useWorker;
|
|
8661
8997
|
if (config.useWorker !== void 0) {
|
|
8662
8998
|
useWorker = config.useWorker;
|
|
8663
|
-
|
|
8999
|
+
logger15.debug("Worker preference explicitly set", { useWorker });
|
|
8664
9000
|
} else {
|
|
8665
9001
|
const workerSupported = supportsVADWorker();
|
|
8666
9002
|
const onMobile = isMobile();
|
|
8667
9003
|
useWorker = workerSupported && !onMobile;
|
|
8668
|
-
|
|
9004
|
+
logger15.debug("Auto-detected Worker preference", {
|
|
8669
9005
|
useWorker,
|
|
8670
9006
|
workerSupported,
|
|
8671
9007
|
onMobile
|
|
8672
9008
|
});
|
|
8673
9009
|
}
|
|
8674
9010
|
if (useWorker) {
|
|
8675
|
-
|
|
9011
|
+
logger15.info("Creating SileroVADWorker (off-main-thread)");
|
|
8676
9012
|
const worker = new SileroVADWorker({
|
|
8677
9013
|
modelUrl: config.modelUrl,
|
|
8678
9014
|
sampleRate: config.sampleRate,
|
|
@@ -8684,7 +9020,7 @@ function createSileroVAD(config) {
|
|
|
8684
9020
|
}
|
|
8685
9021
|
return worker;
|
|
8686
9022
|
}
|
|
8687
|
-
|
|
9023
|
+
logger15.info("Creating SileroVADInference (main thread)");
|
|
8688
9024
|
return new SileroVADInference(config);
|
|
8689
9025
|
}
|
|
8690
9026
|
var VADWorkerWithFallback = class {
|
|
@@ -8710,7 +9046,7 @@ var VADWorkerWithFallback = class {
|
|
|
8710
9046
|
try {
|
|
8711
9047
|
return await this.implementation.load();
|
|
8712
9048
|
} catch (error) {
|
|
8713
|
-
|
|
9049
|
+
logger15.warn("Worker load failed, falling back to main thread", {
|
|
8714
9050
|
error: error instanceof Error ? error.message : String(error)
|
|
8715
9051
|
});
|
|
8716
9052
|
try {
|
|
@@ -8719,7 +9055,7 @@ var VADWorkerWithFallback = class {
|
|
|
8719
9055
|
}
|
|
8720
9056
|
this.implementation = new SileroVADInference(this.config);
|
|
8721
9057
|
this.hasFallenBack = true;
|
|
8722
|
-
|
|
9058
|
+
logger15.info("Fallback to SileroVADInference successful");
|
|
8723
9059
|
return await this.implementation.load();
|
|
8724
9060
|
}
|
|
8725
9061
|
}
|
|
@@ -8741,7 +9077,7 @@ var VADWorkerWithFallback = class {
|
|
|
8741
9077
|
};
|
|
8742
9078
|
|
|
8743
9079
|
// src/inference/A2EOrchestrator.ts
|
|
8744
|
-
var
|
|
9080
|
+
var logger16 = createLogger("A2EOrchestrator");
|
|
8745
9081
|
var A2EOrchestrator = class {
|
|
8746
9082
|
constructor(config) {
|
|
8747
9083
|
this.a2e = null;
|
|
@@ -8782,7 +9118,7 @@ var A2EOrchestrator = class {
|
|
|
8782
9118
|
*/
|
|
8783
9119
|
async load() {
|
|
8784
9120
|
if (this.disposed) throw new Error("A2EOrchestrator has been disposed");
|
|
8785
|
-
|
|
9121
|
+
logger16.info("Loading A2E model...");
|
|
8786
9122
|
this.a2e = createA2E({
|
|
8787
9123
|
gpuModelUrl: this.config.gpuModelUrl,
|
|
8788
9124
|
gpuExternalDataUrl: this.config.gpuExternalDataUrl,
|
|
@@ -8799,7 +9135,7 @@ var A2EOrchestrator = class {
|
|
|
8799
9135
|
onError: this.config.onError
|
|
8800
9136
|
});
|
|
8801
9137
|
this._isReady = true;
|
|
8802
|
-
|
|
9138
|
+
logger16.info("A2E model loaded", {
|
|
8803
9139
|
backend: info.backend,
|
|
8804
9140
|
loadTimeMs: info.loadTimeMs,
|
|
8805
9141
|
modelId: this.a2e.modelId
|
|
@@ -8854,10 +9190,10 @@ var A2EOrchestrator = class {
|
|
|
8854
9190
|
this.scriptProcessor.connect(this.audioContext.destination);
|
|
8855
9191
|
this._isStreaming = true;
|
|
8856
9192
|
this.processor.startDrip();
|
|
8857
|
-
|
|
9193
|
+
logger16.info("Mic capture started", { sampleRate: this.nativeSampleRate });
|
|
8858
9194
|
} catch (err) {
|
|
8859
9195
|
const error = err instanceof Error ? err : new Error(String(err));
|
|
8860
|
-
|
|
9196
|
+
logger16.error("Failed to start mic capture", { error: error.message });
|
|
8861
9197
|
this.config.onError?.(error);
|
|
8862
9198
|
throw error;
|
|
8863
9199
|
}
|
|
@@ -8885,7 +9221,7 @@ var A2EOrchestrator = class {
|
|
|
8885
9221
|
});
|
|
8886
9222
|
this.audioContext = null;
|
|
8887
9223
|
}
|
|
8888
|
-
|
|
9224
|
+
logger16.info("Mic capture stopped");
|
|
8889
9225
|
}
|
|
8890
9226
|
/**
|
|
8891
9227
|
* Dispose of all resources
|
|
@@ -8908,7 +9244,7 @@ var A2EOrchestrator = class {
|
|
|
8908
9244
|
};
|
|
8909
9245
|
|
|
8910
9246
|
// src/inference/SafariSpeechRecognition.ts
|
|
8911
|
-
var
|
|
9247
|
+
var logger17 = createLogger("SafariSpeech");
|
|
8912
9248
|
var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
8913
9249
|
constructor(config = {}) {
|
|
8914
9250
|
this.recognition = null;
|
|
@@ -8927,7 +9263,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
8927
9263
|
interimResults: config.interimResults ?? true,
|
|
8928
9264
|
maxAlternatives: config.maxAlternatives ?? 1
|
|
8929
9265
|
};
|
|
8930
|
-
|
|
9266
|
+
logger17.debug("SafariSpeechRecognition created", {
|
|
8931
9267
|
language: this.config.language,
|
|
8932
9268
|
continuous: this.config.continuous
|
|
8933
9269
|
});
|
|
@@ -8988,7 +9324,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
8988
9324
|
*/
|
|
8989
9325
|
async start() {
|
|
8990
9326
|
if (this.isListening) {
|
|
8991
|
-
|
|
9327
|
+
logger17.warn("Already listening");
|
|
8992
9328
|
return;
|
|
8993
9329
|
}
|
|
8994
9330
|
if (!_SafariSpeechRecognition.isAvailable()) {
|
|
@@ -9018,7 +9354,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
9018
9354
|
this.isListening = true;
|
|
9019
9355
|
this.startTime = performance.now();
|
|
9020
9356
|
this.accumulatedText = "";
|
|
9021
|
-
|
|
9357
|
+
logger17.info("Speech recognition started", {
|
|
9022
9358
|
language: this.config.language
|
|
9023
9359
|
});
|
|
9024
9360
|
span?.end();
|
|
@@ -9033,7 +9369,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
9033
9369
|
*/
|
|
9034
9370
|
async stop() {
|
|
9035
9371
|
if (!this.isListening || !this.recognition) {
|
|
9036
|
-
|
|
9372
|
+
logger17.warn("Not currently listening");
|
|
9037
9373
|
return {
|
|
9038
9374
|
text: this.accumulatedText,
|
|
9039
9375
|
language: this.config.language,
|
|
@@ -9062,7 +9398,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
9062
9398
|
if (this.recognition && this.isListening) {
|
|
9063
9399
|
this.recognition.abort();
|
|
9064
9400
|
this.isListening = false;
|
|
9065
|
-
|
|
9401
|
+
logger17.info("Speech recognition aborted");
|
|
9066
9402
|
}
|
|
9067
9403
|
}
|
|
9068
9404
|
/**
|
|
@@ -9093,7 +9429,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
9093
9429
|
this.isListening = false;
|
|
9094
9430
|
this.resultCallbacks = [];
|
|
9095
9431
|
this.errorCallbacks = [];
|
|
9096
|
-
|
|
9432
|
+
logger17.debug("SafariSpeechRecognition disposed");
|
|
9097
9433
|
}
|
|
9098
9434
|
/**
|
|
9099
9435
|
* Set up event handlers for the recognition instance
|
|
@@ -9121,7 +9457,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
9121
9457
|
confidence: alternative.confidence
|
|
9122
9458
|
};
|
|
9123
9459
|
this.emitResult(speechResult);
|
|
9124
|
-
|
|
9460
|
+
logger17.trace("Speech result", {
|
|
9125
9461
|
text: text.substring(0, 50),
|
|
9126
9462
|
isFinal,
|
|
9127
9463
|
confidence: alternative.confidence
|
|
@@ -9131,12 +9467,12 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
9131
9467
|
span?.end();
|
|
9132
9468
|
} catch (error) {
|
|
9133
9469
|
span?.endWithError(error instanceof Error ? error : new Error(String(error)));
|
|
9134
|
-
|
|
9470
|
+
logger17.error("Error processing speech result", { error });
|
|
9135
9471
|
}
|
|
9136
9472
|
};
|
|
9137
9473
|
this.recognition.onerror = (event) => {
|
|
9138
9474
|
const error = new Error(`Speech recognition error: ${event.error} - ${event.message}`);
|
|
9139
|
-
|
|
9475
|
+
logger17.error("Speech recognition error", { error: event.error, message: event.message });
|
|
9140
9476
|
this.emitError(error);
|
|
9141
9477
|
if (this.stopRejecter) {
|
|
9142
9478
|
this.stopRejecter(error);
|
|
@@ -9146,7 +9482,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
9146
9482
|
};
|
|
9147
9483
|
this.recognition.onend = () => {
|
|
9148
9484
|
this.isListening = false;
|
|
9149
|
-
|
|
9485
|
+
logger17.info("Speech recognition ended", {
|
|
9150
9486
|
totalText: this.accumulatedText.length,
|
|
9151
9487
|
durationMs: performance.now() - this.startTime
|
|
9152
9488
|
});
|
|
@@ -9163,13 +9499,13 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
9163
9499
|
}
|
|
9164
9500
|
};
|
|
9165
9501
|
this.recognition.onstart = () => {
|
|
9166
|
-
|
|
9502
|
+
logger17.debug("Speech recognition started by browser");
|
|
9167
9503
|
};
|
|
9168
9504
|
this.recognition.onspeechstart = () => {
|
|
9169
|
-
|
|
9505
|
+
logger17.debug("Speech detected");
|
|
9170
9506
|
};
|
|
9171
9507
|
this.recognition.onspeechend = () => {
|
|
9172
|
-
|
|
9508
|
+
logger17.debug("Speech ended");
|
|
9173
9509
|
};
|
|
9174
9510
|
}
|
|
9175
9511
|
/**
|
|
@@ -9180,7 +9516,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
9180
9516
|
try {
|
|
9181
9517
|
callback(result);
|
|
9182
9518
|
} catch (error) {
|
|
9183
|
-
|
|
9519
|
+
logger17.error("Error in result callback", { error });
|
|
9184
9520
|
}
|
|
9185
9521
|
}
|
|
9186
9522
|
}
|
|
@@ -9192,7 +9528,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
9192
9528
|
try {
|
|
9193
9529
|
callback(error);
|
|
9194
9530
|
} catch (callbackError) {
|
|
9195
|
-
|
|
9531
|
+
logger17.error("Error in error callback", { error: callbackError });
|
|
9196
9532
|
}
|
|
9197
9533
|
}
|
|
9198
9534
|
}
|
|
@@ -9762,327 +10098,9 @@ var AnimationGraph = class extends EventEmitter {
|
|
|
9762
10098
|
}
|
|
9763
10099
|
};
|
|
9764
10100
|
|
|
9765
|
-
// src/animation/simplex2d.ts
|
|
9766
|
-
var perm = new Uint8Array(512);
|
|
9767
|
-
var grad2 = [
|
|
9768
|
-
[1, 1],
|
|
9769
|
-
[-1, 1],
|
|
9770
|
-
[1, -1],
|
|
9771
|
-
[-1, -1],
|
|
9772
|
-
[1, 0],
|
|
9773
|
-
[-1, 0],
|
|
9774
|
-
[0, 1],
|
|
9775
|
-
[0, -1]
|
|
9776
|
-
];
|
|
9777
|
-
var p = [
|
|
9778
|
-
151,
|
|
9779
|
-
160,
|
|
9780
|
-
137,
|
|
9781
|
-
91,
|
|
9782
|
-
90,
|
|
9783
|
-
15,
|
|
9784
|
-
131,
|
|
9785
|
-
13,
|
|
9786
|
-
201,
|
|
9787
|
-
95,
|
|
9788
|
-
96,
|
|
9789
|
-
53,
|
|
9790
|
-
194,
|
|
9791
|
-
233,
|
|
9792
|
-
7,
|
|
9793
|
-
225,
|
|
9794
|
-
140,
|
|
9795
|
-
36,
|
|
9796
|
-
103,
|
|
9797
|
-
30,
|
|
9798
|
-
69,
|
|
9799
|
-
142,
|
|
9800
|
-
8,
|
|
9801
|
-
99,
|
|
9802
|
-
37,
|
|
9803
|
-
240,
|
|
9804
|
-
21,
|
|
9805
|
-
10,
|
|
9806
|
-
23,
|
|
9807
|
-
190,
|
|
9808
|
-
6,
|
|
9809
|
-
148,
|
|
9810
|
-
247,
|
|
9811
|
-
120,
|
|
9812
|
-
234,
|
|
9813
|
-
75,
|
|
9814
|
-
0,
|
|
9815
|
-
26,
|
|
9816
|
-
197,
|
|
9817
|
-
62,
|
|
9818
|
-
94,
|
|
9819
|
-
252,
|
|
9820
|
-
219,
|
|
9821
|
-
203,
|
|
9822
|
-
117,
|
|
9823
|
-
35,
|
|
9824
|
-
11,
|
|
9825
|
-
32,
|
|
9826
|
-
57,
|
|
9827
|
-
177,
|
|
9828
|
-
33,
|
|
9829
|
-
88,
|
|
9830
|
-
237,
|
|
9831
|
-
149,
|
|
9832
|
-
56,
|
|
9833
|
-
87,
|
|
9834
|
-
174,
|
|
9835
|
-
20,
|
|
9836
|
-
125,
|
|
9837
|
-
136,
|
|
9838
|
-
171,
|
|
9839
|
-
168,
|
|
9840
|
-
68,
|
|
9841
|
-
175,
|
|
9842
|
-
74,
|
|
9843
|
-
165,
|
|
9844
|
-
71,
|
|
9845
|
-
134,
|
|
9846
|
-
139,
|
|
9847
|
-
48,
|
|
9848
|
-
27,
|
|
9849
|
-
166,
|
|
9850
|
-
77,
|
|
9851
|
-
146,
|
|
9852
|
-
158,
|
|
9853
|
-
231,
|
|
9854
|
-
83,
|
|
9855
|
-
111,
|
|
9856
|
-
229,
|
|
9857
|
-
122,
|
|
9858
|
-
60,
|
|
9859
|
-
211,
|
|
9860
|
-
133,
|
|
9861
|
-
230,
|
|
9862
|
-
220,
|
|
9863
|
-
105,
|
|
9864
|
-
92,
|
|
9865
|
-
41,
|
|
9866
|
-
55,
|
|
9867
|
-
46,
|
|
9868
|
-
245,
|
|
9869
|
-
40,
|
|
9870
|
-
244,
|
|
9871
|
-
102,
|
|
9872
|
-
143,
|
|
9873
|
-
54,
|
|
9874
|
-
65,
|
|
9875
|
-
25,
|
|
9876
|
-
63,
|
|
9877
|
-
161,
|
|
9878
|
-
1,
|
|
9879
|
-
216,
|
|
9880
|
-
80,
|
|
9881
|
-
73,
|
|
9882
|
-
209,
|
|
9883
|
-
76,
|
|
9884
|
-
132,
|
|
9885
|
-
187,
|
|
9886
|
-
208,
|
|
9887
|
-
89,
|
|
9888
|
-
18,
|
|
9889
|
-
169,
|
|
9890
|
-
200,
|
|
9891
|
-
196,
|
|
9892
|
-
135,
|
|
9893
|
-
130,
|
|
9894
|
-
116,
|
|
9895
|
-
188,
|
|
9896
|
-
159,
|
|
9897
|
-
86,
|
|
9898
|
-
164,
|
|
9899
|
-
100,
|
|
9900
|
-
109,
|
|
9901
|
-
198,
|
|
9902
|
-
173,
|
|
9903
|
-
186,
|
|
9904
|
-
3,
|
|
9905
|
-
64,
|
|
9906
|
-
52,
|
|
9907
|
-
217,
|
|
9908
|
-
226,
|
|
9909
|
-
250,
|
|
9910
|
-
124,
|
|
9911
|
-
123,
|
|
9912
|
-
5,
|
|
9913
|
-
202,
|
|
9914
|
-
38,
|
|
9915
|
-
147,
|
|
9916
|
-
118,
|
|
9917
|
-
126,
|
|
9918
|
-
255,
|
|
9919
|
-
82,
|
|
9920
|
-
85,
|
|
9921
|
-
212,
|
|
9922
|
-
207,
|
|
9923
|
-
206,
|
|
9924
|
-
59,
|
|
9925
|
-
227,
|
|
9926
|
-
47,
|
|
9927
|
-
16,
|
|
9928
|
-
58,
|
|
9929
|
-
17,
|
|
9930
|
-
182,
|
|
9931
|
-
189,
|
|
9932
|
-
28,
|
|
9933
|
-
42,
|
|
9934
|
-
223,
|
|
9935
|
-
183,
|
|
9936
|
-
170,
|
|
9937
|
-
213,
|
|
9938
|
-
119,
|
|
9939
|
-
248,
|
|
9940
|
-
152,
|
|
9941
|
-
2,
|
|
9942
|
-
44,
|
|
9943
|
-
154,
|
|
9944
|
-
163,
|
|
9945
|
-
70,
|
|
9946
|
-
221,
|
|
9947
|
-
153,
|
|
9948
|
-
101,
|
|
9949
|
-
155,
|
|
9950
|
-
167,
|
|
9951
|
-
43,
|
|
9952
|
-
172,
|
|
9953
|
-
9,
|
|
9954
|
-
129,
|
|
9955
|
-
22,
|
|
9956
|
-
39,
|
|
9957
|
-
253,
|
|
9958
|
-
19,
|
|
9959
|
-
98,
|
|
9960
|
-
108,
|
|
9961
|
-
110,
|
|
9962
|
-
79,
|
|
9963
|
-
113,
|
|
9964
|
-
224,
|
|
9965
|
-
232,
|
|
9966
|
-
178,
|
|
9967
|
-
185,
|
|
9968
|
-
112,
|
|
9969
|
-
104,
|
|
9970
|
-
218,
|
|
9971
|
-
246,
|
|
9972
|
-
97,
|
|
9973
|
-
228,
|
|
9974
|
-
251,
|
|
9975
|
-
34,
|
|
9976
|
-
242,
|
|
9977
|
-
193,
|
|
9978
|
-
238,
|
|
9979
|
-
210,
|
|
9980
|
-
144,
|
|
9981
|
-
12,
|
|
9982
|
-
191,
|
|
9983
|
-
179,
|
|
9984
|
-
162,
|
|
9985
|
-
241,
|
|
9986
|
-
81,
|
|
9987
|
-
51,
|
|
9988
|
-
145,
|
|
9989
|
-
235,
|
|
9990
|
-
249,
|
|
9991
|
-
14,
|
|
9992
|
-
239,
|
|
9993
|
-
107,
|
|
9994
|
-
49,
|
|
9995
|
-
192,
|
|
9996
|
-
214,
|
|
9997
|
-
31,
|
|
9998
|
-
181,
|
|
9999
|
-
199,
|
|
10000
|
-
106,
|
|
10001
|
-
157,
|
|
10002
|
-
184,
|
|
10003
|
-
84,
|
|
10004
|
-
204,
|
|
10005
|
-
176,
|
|
10006
|
-
115,
|
|
10007
|
-
121,
|
|
10008
|
-
50,
|
|
10009
|
-
45,
|
|
10010
|
-
127,
|
|
10011
|
-
4,
|
|
10012
|
-
150,
|
|
10013
|
-
254,
|
|
10014
|
-
138,
|
|
10015
|
-
236,
|
|
10016
|
-
205,
|
|
10017
|
-
93,
|
|
10018
|
-
222,
|
|
10019
|
-
114,
|
|
10020
|
-
67,
|
|
10021
|
-
29,
|
|
10022
|
-
24,
|
|
10023
|
-
72,
|
|
10024
|
-
243,
|
|
10025
|
-
141,
|
|
10026
|
-
128,
|
|
10027
|
-
195,
|
|
10028
|
-
78,
|
|
10029
|
-
66,
|
|
10030
|
-
215,
|
|
10031
|
-
61,
|
|
10032
|
-
156,
|
|
10033
|
-
180
|
|
10034
|
-
];
|
|
10035
|
-
for (let i = 0; i < 256; i++) {
|
|
10036
|
-
perm[i] = p[i];
|
|
10037
|
-
perm[i + 256] = p[i];
|
|
10038
|
-
}
|
|
10039
|
-
var F2 = 0.5 * (Math.sqrt(3) - 1);
|
|
10040
|
-
var G2 = (3 - Math.sqrt(3)) / 6;
|
|
10041
|
-
function dot2(g, x, y) {
|
|
10042
|
-
return g[0] * x + g[1] * y;
|
|
10043
|
-
}
|
|
10044
|
-
function simplex2d(x, y) {
|
|
10045
|
-
const s = (x + y) * F2;
|
|
10046
|
-
const i = Math.floor(x + s);
|
|
10047
|
-
const j = Math.floor(y + s);
|
|
10048
|
-
const t = (i + j) * G2;
|
|
10049
|
-
const X0 = i - t;
|
|
10050
|
-
const Y0 = j - t;
|
|
10051
|
-
const x0 = x - X0;
|
|
10052
|
-
const y0 = y - Y0;
|
|
10053
|
-
const i1 = x0 > y0 ? 1 : 0;
|
|
10054
|
-
const j1 = x0 > y0 ? 0 : 1;
|
|
10055
|
-
const x1 = x0 - i1 + G2;
|
|
10056
|
-
const y1 = y0 - j1 + G2;
|
|
10057
|
-
const x2 = x0 - 1 + 2 * G2;
|
|
10058
|
-
const y2 = y0 - 1 + 2 * G2;
|
|
10059
|
-
const ii = i & 255;
|
|
10060
|
-
const jj = j & 255;
|
|
10061
|
-
const gi0 = perm[ii + perm[jj]] % 8;
|
|
10062
|
-
const gi1 = perm[ii + i1 + perm[jj + j1]] % 8;
|
|
10063
|
-
const gi2 = perm[ii + 1 + perm[jj + 1]] % 8;
|
|
10064
|
-
let n0 = 0;
|
|
10065
|
-
let t0 = 0.5 - x0 * x0 - y0 * y0;
|
|
10066
|
-
if (t0 >= 0) {
|
|
10067
|
-
t0 *= t0;
|
|
10068
|
-
n0 = t0 * t0 * dot2(grad2[gi0], x0, y0);
|
|
10069
|
-
}
|
|
10070
|
-
let n1 = 0;
|
|
10071
|
-
let t1 = 0.5 - x1 * x1 - y1 * y1;
|
|
10072
|
-
if (t1 >= 0) {
|
|
10073
|
-
t1 *= t1;
|
|
10074
|
-
n1 = t1 * t1 * dot2(grad2[gi1], x1, y1);
|
|
10075
|
-
}
|
|
10076
|
-
let n2 = 0;
|
|
10077
|
-
let t2 = 0.5 - x2 * x2 - y2 * y2;
|
|
10078
|
-
if (t2 >= 0) {
|
|
10079
|
-
t2 *= t2;
|
|
10080
|
-
n2 = t2 * t2 * dot2(grad2[gi2], x2, y2);
|
|
10081
|
-
}
|
|
10082
|
-
return 70 * (n0 + n1 + n2);
|
|
10083
|
-
}
|
|
10084
|
-
|
|
10085
10101
|
// src/animation/ProceduralLifeLayer.ts
|
|
10102
|
+
var import_simplex_noise = require("simplex-noise");
|
|
10103
|
+
var simplex2d = (0, import_simplex_noise.createNoise2D)();
|
|
10086
10104
|
var PHASE_OPEN = 0;
|
|
10087
10105
|
var PHASE_CLOSING = 1;
|
|
10088
10106
|
var PHASE_CLOSED = 2;
|
|
@@ -10390,6 +10408,684 @@ var ProceduralLifeLayer = class {
|
|
|
10390
10408
|
}
|
|
10391
10409
|
};
|
|
10392
10410
|
|
|
10411
|
+
// src/orchestration/MicLipSync.ts
|
|
10412
|
+
var logger18 = createLogger("MicLipSync");
|
|
10413
|
+
var MicLipSync = class extends EventEmitter {
|
|
10414
|
+
constructor(config) {
|
|
10415
|
+
super();
|
|
10416
|
+
this.omoteEvents = new EventEmitter();
|
|
10417
|
+
this._state = "idle";
|
|
10418
|
+
this._isSpeaking = false;
|
|
10419
|
+
this._currentFrame = null;
|
|
10420
|
+
this._currentRawFrame = null;
|
|
10421
|
+
// VAD state
|
|
10422
|
+
this.speechStartTime = 0;
|
|
10423
|
+
this.vadChunkSize = 0;
|
|
10424
|
+
this.vadBuffer = null;
|
|
10425
|
+
this.vadBufferOffset = 0;
|
|
10426
|
+
this.profile = config.profile ?? {};
|
|
10427
|
+
this.vad = config.vad;
|
|
10428
|
+
this.mic = new MicrophoneCapture(this.omoteEvents, {
|
|
10429
|
+
sampleRate: config.sampleRate ?? 16e3,
|
|
10430
|
+
chunkSize: config.micChunkSize ?? 512
|
|
10431
|
+
});
|
|
10432
|
+
this.processor = new A2EProcessor({
|
|
10433
|
+
backend: config.lam,
|
|
10434
|
+
sampleRate: config.sampleRate ?? 16e3,
|
|
10435
|
+
identityIndex: config.identityIndex,
|
|
10436
|
+
onFrame: (raw) => {
|
|
10437
|
+
const scaled = applyProfile(raw, this.profile);
|
|
10438
|
+
this._currentFrame = scaled;
|
|
10439
|
+
this._currentRawFrame = raw;
|
|
10440
|
+
this.emit("frame", { blendshapes: scaled, rawBlendshapes: raw });
|
|
10441
|
+
},
|
|
10442
|
+
onError: (error) => {
|
|
10443
|
+
logger18.error("A2E inference error", { message: error.message });
|
|
10444
|
+
this.emit("error", error);
|
|
10445
|
+
}
|
|
10446
|
+
});
|
|
10447
|
+
this.omoteEvents.on("audio.chunk", ({ pcm }) => {
|
|
10448
|
+
const float32 = int16ToFloat32(pcm);
|
|
10449
|
+
this.processor.pushAudio(float32);
|
|
10450
|
+
if (this.vad) {
|
|
10451
|
+
this.processVAD(float32);
|
|
10452
|
+
}
|
|
10453
|
+
});
|
|
10454
|
+
this.omoteEvents.on("audio.level", (level) => {
|
|
10455
|
+
this.emit("audio:level", level);
|
|
10456
|
+
});
|
|
10457
|
+
if (this.vad) {
|
|
10458
|
+
this.vadChunkSize = this.vad.getChunkSize();
|
|
10459
|
+
this.vadBuffer = new Float32Array(this.vadChunkSize);
|
|
10460
|
+
this.vadBufferOffset = 0;
|
|
10461
|
+
}
|
|
10462
|
+
}
|
|
10463
|
+
/** Current state */
|
|
10464
|
+
get state() {
|
|
10465
|
+
return this._state;
|
|
10466
|
+
}
|
|
10467
|
+
/** Latest blendshape frame (null before first inference) */
|
|
10468
|
+
get currentFrame() {
|
|
10469
|
+
return this._currentFrame;
|
|
10470
|
+
}
|
|
10471
|
+
/** Whether speech is currently detected (requires VAD) */
|
|
10472
|
+
get isSpeaking() {
|
|
10473
|
+
return this._isSpeaking;
|
|
10474
|
+
}
|
|
10475
|
+
/** Current backend type */
|
|
10476
|
+
get backend() {
|
|
10477
|
+
return this.processor ? "active" : null;
|
|
10478
|
+
}
|
|
10479
|
+
// ---------------------------------------------------------------------------
|
|
10480
|
+
// Public API
|
|
10481
|
+
// ---------------------------------------------------------------------------
|
|
10482
|
+
/** Start microphone capture and inference loop */
|
|
10483
|
+
async start() {
|
|
10484
|
+
if (this._state === "active") return;
|
|
10485
|
+
await this.mic.start();
|
|
10486
|
+
this.processor.startDrip();
|
|
10487
|
+
this.emit("mic:start", void 0);
|
|
10488
|
+
this.setState("active");
|
|
10489
|
+
}
|
|
10490
|
+
/** Stop microphone and inference */
|
|
10491
|
+
stop() {
|
|
10492
|
+
if (this._state === "idle") return;
|
|
10493
|
+
this.processor.stopDrip();
|
|
10494
|
+
this.mic.stop();
|
|
10495
|
+
this._isSpeaking = false;
|
|
10496
|
+
this.emit("mic:stop", void 0);
|
|
10497
|
+
this.setState("idle");
|
|
10498
|
+
}
|
|
10499
|
+
/** Pause inference (mic stays open for faster resume) */
|
|
10500
|
+
pause() {
|
|
10501
|
+
if (this._state !== "active") return;
|
|
10502
|
+
this.processor.stopDrip();
|
|
10503
|
+
this.setState("paused");
|
|
10504
|
+
}
|
|
10505
|
+
/** Resume inference after pause */
|
|
10506
|
+
resume() {
|
|
10507
|
+
if (this._state !== "paused") return;
|
|
10508
|
+
this.processor.startDrip();
|
|
10509
|
+
this.setState("active");
|
|
10510
|
+
}
|
|
10511
|
+
/** Update ExpressionProfile at runtime */
|
|
10512
|
+
setProfile(profile) {
|
|
10513
|
+
this.profile = profile;
|
|
10514
|
+
}
|
|
10515
|
+
/** Dispose of all resources */
|
|
10516
|
+
async dispose() {
|
|
10517
|
+
this.stop();
|
|
10518
|
+
this.processor.dispose();
|
|
10519
|
+
}
|
|
10520
|
+
// ---------------------------------------------------------------------------
|
|
10521
|
+
// Internal: VAD processing
|
|
10522
|
+
// ---------------------------------------------------------------------------
|
|
10523
|
+
async processVAD(samples) {
|
|
10524
|
+
if (!this.vad || !this.vadBuffer) return;
|
|
10525
|
+
for (let i = 0; i < samples.length; i++) {
|
|
10526
|
+
this.vadBuffer[this.vadBufferOffset++] = samples[i];
|
|
10527
|
+
if (this.vadBufferOffset >= this.vadChunkSize) {
|
|
10528
|
+
try {
|
|
10529
|
+
const result = await this.vad.process(this.vadBuffer);
|
|
10530
|
+
const wasSpeaking = this._isSpeaking;
|
|
10531
|
+
this._isSpeaking = result.isSpeech;
|
|
10532
|
+
if (!wasSpeaking && result.isSpeech) {
|
|
10533
|
+
this.speechStartTime = performance.now();
|
|
10534
|
+
this.emit("speech:start", void 0);
|
|
10535
|
+
} else if (wasSpeaking && !result.isSpeech) {
|
|
10536
|
+
const durationMs = performance.now() - this.speechStartTime;
|
|
10537
|
+
this.emit("speech:end", { durationMs });
|
|
10538
|
+
}
|
|
10539
|
+
} catch (err) {
|
|
10540
|
+
logger18.warn("VAD process error", { error: String(err) });
|
|
10541
|
+
}
|
|
10542
|
+
this.vadBufferOffset = 0;
|
|
10543
|
+
}
|
|
10544
|
+
}
|
|
10545
|
+
}
|
|
10546
|
+
// ---------------------------------------------------------------------------
|
|
10547
|
+
// Internal: State management
|
|
10548
|
+
// ---------------------------------------------------------------------------
|
|
10549
|
+
setState(state) {
|
|
10550
|
+
if (this._state === state) return;
|
|
10551
|
+
this._state = state;
|
|
10552
|
+
this.emit("state", state);
|
|
10553
|
+
}
|
|
10554
|
+
};
|
|
10555
|
+
|
|
10556
|
+
// src/orchestration/VoicePipeline.ts
|
|
10557
|
+
var logger19 = createLogger("VoicePipeline");
|
|
10558
|
+
var VoicePipeline = class extends EventEmitter {
|
|
10559
|
+
constructor(config) {
|
|
10560
|
+
super();
|
|
10561
|
+
// State
|
|
10562
|
+
this._state = "idle";
|
|
10563
|
+
this.stopped = false;
|
|
10564
|
+
this.epoch = 0;
|
|
10565
|
+
this._sessionId = null;
|
|
10566
|
+
// Models
|
|
10567
|
+
this.asr = null;
|
|
10568
|
+
this.lam = null;
|
|
10569
|
+
this.vad = null;
|
|
10570
|
+
this.unifiedWorker = null;
|
|
10571
|
+
// Pipelines
|
|
10572
|
+
this.playback = null;
|
|
10573
|
+
this.interruption = null;
|
|
10574
|
+
this.omoteEvents = new EventEmitter();
|
|
10575
|
+
this.mic = null;
|
|
10576
|
+
// Audio accumulation
|
|
10577
|
+
this.audioBuffer = [];
|
|
10578
|
+
this.audioBufferSamples = 0;
|
|
10579
|
+
this.speechStartTime = 0;
|
|
10580
|
+
this.silenceTimer = null;
|
|
10581
|
+
this.isSpeaking = false;
|
|
10582
|
+
// Progressive transcription
|
|
10583
|
+
this.progressiveTimer = null;
|
|
10584
|
+
this.progressivePromise = null;
|
|
10585
|
+
this.lastProgressiveResult = null;
|
|
10586
|
+
this.lastProgressiveSamples = 0;
|
|
10587
|
+
// ASR error recovery
|
|
10588
|
+
this.asrErrorCount = 0;
|
|
10589
|
+
// Response abort
|
|
10590
|
+
this.responseAbortController = null;
|
|
10591
|
+
// Frame refs
|
|
10592
|
+
this._currentFrame = null;
|
|
10593
|
+
this.config = config;
|
|
10594
|
+
}
|
|
10595
|
+
/** Current pipeline state */
|
|
10596
|
+
get state() {
|
|
10597
|
+
return this._state;
|
|
10598
|
+
}
|
|
10599
|
+
/** Latest blendshape frame */
|
|
10600
|
+
get currentFrame() {
|
|
10601
|
+
return this._currentFrame;
|
|
10602
|
+
}
|
|
10603
|
+
/** Whether user is currently speaking */
|
|
10604
|
+
get isSpeechActive() {
|
|
10605
|
+
return this.isSpeaking;
|
|
10606
|
+
}
|
|
10607
|
+
/** Session ID (generated on start(), null before) */
|
|
10608
|
+
get sessionId() {
|
|
10609
|
+
return this._sessionId;
|
|
10610
|
+
}
|
|
10611
|
+
// ---------------------------------------------------------------------------
|
|
10612
|
+
// Model loading
|
|
10613
|
+
// ---------------------------------------------------------------------------
|
|
10614
|
+
async loadModels() {
|
|
10615
|
+
this.setState("loading");
|
|
10616
|
+
const timeoutMs = this.config.lamLoadTimeoutMs ?? 3e4;
|
|
10617
|
+
try {
|
|
10618
|
+
if (isIOS()) {
|
|
10619
|
+
this.unifiedWorker = new UnifiedInferenceWorker();
|
|
10620
|
+
await this.unifiedWorker.init();
|
|
10621
|
+
}
|
|
10622
|
+
this.emitProgress("Speech recognition", 0, 3, 0);
|
|
10623
|
+
this.asr = createSenseVoice({
|
|
10624
|
+
modelUrl: this.config.models.senseVoice.modelUrl,
|
|
10625
|
+
tokensUrl: this.config.models.senseVoice.tokensUrl,
|
|
10626
|
+
language: this.config.models.senseVoice.language,
|
|
10627
|
+
unifiedWorker: this.unifiedWorker ?? void 0
|
|
10628
|
+
});
|
|
10629
|
+
await this.asr.load();
|
|
10630
|
+
this.emitProgress("Speech recognition", 45, 3, 1);
|
|
10631
|
+
this.emitProgress("Lip sync", 45, 3, 1);
|
|
10632
|
+
let lam = createA2E({
|
|
10633
|
+
gpuModelUrl: this.config.models.lam.gpuModelUrl,
|
|
10634
|
+
gpuExternalDataUrl: this.config.models.lam.gpuExternalDataUrl,
|
|
10635
|
+
cpuModelUrl: this.config.models.lam.cpuModelUrl,
|
|
10636
|
+
mode: this.config.models.lam.mode,
|
|
10637
|
+
unifiedWorker: this.unifiedWorker ?? void 0
|
|
10638
|
+
});
|
|
10639
|
+
let lamProgress = 45;
|
|
10640
|
+
const lamTickInterval = setInterval(() => {
|
|
10641
|
+
const remaining = 85 - lamProgress;
|
|
10642
|
+
lamProgress += Math.max(0.5, remaining * 0.08);
|
|
10643
|
+
this.emitProgress("Lip sync", Math.round(lamProgress), 3, 1);
|
|
10644
|
+
}, 300);
|
|
10645
|
+
try {
|
|
10646
|
+
const lamLoadResult = await Promise.race([
|
|
10647
|
+
lam.load().then(() => "ok"),
|
|
10648
|
+
new Promise((r) => setTimeout(() => r("timeout"), timeoutMs))
|
|
10649
|
+
]);
|
|
10650
|
+
if (lamLoadResult === "timeout") {
|
|
10651
|
+
logger19.warn(`LAM GPU load timed out after ${timeoutMs}ms, falling back to CPU`);
|
|
10652
|
+
await lam.dispose();
|
|
10653
|
+
lam = createA2E({
|
|
10654
|
+
gpuModelUrl: this.config.models.lam.gpuModelUrl,
|
|
10655
|
+
cpuModelUrl: this.config.models.lam.cpuModelUrl,
|
|
10656
|
+
mode: "cpu",
|
|
10657
|
+
unifiedWorker: this.unifiedWorker ?? void 0
|
|
10658
|
+
});
|
|
10659
|
+
await lam.load();
|
|
10660
|
+
}
|
|
10661
|
+
} finally {
|
|
10662
|
+
clearInterval(lamTickInterval);
|
|
10663
|
+
}
|
|
10664
|
+
this.lam = lam;
|
|
10665
|
+
this.emitProgress("Lip sync", 85, 3, 2);
|
|
10666
|
+
this.emitProgress("Voice detection", 85, 3, 2);
|
|
10667
|
+
this.vad = createSileroVAD({
|
|
10668
|
+
modelUrl: this.config.models.vad.modelUrl,
|
|
10669
|
+
threshold: this.config.models.vad.threshold,
|
|
10670
|
+
unifiedWorker: this.unifiedWorker ?? void 0
|
|
10671
|
+
});
|
|
10672
|
+
await this.vad.load();
|
|
10673
|
+
this.emitProgress("Voice detection", 100, 3, 3);
|
|
10674
|
+
this.playback = new PlaybackPipeline({
|
|
10675
|
+
lam: this.lam,
|
|
10676
|
+
profile: this.config.profile,
|
|
10677
|
+
identityIndex: this.config.identityIndex,
|
|
10678
|
+
neutralTransitionEnabled: this.config.neutralTransitionEnabled ?? true,
|
|
10679
|
+
neutralTransitionMs: this.config.neutralTransitionMs,
|
|
10680
|
+
audioDelayMs: this.config.audioDelayMs,
|
|
10681
|
+
chunkTargetMs: this.config.chunkTargetMs
|
|
10682
|
+
});
|
|
10683
|
+
await this.playback.initialize();
|
|
10684
|
+
this.playback.on("frame", (f) => {
|
|
10685
|
+
this._currentFrame = f.blendshapes;
|
|
10686
|
+
this.emit("frame", f);
|
|
10687
|
+
});
|
|
10688
|
+
this.playback.on("frame:raw", (f) => this.emit("frame:raw", f));
|
|
10689
|
+
this.playback.on("playback:start", (t) => this.emit("playback:start", t));
|
|
10690
|
+
this.playback.on("playback:complete", () => {
|
|
10691
|
+
if (this.stopped) return;
|
|
10692
|
+
this.emit("playback:complete", void 0);
|
|
10693
|
+
this.vad?.reset();
|
|
10694
|
+
this.epoch++;
|
|
10695
|
+
this.setState("listening");
|
|
10696
|
+
});
|
|
10697
|
+
this.playback.on("error", (e) => this.emit("error", e));
|
|
10698
|
+
this.interruption = new InterruptionHandler({
|
|
10699
|
+
enabled: this.config.interruptionEnabled ?? true,
|
|
10700
|
+
minSpeechDurationMs: this.config.interruptionMinSpeechMs ?? 200
|
|
10701
|
+
});
|
|
10702
|
+
this.interruption.on("interruption.triggered", () => {
|
|
10703
|
+
this.handleInterruption();
|
|
10704
|
+
});
|
|
10705
|
+
this.setState("ready");
|
|
10706
|
+
} catch (error) {
|
|
10707
|
+
const err = error instanceof Error ? error : new Error(String(error));
|
|
10708
|
+
logger19.error("Model loading failed", { message: err.message });
|
|
10709
|
+
this.emit("error", err);
|
|
10710
|
+
this.setState("error");
|
|
10711
|
+
throw err;
|
|
10712
|
+
}
|
|
10713
|
+
}
|
|
10714
|
+
// ---------------------------------------------------------------------------
|
|
10715
|
+
// Conversation lifecycle
|
|
10716
|
+
// ---------------------------------------------------------------------------
|
|
10717
|
+
async start() {
|
|
10718
|
+
if (this._state !== "ready") {
|
|
10719
|
+
throw new Error(`Cannot start: state is '${this._state}', expected 'ready'`);
|
|
10720
|
+
}
|
|
10721
|
+
this.stopped = false;
|
|
10722
|
+
this.epoch++;
|
|
10723
|
+
this._sessionId = crypto.randomUUID();
|
|
10724
|
+
this.asrErrorCount = 0;
|
|
10725
|
+
this.mic = new MicrophoneCapture(this.omoteEvents, {
|
|
10726
|
+
sampleRate: 16e3,
|
|
10727
|
+
chunkSize: 512
|
|
10728
|
+
});
|
|
10729
|
+
this.omoteEvents.on("audio.chunk", ({ pcm }) => {
|
|
10730
|
+
const float32 = int16ToFloat32(pcm);
|
|
10731
|
+
this.processAudioChunk(float32);
|
|
10732
|
+
});
|
|
10733
|
+
this.omoteEvents.on("audio.level", (level) => {
|
|
10734
|
+
this.emit("audio:level", level);
|
|
10735
|
+
});
|
|
10736
|
+
await this.mic.start();
|
|
10737
|
+
this.setState("listening");
|
|
10738
|
+
}
|
|
10739
|
+
stop() {
|
|
10740
|
+
this.stopped = true;
|
|
10741
|
+
this.epoch++;
|
|
10742
|
+
this.clearSilenceTimer();
|
|
10743
|
+
this.stopProgressiveTranscription();
|
|
10744
|
+
this.responseAbortController?.abort();
|
|
10745
|
+
this.responseAbortController = null;
|
|
10746
|
+
this.vad?.reset();
|
|
10747
|
+
this.playback?.stop();
|
|
10748
|
+
this.mic?.stop();
|
|
10749
|
+
this.mic = null;
|
|
10750
|
+
this.isSpeaking = false;
|
|
10751
|
+
this.audioBuffer = [];
|
|
10752
|
+
this.audioBufferSamples = 0;
|
|
10753
|
+
this._currentFrame = null;
|
|
10754
|
+
this.interruption?.setAISpeaking(false);
|
|
10755
|
+
if (this._state !== "idle") {
|
|
10756
|
+
this.setState("ready");
|
|
10757
|
+
}
|
|
10758
|
+
}
|
|
10759
|
+
setProfile(profile) {
|
|
10760
|
+
this.config.profile = profile;
|
|
10761
|
+
this.playback?.setProfile(profile);
|
|
10762
|
+
}
|
|
10763
|
+
async dispose() {
|
|
10764
|
+
this.stop();
|
|
10765
|
+
this.epoch++;
|
|
10766
|
+
await this.playback?.dispose();
|
|
10767
|
+
await this.asr?.dispose();
|
|
10768
|
+
await this.lam?.dispose();
|
|
10769
|
+
await this.vad?.dispose();
|
|
10770
|
+
this.playback = null;
|
|
10771
|
+
this.asr = null;
|
|
10772
|
+
this.lam = null;
|
|
10773
|
+
this.vad = null;
|
|
10774
|
+
this._state = "idle";
|
|
10775
|
+
}
|
|
10776
|
+
// ---------------------------------------------------------------------------
|
|
10777
|
+
// Audio processing
|
|
10778
|
+
// ---------------------------------------------------------------------------
|
|
10779
|
+
async processAudioChunk(samples) {
|
|
10780
|
+
if (!this.vad) return;
|
|
10781
|
+
try {
|
|
10782
|
+
const result = await this.vad.process(samples);
|
|
10783
|
+
if (this._state === "speaking" && this.interruption) {
|
|
10784
|
+
this.interruption.processVADResult(result.probability);
|
|
10785
|
+
return;
|
|
10786
|
+
}
|
|
10787
|
+
if (this._state !== "listening" && this._state !== "thinking") return;
|
|
10788
|
+
const wasSpeaking = this.isSpeaking;
|
|
10789
|
+
if (result.isSpeech) {
|
|
10790
|
+
if (!wasSpeaking) {
|
|
10791
|
+
this.isSpeaking = true;
|
|
10792
|
+
this.speechStartTime = performance.now();
|
|
10793
|
+
this.audioBuffer = [];
|
|
10794
|
+
this.audioBufferSamples = 0;
|
|
10795
|
+
this.lastProgressiveResult = null;
|
|
10796
|
+
this.lastProgressiveSamples = 0;
|
|
10797
|
+
this.emit("speech:start", void 0);
|
|
10798
|
+
this.startProgressiveTranscription();
|
|
10799
|
+
}
|
|
10800
|
+
this.audioBuffer.push(new Float32Array(samples));
|
|
10801
|
+
this.audioBufferSamples += samples.length;
|
|
10802
|
+
this.clearSilenceTimer();
|
|
10803
|
+
} else if (wasSpeaking) {
|
|
10804
|
+
this.audioBuffer.push(new Float32Array(samples));
|
|
10805
|
+
this.audioBufferSamples += samples.length;
|
|
10806
|
+
if (!this.silenceTimer) {
|
|
10807
|
+
const timeoutMs = this.getSilenceTimeout();
|
|
10808
|
+
this.silenceTimer = setTimeout(() => {
|
|
10809
|
+
this.onSilenceDetected();
|
|
10810
|
+
}, timeoutMs);
|
|
10811
|
+
}
|
|
10812
|
+
}
|
|
10813
|
+
} catch (err) {
|
|
10814
|
+
logger19.warn("VAD error", { error: String(err) });
|
|
10815
|
+
}
|
|
10816
|
+
}
|
|
10817
|
+
// ---------------------------------------------------------------------------
|
|
10818
|
+
// Silence detection
|
|
10819
|
+
// ---------------------------------------------------------------------------
|
|
10820
|
+
getSilenceTimeout() {
|
|
10821
|
+
const base = this.config.silenceTimeoutMs ?? 500;
|
|
10822
|
+
const extended = this.config.silenceTimeoutExtendedMs ?? 700;
|
|
10823
|
+
const adaptive = this.config.adaptiveTimeout ?? true;
|
|
10824
|
+
if (!adaptive) return base;
|
|
10825
|
+
const speechDurationMs = performance.now() - this.speechStartTime;
|
|
10826
|
+
return speechDurationMs > 3e3 ? extended : base;
|
|
10827
|
+
}
|
|
10828
|
+
onSilenceDetected() {
|
|
10829
|
+
const capturedEpoch = this.epoch;
|
|
10830
|
+
this.isSpeaking = false;
|
|
10831
|
+
const durationMs = performance.now() - this.speechStartTime;
|
|
10832
|
+
this.emit("speech:end", { durationMs });
|
|
10833
|
+
this.clearSilenceTimer();
|
|
10834
|
+
this.processEndOfSpeech(capturedEpoch).catch((err) => {
|
|
10835
|
+
logger19.error("End of speech processing failed", { error: String(err) });
|
|
10836
|
+
if (this.epoch === capturedEpoch && !this.stopped) {
|
|
10837
|
+
this.emit("error", err instanceof Error ? err : new Error(String(err)));
|
|
10838
|
+
this.setState("listening");
|
|
10839
|
+
}
|
|
10840
|
+
});
|
|
10841
|
+
}
|
|
10842
|
+
// ---------------------------------------------------------------------------
|
|
10843
|
+
// End of speech → transcription → response
|
|
10844
|
+
// ---------------------------------------------------------------------------
|
|
10845
|
+
async processEndOfSpeech(capturedEpoch) {
|
|
10846
|
+
if (this.progressivePromise) {
|
|
10847
|
+
try {
|
|
10848
|
+
await this.progressivePromise;
|
|
10849
|
+
} catch {
|
|
10850
|
+
}
|
|
10851
|
+
}
|
|
10852
|
+
this.stopProgressiveTranscription();
|
|
10853
|
+
if (this.epoch !== capturedEpoch || this.stopped) return;
|
|
10854
|
+
const totalSamples = this.audioBufferSamples;
|
|
10855
|
+
const fullAudio = new Float32Array(totalSamples);
|
|
10856
|
+
let offset = 0;
|
|
10857
|
+
for (const chunk of this.audioBuffer) {
|
|
10858
|
+
fullAudio.set(chunk, offset);
|
|
10859
|
+
offset += chunk.length;
|
|
10860
|
+
}
|
|
10861
|
+
this.audioBuffer = [];
|
|
10862
|
+
this.audioBufferSamples = 0;
|
|
10863
|
+
const minDuration = this.config.minAudioDurationSec ?? 0.3;
|
|
10864
|
+
const minEnergy = this.config.minAudioEnergy ?? 0.02;
|
|
10865
|
+
const durationSec = totalSamples / 16e3;
|
|
10866
|
+
if (durationSec < minDuration) {
|
|
10867
|
+
logger19.info("Audio too short, discarding", { durationSec });
|
|
10868
|
+
this.setState("listening");
|
|
10869
|
+
return;
|
|
10870
|
+
}
|
|
10871
|
+
let maxAbs = 0;
|
|
10872
|
+
for (let i = 0; i < fullAudio.length; i++) {
|
|
10873
|
+
const abs = Math.abs(fullAudio[i]);
|
|
10874
|
+
if (abs > maxAbs) maxAbs = abs;
|
|
10875
|
+
}
|
|
10876
|
+
let rms = 0;
|
|
10877
|
+
for (let i = 0; i < fullAudio.length; i++) {
|
|
10878
|
+
rms += fullAudio[i] * fullAudio[i];
|
|
10879
|
+
}
|
|
10880
|
+
rms = Math.sqrt(rms / fullAudio.length);
|
|
10881
|
+
if (rms < minEnergy) {
|
|
10882
|
+
logger19.info("Audio too quiet, discarding", { rms });
|
|
10883
|
+
this.setState("listening");
|
|
10884
|
+
return;
|
|
10885
|
+
}
|
|
10886
|
+
const normalizedAudio = this.normalizeAudio(fullAudio);
|
|
10887
|
+
this.setState("thinking");
|
|
10888
|
+
let transcript = null;
|
|
10889
|
+
const coverageThreshold = this.config.progressiveCoverageThreshold ?? 0.8;
|
|
10890
|
+
if (this.lastProgressiveResult && this.lastProgressiveResult.text.trim().length > 0 && this.lastProgressiveSamples >= totalSamples * coverageThreshold) {
|
|
10891
|
+
transcript = { ...this.lastProgressiveResult, isFinal: true };
|
|
10892
|
+
logger19.info("Using progressive result", {
|
|
10893
|
+
coverage: (this.lastProgressiveSamples / totalSamples).toFixed(2),
|
|
10894
|
+
text: transcript.text
|
|
10895
|
+
});
|
|
10896
|
+
} else {
|
|
10897
|
+
this.lastProgressiveResult = null;
|
|
10898
|
+
transcript = await this.transcribeWithTimeout(normalizedAudio);
|
|
10899
|
+
if (transcript) {
|
|
10900
|
+
transcript.isFinal = true;
|
|
10901
|
+
}
|
|
10902
|
+
}
|
|
10903
|
+
if (this.epoch !== capturedEpoch || this.stopped) return;
|
|
10904
|
+
if (!transcript || !transcript.text.trim()) {
|
|
10905
|
+
logger19.info("No transcript, resuming listening");
|
|
10906
|
+
this.setState("listening");
|
|
10907
|
+
return;
|
|
10908
|
+
}
|
|
10909
|
+
this.emit("transcript", transcript);
|
|
10910
|
+
await this.callResponseHandler(transcript, capturedEpoch);
|
|
10911
|
+
}
|
|
10912
|
+
// ---------------------------------------------------------------------------
|
|
10913
|
+
// Response handler
|
|
10914
|
+
// ---------------------------------------------------------------------------
|
|
10915
|
+
async callResponseHandler(transcript, capturedEpoch) {
|
|
10916
|
+
if (this.epoch !== capturedEpoch || this.stopped) return;
|
|
10917
|
+
this.setState("speaking");
|
|
10918
|
+
this.interruption?.setAISpeaking(true);
|
|
10919
|
+
const abortController = new AbortController();
|
|
10920
|
+
this.responseAbortController = abortController;
|
|
10921
|
+
try {
|
|
10922
|
+
this.playback.start();
|
|
10923
|
+
await this.config.onResponse({
|
|
10924
|
+
text: transcript.text,
|
|
10925
|
+
emotion: transcript.emotion,
|
|
10926
|
+
event: transcript.event,
|
|
10927
|
+
send: async (chunk) => {
|
|
10928
|
+
if (abortController.signal.aborted) return;
|
|
10929
|
+
await this.playback.onAudioChunk(chunk);
|
|
10930
|
+
},
|
|
10931
|
+
done: async () => {
|
|
10932
|
+
if (abortController.signal.aborted) return;
|
|
10933
|
+
await this.playback.end();
|
|
10934
|
+
},
|
|
10935
|
+
signal: abortController.signal,
|
|
10936
|
+
sessionId: this._sessionId
|
|
10937
|
+
});
|
|
10938
|
+
} catch (error) {
|
|
10939
|
+
if (abortController.signal.aborted) return;
|
|
10940
|
+
const err = error instanceof Error ? error : new Error(String(error));
|
|
10941
|
+
logger19.error("Response handler error", { message: err.message });
|
|
10942
|
+
this.emit("error", err);
|
|
10943
|
+
if (this.epoch === capturedEpoch && !this.stopped) {
|
|
10944
|
+
this.interruption?.setAISpeaking(false);
|
|
10945
|
+
this.setState("listening");
|
|
10946
|
+
}
|
|
10947
|
+
} finally {
|
|
10948
|
+
this.responseAbortController = null;
|
|
10949
|
+
}
|
|
10950
|
+
}
|
|
10951
|
+
// ---------------------------------------------------------------------------
|
|
10952
|
+
// Interruption handling
|
|
10953
|
+
// ---------------------------------------------------------------------------
|
|
10954
|
+
handleInterruption() {
|
|
10955
|
+
if (this._state !== "speaking") return;
|
|
10956
|
+
logger19.info("Interruption triggered");
|
|
10957
|
+
this.epoch++;
|
|
10958
|
+
this.responseAbortController?.abort();
|
|
10959
|
+
this.playback?.stop();
|
|
10960
|
+
this.interruption?.setAISpeaking(false);
|
|
10961
|
+
this.emit("interruption", void 0);
|
|
10962
|
+
if (!this.stopped) {
|
|
10963
|
+
this.setState("listening");
|
|
10964
|
+
}
|
|
10965
|
+
}
|
|
10966
|
+
// ---------------------------------------------------------------------------
|
|
10967
|
+
// Progressive transcription
|
|
10968
|
+
// ---------------------------------------------------------------------------
|
|
10969
|
+
startProgressiveTranscription() {
|
|
10970
|
+
this.stopProgressiveTranscription();
|
|
10971
|
+
const intervalMs = isIOS() ? this.config.progressiveIntervalIosMs ?? 800 : this.config.progressiveIntervalMs ?? 500;
|
|
10972
|
+
const minSamples = this.config.progressiveMinSamples ?? 8e3;
|
|
10973
|
+
this.progressiveTimer = setInterval(() => {
|
|
10974
|
+
if (this.audioBufferSamples < minSamples) return;
|
|
10975
|
+
if (!this.asr) return;
|
|
10976
|
+
const capturedEpoch = this.epoch;
|
|
10977
|
+
const snapshot = new Float32Array(this.audioBufferSamples);
|
|
10978
|
+
let offset = 0;
|
|
10979
|
+
for (const chunk of this.audioBuffer) {
|
|
10980
|
+
snapshot.set(chunk, offset);
|
|
10981
|
+
offset += chunk.length;
|
|
10982
|
+
}
|
|
10983
|
+
const snapshotSamples = this.audioBufferSamples;
|
|
10984
|
+
this.progressivePromise = (async () => {
|
|
10985
|
+
try {
|
|
10986
|
+
const result = await this.transcribeWithTimeout(snapshot);
|
|
10987
|
+
if (this.epoch !== capturedEpoch) return;
|
|
10988
|
+
if (result && result.text.trim()) {
|
|
10989
|
+
this.lastProgressiveResult = result;
|
|
10990
|
+
this.lastProgressiveSamples = snapshotSamples;
|
|
10991
|
+
this.emit("transcript", { ...result, isFinal: false });
|
|
10992
|
+
}
|
|
10993
|
+
} catch {
|
|
10994
|
+
}
|
|
10995
|
+
})();
|
|
10996
|
+
}, intervalMs);
|
|
10997
|
+
}
|
|
10998
|
+
stopProgressiveTranscription() {
|
|
10999
|
+
if (this.progressiveTimer) {
|
|
11000
|
+
clearInterval(this.progressiveTimer);
|
|
11001
|
+
this.progressiveTimer = null;
|
|
11002
|
+
}
|
|
11003
|
+
}
|
|
11004
|
+
// ---------------------------------------------------------------------------
|
|
11005
|
+
// Transcription with timeout + ASR error recovery
|
|
11006
|
+
// ---------------------------------------------------------------------------
|
|
11007
|
+
async transcribeWithTimeout(audio) {
|
|
11008
|
+
if (!this.asr) return null;
|
|
11009
|
+
const timeoutMs = this.config.transcriptionTimeoutMs ?? 1e4;
|
|
11010
|
+
const startTime = performance.now();
|
|
11011
|
+
try {
|
|
11012
|
+
const result = await Promise.race([
|
|
11013
|
+
this.asr.transcribe(audio),
|
|
11014
|
+
new Promise(
|
|
11015
|
+
(_, reject) => setTimeout(() => reject(new Error(`Transcription timed out after ${timeoutMs}ms`)), timeoutMs)
|
|
11016
|
+
)
|
|
11017
|
+
]);
|
|
11018
|
+
this.asrErrorCount = 0;
|
|
11019
|
+
return {
|
|
11020
|
+
text: result.text,
|
|
11021
|
+
emotion: result.emotion,
|
|
11022
|
+
language: result.language,
|
|
11023
|
+
isFinal: false,
|
|
11024
|
+
inferenceTimeMs: performance.now() - startTime
|
|
11025
|
+
};
|
|
11026
|
+
} catch (error) {
|
|
11027
|
+
this.asrErrorCount++;
|
|
11028
|
+
logger19.warn("Transcription failed", {
|
|
11029
|
+
attempt: this.asrErrorCount,
|
|
11030
|
+
error: String(error)
|
|
11031
|
+
});
|
|
11032
|
+
if (this.asrErrorCount >= 3) {
|
|
11033
|
+
logger19.warn("3 consecutive ASR errors, recreating session");
|
|
11034
|
+
try {
|
|
11035
|
+
await this.asr.dispose();
|
|
11036
|
+
this.asr = createSenseVoice({
|
|
11037
|
+
modelUrl: this.config.models.senseVoice.modelUrl,
|
|
11038
|
+
tokensUrl: this.config.models.senseVoice.tokensUrl,
|
|
11039
|
+
language: this.config.models.senseVoice.language,
|
|
11040
|
+
unifiedWorker: this.unifiedWorker ?? void 0
|
|
11041
|
+
});
|
|
11042
|
+
await this.asr.load();
|
|
11043
|
+
this.asrErrorCount = 0;
|
|
11044
|
+
} catch (recreateErr) {
|
|
11045
|
+
logger19.error("ASR session recreation failed", { error: String(recreateErr) });
|
|
11046
|
+
}
|
|
11047
|
+
}
|
|
11048
|
+
return null;
|
|
11049
|
+
}
|
|
11050
|
+
}
|
|
11051
|
+
// ---------------------------------------------------------------------------
|
|
11052
|
+
// Audio normalization
|
|
11053
|
+
// ---------------------------------------------------------------------------
|
|
11054
|
+
normalizeAudio(audio) {
|
|
11055
|
+
if (!(this.config.normalizeAudio ?? true)) return audio;
|
|
11056
|
+
let maxAbs = 0;
|
|
11057
|
+
for (let i = 0; i < audio.length; i++) {
|
|
11058
|
+
const abs = Math.abs(audio[i]);
|
|
11059
|
+
if (abs > maxAbs) maxAbs = abs;
|
|
11060
|
+
}
|
|
11061
|
+
if (maxAbs >= 0.1 || maxAbs === 0) return audio;
|
|
11062
|
+
const gain = 0.5 / maxAbs;
|
|
11063
|
+
const normalized = new Float32Array(audio.length);
|
|
11064
|
+
for (let i = 0; i < audio.length; i++) {
|
|
11065
|
+
normalized[i] = audio[i] * gain;
|
|
11066
|
+
}
|
|
11067
|
+
return normalized;
|
|
11068
|
+
}
|
|
11069
|
+
// ---------------------------------------------------------------------------
|
|
11070
|
+
// Helpers
|
|
11071
|
+
// ---------------------------------------------------------------------------
|
|
11072
|
+
setState(state) {
|
|
11073
|
+
if (this._state === state) return;
|
|
11074
|
+
logger19.info("State transition", { from: this._state, to: state });
|
|
11075
|
+
this._state = state;
|
|
11076
|
+
this.emit("state", state);
|
|
11077
|
+
}
|
|
11078
|
+
emitProgress(currentModel, progress, totalModels, modelsLoaded) {
|
|
11079
|
+
this.emit("loading:progress", { currentModel, progress, totalModels, modelsLoaded });
|
|
11080
|
+
}
|
|
11081
|
+
clearSilenceTimer() {
|
|
11082
|
+
if (this.silenceTimer) {
|
|
11083
|
+
clearTimeout(this.silenceTimer);
|
|
11084
|
+
this.silenceTimer = null;
|
|
11085
|
+
}
|
|
11086
|
+
}
|
|
11087
|
+
};
|
|
11088
|
+
|
|
10393
11089
|
// ../types/dist/index.mjs
// Protocol version constant bundled in from the types package; bumped when
// the event wire format changes (see isProtocolEvent below).
var PROTOCOL_VERSION = 1;
|
|
10395
11091
|
function isProtocolEvent(obj) {
|