@omote/core 0.5.7 → 0.6.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -13
- package/dist/index.d.mts +813 -86
- package/dist/index.d.ts +813 -86
- package/dist/index.js +1653 -563
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +1648 -558
- package/dist/index.mjs.map +1 -1
- package/package.json +3 -2
package/dist/index.js
CHANGED
|
@@ -32,7 +32,9 @@ var index_exports = {};
|
|
|
32
32
|
__export(index_exports, {
|
|
33
33
|
A2EOrchestrator: () => A2EOrchestrator,
|
|
34
34
|
A2EProcessor: () => A2EProcessor,
|
|
35
|
+
ALL_AUS: () => ALL_AUS,
|
|
35
36
|
ARKIT_BLENDSHAPES: () => ARKIT_BLENDSHAPES,
|
|
37
|
+
AU_TO_ARKIT: () => AU_TO_ARKIT,
|
|
36
38
|
AnimationGraph: () => AnimationGraph,
|
|
37
39
|
AudioChunkCoalescer: () => AudioChunkCoalescer,
|
|
38
40
|
AudioEnergyAnalyzer: () => AudioEnergyAnalyzer,
|
|
@@ -43,24 +45,31 @@ __export(index_exports, {
|
|
|
43
45
|
ConsoleExporter: () => ConsoleExporter,
|
|
44
46
|
DEFAULT_ANIMATION_CONFIG: () => DEFAULT_ANIMATION_CONFIG,
|
|
45
47
|
DEFAULT_LOGGING_CONFIG: () => DEFAULT_LOGGING_CONFIG,
|
|
48
|
+
DEFAULT_MODEL_URLS: () => DEFAULT_MODEL_URLS,
|
|
46
49
|
EMOTION_NAMES: () => EMOTION_NAMES,
|
|
50
|
+
EMOTION_TO_AU: () => EMOTION_TO_AU,
|
|
47
51
|
EMOTION_VECTOR_SIZE: () => EMOTION_VECTOR_SIZE,
|
|
48
52
|
EmotionController: () => EmotionController,
|
|
49
53
|
EmotionPresets: () => EmotionPresets,
|
|
54
|
+
EmotionResolver: () => EmotionResolver,
|
|
50
55
|
EmphasisDetector: () => EmphasisDetector,
|
|
51
56
|
EventEmitter: () => EventEmitter,
|
|
57
|
+
FaceCompositor: () => FaceCompositor,
|
|
52
58
|
FullFacePipeline: () => FullFacePipeline,
|
|
59
|
+
HF_CDN_URLS: () => HF_CDN_URLS,
|
|
53
60
|
INFERENCE_LATENCY_BUCKETS: () => INFERENCE_LATENCY_BUCKETS,
|
|
54
61
|
InterruptionHandler: () => InterruptionHandler,
|
|
55
62
|
LAM_BLENDSHAPES: () => LAM_BLENDSHAPES,
|
|
56
63
|
LOG_LEVEL_PRIORITY: () => LOG_LEVEL_PRIORITY,
|
|
57
64
|
MODEL_LOAD_TIME_BUCKETS: () => MODEL_LOAD_TIME_BUCKETS,
|
|
58
65
|
MetricNames: () => MetricNames,
|
|
66
|
+
MicLipSync: () => MicLipSync,
|
|
59
67
|
MicrophoneCapture: () => MicrophoneCapture,
|
|
60
68
|
ModelCache: () => ModelCache,
|
|
61
69
|
OTLPExporter: () => OTLPExporter,
|
|
62
70
|
OmoteTelemetry: () => OmoteTelemetry,
|
|
63
71
|
PROTOCOL_VERSION: () => PROTOCOL_VERSION,
|
|
72
|
+
PlaybackPipeline: () => PlaybackPipeline,
|
|
64
73
|
ProceduralLifeLayer: () => ProceduralLifeLayer,
|
|
65
74
|
RingBuffer: () => RingBuffer,
|
|
66
75
|
SafariSpeechRecognition: () => SafariSpeechRecognition,
|
|
@@ -71,15 +80,18 @@ __export(index_exports, {
|
|
|
71
80
|
SileroVADUnifiedAdapter: () => SileroVADUnifiedAdapter,
|
|
72
81
|
SileroVADWorker: () => SileroVADWorker,
|
|
73
82
|
UnifiedInferenceWorker: () => UnifiedInferenceWorker,
|
|
83
|
+
VoicePipeline: () => VoicePipeline,
|
|
74
84
|
Wav2ArkitCpuInference: () => Wav2ArkitCpuInference,
|
|
75
85
|
Wav2ArkitCpuUnifiedAdapter: () => Wav2ArkitCpuUnifiedAdapter,
|
|
76
86
|
Wav2ArkitCpuWorker: () => Wav2ArkitCpuWorker,
|
|
77
87
|
Wav2Vec2Inference: () => Wav2Vec2Inference,
|
|
88
|
+
applyProfile: () => applyProfile,
|
|
78
89
|
blendEmotions: () => blendEmotions,
|
|
79
90
|
calculatePeak: () => calculatePeak,
|
|
80
91
|
calculateRMS: () => calculateRMS,
|
|
81
92
|
configureCacheLimit: () => configureCacheLimit,
|
|
82
93
|
configureLogging: () => configureLogging,
|
|
94
|
+
configureModelUrls: () => configureModelUrls,
|
|
83
95
|
configureTelemetry: () => configureTelemetry,
|
|
84
96
|
createA2E: () => createA2E,
|
|
85
97
|
createEmotionVector: () => createEmotionVector,
|
|
@@ -110,6 +122,7 @@ __export(index_exports, {
|
|
|
110
122
|
noopLogger: () => noopLogger,
|
|
111
123
|
preloadModels: () => preloadModels,
|
|
112
124
|
resetLoggingConfig: () => resetLoggingConfig,
|
|
125
|
+
resetModelUrls: () => resetModelUrls,
|
|
113
126
|
resolveBackend: () => resolveBackend,
|
|
114
127
|
setLogLevel: () => setLogLevel,
|
|
115
128
|
setLoggingEnabled: () => setLoggingEnabled,
|
|
@@ -867,12 +880,12 @@ var Logger = class _Logger {
|
|
|
867
880
|
};
|
|
868
881
|
var loggerCache = /* @__PURE__ */ new Map();
|
|
869
882
|
function createLogger(module2) {
|
|
870
|
-
let
|
|
871
|
-
if (!
|
|
872
|
-
|
|
873
|
-
loggerCache.set(module2,
|
|
883
|
+
let logger20 = loggerCache.get(module2);
|
|
884
|
+
if (!logger20) {
|
|
885
|
+
logger20 = new Logger(module2);
|
|
886
|
+
loggerCache.set(module2, logger20);
|
|
874
887
|
}
|
|
875
|
-
return
|
|
888
|
+
return logger20;
|
|
876
889
|
}
|
|
877
890
|
var noopLogger = {
|
|
878
891
|
module: "noop",
|
|
@@ -1168,6 +1181,24 @@ var A2EProcessor = class {
|
|
|
1168
1181
|
}
|
|
1169
1182
|
};
|
|
1170
1183
|
|
|
1184
|
+
// src/audio/audioUtils.ts
|
|
1185
|
+
function pcm16ToFloat32(buffer) {
|
|
1186
|
+
const byteLen = buffer.byteLength & ~1;
|
|
1187
|
+
const int16 = byteLen === buffer.byteLength ? new Int16Array(buffer) : new Int16Array(buffer, 0, byteLen / 2);
|
|
1188
|
+
const float32 = new Float32Array(int16.length);
|
|
1189
|
+
for (let i = 0; i < int16.length; i++) {
|
|
1190
|
+
float32[i] = int16[i] / 32768;
|
|
1191
|
+
}
|
|
1192
|
+
return float32;
|
|
1193
|
+
}
|
|
1194
|
+
function int16ToFloat32(int16) {
|
|
1195
|
+
const float32 = new Float32Array(int16.length);
|
|
1196
|
+
for (let i = 0; i < int16.length; i++) {
|
|
1197
|
+
float32[i] = int16[i] / 32768;
|
|
1198
|
+
}
|
|
1199
|
+
return float32;
|
|
1200
|
+
}
|
|
1201
|
+
|
|
1171
1202
|
// src/telemetry/exporters/console.ts
|
|
1172
1203
|
var ConsoleExporter = class {
|
|
1173
1204
|
constructor(options = {}) {
|
|
@@ -2940,7 +2971,7 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
2940
2971
|
} else {
|
|
2941
2972
|
logger3.info("Fetching external model data", {
|
|
2942
2973
|
dataUrl,
|
|
2943
|
-
note: "This may be a large download
|
|
2974
|
+
note: "This may be a large download"
|
|
2944
2975
|
});
|
|
2945
2976
|
externalDataBuffer = await fetchWithCache(dataUrl);
|
|
2946
2977
|
}
|
|
@@ -2948,6 +2979,9 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
2948
2979
|
size: formatBytes(externalDataBuffer.byteLength)
|
|
2949
2980
|
});
|
|
2950
2981
|
} catch (err) {
|
|
2982
|
+
if (typeof this.config.externalDataUrl === "string") {
|
|
2983
|
+
throw new Error(`Failed to fetch external data: ${dataUrl} \u2014 ${err.message}`);
|
|
2984
|
+
}
|
|
2951
2985
|
logger3.debug("No external data file found (single-file model)", {
|
|
2952
2986
|
dataUrl,
|
|
2953
2987
|
error: err.message
|
|
@@ -3071,28 +3105,6 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
3071
3105
|
};
|
|
3072
3106
|
return this.queueInference(feeds);
|
|
3073
3107
|
}
|
|
3074
|
-
/**
|
|
3075
|
-
* Decode CTC logits to text using greedy decoding
|
|
3076
|
-
*/
|
|
3077
|
-
decodeCTC(logits) {
|
|
3078
|
-
const tokens = [];
|
|
3079
|
-
let prevToken = -1;
|
|
3080
|
-
for (const frame of logits) {
|
|
3081
|
-
let maxIdx = 0;
|
|
3082
|
-
let maxVal = frame[0];
|
|
3083
|
-
for (let i = 1; i < frame.length; i++) {
|
|
3084
|
-
if (frame[i] > maxVal) {
|
|
3085
|
-
maxVal = frame[i];
|
|
3086
|
-
maxIdx = i;
|
|
3087
|
-
}
|
|
3088
|
-
}
|
|
3089
|
-
if (maxIdx !== prevToken && maxIdx !== 0) {
|
|
3090
|
-
tokens.push(maxIdx);
|
|
3091
|
-
}
|
|
3092
|
-
prevToken = maxIdx;
|
|
3093
|
-
}
|
|
3094
|
-
return tokens.map((t) => CTC_VOCAB[t] === "|" ? " " : CTC_VOCAB[t]).join("");
|
|
3095
|
-
}
|
|
3096
3108
|
/**
|
|
3097
3109
|
* Queue inference to serialize ONNX session calls
|
|
3098
3110
|
*/
|
|
@@ -3120,37 +3132,25 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
3120
3132
|
})
|
|
3121
3133
|
]);
|
|
3122
3134
|
const inferenceTimeMs = performance.now() - startTime;
|
|
3123
|
-
const asrOutput = results["asr_logits"];
|
|
3124
3135
|
const blendshapeOutput = results["blendshapes"];
|
|
3125
|
-
if (!
|
|
3126
|
-
throw new Error("Missing
|
|
3136
|
+
if (!blendshapeOutput) {
|
|
3137
|
+
throw new Error("Missing blendshapes output from model");
|
|
3127
3138
|
}
|
|
3128
|
-
const asrData = asrOutput.data;
|
|
3129
3139
|
const blendshapeData = blendshapeOutput.data;
|
|
3130
|
-
const numASRFrames = asrOutput.dims[1];
|
|
3131
3140
|
const numA2EFrames = blendshapeOutput.dims[1];
|
|
3132
|
-
const asrVocabSize = asrOutput.dims[2];
|
|
3133
3141
|
const numBlendshapes = blendshapeOutput.dims[2];
|
|
3134
|
-
const asrLogits = [];
|
|
3135
3142
|
const blendshapes = [];
|
|
3136
|
-
for (let f = 0; f < numASRFrames; f++) {
|
|
3137
|
-
asrLogits.push(asrData.slice(f * asrVocabSize, (f + 1) * asrVocabSize));
|
|
3138
|
-
}
|
|
3139
3143
|
for (let f = 0; f < numA2EFrames; f++) {
|
|
3140
3144
|
const rawFrame = blendshapeData.slice(f * numBlendshapes, (f + 1) * numBlendshapes);
|
|
3141
3145
|
blendshapes.push(symmetrizeBlendshapes(rawFrame));
|
|
3142
3146
|
}
|
|
3143
|
-
const text = this.decodeCTC(asrLogits);
|
|
3144
3147
|
logger3.trace("Inference completed", {
|
|
3145
3148
|
inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
|
|
3146
|
-
numA2EFrames
|
|
3147
|
-
numASRFrames,
|
|
3148
|
-
textLength: text.length
|
|
3149
|
+
numA2EFrames
|
|
3149
3150
|
});
|
|
3150
3151
|
span?.setAttributes({
|
|
3151
3152
|
"inference.duration_ms": inferenceTimeMs,
|
|
3152
|
-
"inference.a2e_frames": numA2EFrames
|
|
3153
|
-
"inference.asr_frames": numASRFrames
|
|
3153
|
+
"inference.a2e_frames": numA2EFrames
|
|
3154
3154
|
});
|
|
3155
3155
|
span?.end();
|
|
3156
3156
|
telemetry?.recordHistogram("omote.inference.latency", inferenceTimeMs, {
|
|
@@ -3164,11 +3164,7 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
3164
3164
|
});
|
|
3165
3165
|
resolve({
|
|
3166
3166
|
blendshapes,
|
|
3167
|
-
asrLogits,
|
|
3168
|
-
text,
|
|
3169
3167
|
numFrames: numA2EFrames,
|
|
3170
|
-
numA2EFrames,
|
|
3171
|
-
numASRFrames,
|
|
3172
3168
|
inferenceTimeMs
|
|
3173
3169
|
});
|
|
3174
3170
|
} catch (err) {
|
|
@@ -3221,19 +3217,7 @@ _Wav2Vec2Inference.INFERENCE_TIMEOUT_MS = 5e3;
|
|
|
3221
3217
|
_Wav2Vec2Inference.isWebGPUAvailable = isWebGPUAvailable;
|
|
3222
3218
|
var Wav2Vec2Inference = _Wav2Vec2Inference;
|
|
3223
3219
|
|
|
3224
|
-
// src/audio/
|
|
3225
|
-
function pcm16ToFloat32(buffer) {
|
|
3226
|
-
const byteLen = buffer.byteLength & ~1;
|
|
3227
|
-
const int16 = byteLen === buffer.byteLength ? new Int16Array(buffer) : new Int16Array(buffer, 0, byteLen / 2);
|
|
3228
|
-
const float32 = new Float32Array(int16.length);
|
|
3229
|
-
for (let i = 0; i < int16.length; i++) {
|
|
3230
|
-
float32[i] = int16[i] / 32768;
|
|
3231
|
-
}
|
|
3232
|
-
return float32;
|
|
3233
|
-
}
|
|
3234
|
-
|
|
3235
|
-
// src/audio/FullFacePipeline.ts
|
|
3236
|
-
var logger4 = createLogger("FullFacePipeline");
|
|
3220
|
+
// src/audio/expressionProfile.ts
|
|
3237
3221
|
var BLENDSHAPE_TO_GROUP = /* @__PURE__ */ new Map();
|
|
3238
3222
|
for (const name of LAM_BLENDSHAPES) {
|
|
3239
3223
|
if (name.startsWith("eye")) {
|
|
@@ -3252,6 +3236,24 @@ for (const name of LAM_BLENDSHAPES) {
|
|
|
3252
3236
|
BLENDSHAPE_TO_GROUP.set(name, "tongue");
|
|
3253
3237
|
}
|
|
3254
3238
|
}
|
|
3239
|
+
function applyProfile(raw, profile) {
|
|
3240
|
+
const scaled = new Float32Array(52);
|
|
3241
|
+
for (let i = 0; i < 52; i++) {
|
|
3242
|
+
const name = LAM_BLENDSHAPES[i];
|
|
3243
|
+
let scaler;
|
|
3244
|
+
if (profile.overrides && profile.overrides[name] !== void 0) {
|
|
3245
|
+
scaler = profile.overrides[name];
|
|
3246
|
+
} else {
|
|
3247
|
+
const group = BLENDSHAPE_TO_GROUP.get(name);
|
|
3248
|
+
scaler = group ? profile[group] ?? 1 : 1;
|
|
3249
|
+
}
|
|
3250
|
+
scaled[i] = Math.min(1, Math.max(0, raw[i] * scaler));
|
|
3251
|
+
}
|
|
3252
|
+
return scaled;
|
|
3253
|
+
}
|
|
3254
|
+
|
|
3255
|
+
// src/audio/FullFacePipeline.ts
|
|
3256
|
+
var logger4 = createLogger("FullFacePipeline");
|
|
3255
3257
|
var FullFacePipeline = class extends EventEmitter {
|
|
3256
3258
|
constructor(options) {
|
|
3257
3259
|
super();
|
|
@@ -3316,25 +3318,10 @@ var FullFacePipeline = class extends EventEmitter {
|
|
|
3316
3318
|
/**
|
|
3317
3319
|
* Apply ExpressionProfile scaling to raw A2E blendshapes.
|
|
3318
3320
|
*
|
|
3319
|
-
*
|
|
3320
|
-
* 1. If an override exists for the blendshape name, use override as scaler
|
|
3321
|
-
* 2. Otherwise, use the group scaler (default 1.0)
|
|
3322
|
-
* 3. Clamp result to [0, 1]
|
|
3321
|
+
* Delegates to the standalone applyProfile() utility from expressionProfile.ts.
|
|
3323
3322
|
*/
|
|
3324
3323
|
applyProfile(raw) {
|
|
3325
|
-
|
|
3326
|
-
for (let i = 0; i < 52; i++) {
|
|
3327
|
-
const name = LAM_BLENDSHAPES[i];
|
|
3328
|
-
let scaler;
|
|
3329
|
-
if (this.profile.overrides && this.profile.overrides[name] !== void 0) {
|
|
3330
|
-
scaler = this.profile.overrides[name];
|
|
3331
|
-
} else {
|
|
3332
|
-
const group = BLENDSHAPE_TO_GROUP.get(name);
|
|
3333
|
-
scaler = group ? this.profile[group] ?? 1 : 1;
|
|
3334
|
-
}
|
|
3335
|
-
scaled[i] = Math.min(1, Math.max(0, raw[i] * scaler));
|
|
3336
|
-
}
|
|
3337
|
-
return scaled;
|
|
3324
|
+
return applyProfile(raw, this.profile);
|
|
3338
3325
|
}
|
|
3339
3326
|
/**
|
|
3340
3327
|
* Start a new playback session
|
|
@@ -3519,6 +3506,329 @@ var FullFacePipeline = class extends EventEmitter {
|
|
|
3519
3506
|
}
|
|
3520
3507
|
};
|
|
3521
3508
|
|
|
3509
|
+
// src/audio/PlaybackPipeline.ts
|
|
3510
|
+
var logger5 = createLogger("PlaybackPipeline");
|
|
3511
|
+
var PlaybackPipeline = class extends EventEmitter {
|
|
3512
|
+
constructor(config) {
|
|
3513
|
+
super();
|
|
3514
|
+
this.config = config;
|
|
3515
|
+
this._state = "idle";
|
|
3516
|
+
this.playbackStarted = false;
|
|
3517
|
+
this.monitorInterval = null;
|
|
3518
|
+
this.frameAnimationId = null;
|
|
3519
|
+
// Stale frame detection
|
|
3520
|
+
this.lastNewFrameTime = 0;
|
|
3521
|
+
this.lastKnownLamFrame = null;
|
|
3522
|
+
this.staleWarningEmitted = false;
|
|
3523
|
+
// Diagnostic counter
|
|
3524
|
+
this.frameLoopCount = 0;
|
|
3525
|
+
this.neutralTransitionFrame = null;
|
|
3526
|
+
this.neutralTransitionStart = 0;
|
|
3527
|
+
this.neutralAnimationId = null;
|
|
3528
|
+
// Current frame refs
|
|
3529
|
+
this._currentFrame = null;
|
|
3530
|
+
this._currentRawFrame = null;
|
|
3531
|
+
this.sampleRate = config.sampleRate ?? 16e3;
|
|
3532
|
+
this.profile = config.profile ?? {};
|
|
3533
|
+
this.staleThresholdMs = config.staleThresholdMs ?? 2e3;
|
|
3534
|
+
this.neutralTransitionEnabled = config.neutralTransitionEnabled ?? false;
|
|
3535
|
+
this.neutralTransitionMs = config.neutralTransitionMs ?? 250;
|
|
3536
|
+
const isCpuModel = config.lam.modelId === "wav2arkit_cpu";
|
|
3537
|
+
const chunkSize = config.chunkSize ?? config.lam.chunkSize ?? 16e3;
|
|
3538
|
+
const chunkAccumulationMs = chunkSize / this.sampleRate * 1e3;
|
|
3539
|
+
const inferenceEstimateMs = isCpuModel ? 300 : config.lam.backend === "wasm" ? 250 : 80;
|
|
3540
|
+
const marginMs = 100;
|
|
3541
|
+
const autoDelay = Math.ceil(chunkAccumulationMs + inferenceEstimateMs + marginMs);
|
|
3542
|
+
const audioDelayMs = config.audioDelayMs ?? autoDelay;
|
|
3543
|
+
logger5.info("PlaybackPipeline config", {
|
|
3544
|
+
chunkSize,
|
|
3545
|
+
audioDelayMs,
|
|
3546
|
+
autoDelay,
|
|
3547
|
+
backend: config.lam.backend,
|
|
3548
|
+
modelId: config.lam.modelId,
|
|
3549
|
+
neutralTransitionEnabled: this.neutralTransitionEnabled
|
|
3550
|
+
});
|
|
3551
|
+
this.scheduler = new AudioScheduler({
|
|
3552
|
+
sampleRate: this.sampleRate,
|
|
3553
|
+
initialLookaheadSec: audioDelayMs / 1e3
|
|
3554
|
+
});
|
|
3555
|
+
this.coalescer = new AudioChunkCoalescer({
|
|
3556
|
+
sampleRate: this.sampleRate,
|
|
3557
|
+
targetDurationMs: config.chunkTargetMs ?? 200
|
|
3558
|
+
});
|
|
3559
|
+
this.processor = new A2EProcessor({
|
|
3560
|
+
backend: config.lam,
|
|
3561
|
+
sampleRate: this.sampleRate,
|
|
3562
|
+
chunkSize,
|
|
3563
|
+
identityIndex: config.identityIndex,
|
|
3564
|
+
onError: (error) => {
|
|
3565
|
+
logger5.error("A2E inference error", { message: error.message, stack: error.stack });
|
|
3566
|
+
this.emit("error", error);
|
|
3567
|
+
}
|
|
3568
|
+
});
|
|
3569
|
+
}
|
|
3570
|
+
/** Current pipeline state */
|
|
3571
|
+
get state() {
|
|
3572
|
+
return this._state;
|
|
3573
|
+
}
|
|
3574
|
+
/** Current scaled blendshapes (updated in-place for perf) */
|
|
3575
|
+
get currentFrame() {
|
|
3576
|
+
return this._currentFrame;
|
|
3577
|
+
}
|
|
3578
|
+
/** Raw A2E blendshapes (before profile scaling) */
|
|
3579
|
+
get currentRawFrame() {
|
|
3580
|
+
return this._currentRawFrame;
|
|
3581
|
+
}
|
|
3582
|
+
// ---------------------------------------------------------------------------
|
|
3583
|
+
// Lifecycle
|
|
3584
|
+
// ---------------------------------------------------------------------------
|
|
3585
|
+
/** Initialize AudioContext (lazy, call after user gesture) */
|
|
3586
|
+
async initialize() {
|
|
3587
|
+
await this.scheduler.initialize();
|
|
3588
|
+
}
|
|
3589
|
+
/** Update ExpressionProfile at runtime */
|
|
3590
|
+
setProfile(profile) {
|
|
3591
|
+
this.profile = profile;
|
|
3592
|
+
}
|
|
3593
|
+
// ---------------------------------------------------------------------------
|
|
3594
|
+
// Async mode (streaming TTS)
|
|
3595
|
+
// ---------------------------------------------------------------------------
|
|
3596
|
+
/**
|
|
3597
|
+
* Start a new playback session.
|
|
3598
|
+
* Idempotent — calling during playback resets cleanly without emitting
|
|
3599
|
+
* spurious playback:complete.
|
|
3600
|
+
*/
|
|
3601
|
+
start() {
|
|
3602
|
+
this.stopInternal(false);
|
|
3603
|
+
this.scheduler.reset();
|
|
3604
|
+
this.coalescer.reset();
|
|
3605
|
+
this.processor.reset();
|
|
3606
|
+
this.playbackStarted = false;
|
|
3607
|
+
this.lastNewFrameTime = 0;
|
|
3608
|
+
this.lastKnownLamFrame = null;
|
|
3609
|
+
this.staleWarningEmitted = false;
|
|
3610
|
+
this.frameLoopCount = 0;
|
|
3611
|
+
this._currentFrame = null;
|
|
3612
|
+
this._currentRawFrame = null;
|
|
3613
|
+
this.cancelNeutralTransition();
|
|
3614
|
+
this.scheduler.warmup();
|
|
3615
|
+
this.startFrameLoop();
|
|
3616
|
+
this.startMonitoring();
|
|
3617
|
+
this.setState("playing");
|
|
3618
|
+
}
|
|
3619
|
+
/** Feed a streaming audio chunk (PCM16 Uint8Array) */
|
|
3620
|
+
async onAudioChunk(chunk) {
|
|
3621
|
+
const combined = this.coalescer.add(chunk);
|
|
3622
|
+
if (!combined) return;
|
|
3623
|
+
const float32 = pcm16ToFloat32(combined);
|
|
3624
|
+
const scheduleTime = await this.scheduler.schedule(float32);
|
|
3625
|
+
if (!this.playbackStarted) {
|
|
3626
|
+
this.playbackStarted = true;
|
|
3627
|
+
this.emit("playback:start", { time: scheduleTime });
|
|
3628
|
+
this.emit("playback_start", scheduleTime);
|
|
3629
|
+
}
|
|
3630
|
+
this.processor.pushAudio(float32, scheduleTime);
|
|
3631
|
+
}
|
|
3632
|
+
/** Signal end of audio stream (flushes remaining audio) */
|
|
3633
|
+
async end() {
|
|
3634
|
+
const remaining = this.coalescer.flush();
|
|
3635
|
+
if (remaining) {
|
|
3636
|
+
const chunk = new Uint8Array(remaining);
|
|
3637
|
+
await this.onAudioChunk(chunk);
|
|
3638
|
+
}
|
|
3639
|
+
await this.processor.flush();
|
|
3640
|
+
}
|
|
3641
|
+
// ---------------------------------------------------------------------------
|
|
3642
|
+
// Sync mode (full buffer)
|
|
3643
|
+
// ---------------------------------------------------------------------------
|
|
3644
|
+
/**
|
|
3645
|
+
* Feed a complete audio buffer. Chunks into 200ms pieces, schedules each
|
|
3646
|
+
* for playback, runs A2E inference, then waits for completion.
|
|
3647
|
+
*/
|
|
3648
|
+
async feedBuffer(audio) {
|
|
3649
|
+
const float32 = audio instanceof Float32Array ? audio : pcm16ToFloat32(audio);
|
|
3650
|
+
this.start();
|
|
3651
|
+
const chunkSamples = Math.floor(this.sampleRate * 0.2);
|
|
3652
|
+
for (let i = 0; i < float32.length; i += chunkSamples) {
|
|
3653
|
+
const chunk = float32.subarray(i, Math.min(i + chunkSamples, float32.length));
|
|
3654
|
+
const scheduleTime = await this.scheduler.schedule(chunk);
|
|
3655
|
+
this.processor.pushAudio(chunk, scheduleTime);
|
|
3656
|
+
if (!this.playbackStarted) {
|
|
3657
|
+
this.playbackStarted = true;
|
|
3658
|
+
this.emit("playback:start", { time: scheduleTime });
|
|
3659
|
+
this.emit("playback_start", scheduleTime);
|
|
3660
|
+
}
|
|
3661
|
+
}
|
|
3662
|
+
await this.processor.flush();
|
|
3663
|
+
return new Promise((resolve) => {
|
|
3664
|
+
const unsub = this.on("playback:complete", () => {
|
|
3665
|
+
unsub();
|
|
3666
|
+
resolve();
|
|
3667
|
+
});
|
|
3668
|
+
});
|
|
3669
|
+
}
|
|
3670
|
+
// ---------------------------------------------------------------------------
|
|
3671
|
+
// Control
|
|
3672
|
+
// ---------------------------------------------------------------------------
|
|
3673
|
+
/** Stop playback immediately with fade-out */
|
|
3674
|
+
async stop(fadeOutMs = 50) {
|
|
3675
|
+
this.setState("stopping");
|
|
3676
|
+
this.stopInternal(true);
|
|
3677
|
+
await this.scheduler.cancelAll(fadeOutMs);
|
|
3678
|
+
this.coalescer.reset();
|
|
3679
|
+
this.processor.reset();
|
|
3680
|
+
this.playbackStarted = false;
|
|
3681
|
+
this._currentFrame = null;
|
|
3682
|
+
this._currentRawFrame = null;
|
|
3683
|
+
this.emit("playback:stop", void 0);
|
|
3684
|
+
this.setState("idle");
|
|
3685
|
+
}
|
|
3686
|
+
/** Cleanup all resources */
|
|
3687
|
+
dispose() {
|
|
3688
|
+
this.stopInternal(true);
|
|
3689
|
+
this.cancelNeutralTransition();
|
|
3690
|
+
this.scheduler.dispose();
|
|
3691
|
+
this.coalescer.reset();
|
|
3692
|
+
this.processor.dispose();
|
|
3693
|
+
this._state = "idle";
|
|
3694
|
+
}
|
|
3695
|
+
/** Get pipeline debug state */
|
|
3696
|
+
getDebugState() {
|
|
3697
|
+
return {
|
|
3698
|
+
state: this._state,
|
|
3699
|
+
playbackStarted: this.playbackStarted,
|
|
3700
|
+
coalescerFill: this.coalescer.fillLevel,
|
|
3701
|
+
processorFill: this.processor.fillLevel,
|
|
3702
|
+
queuedFrames: this.processor.queuedFrameCount,
|
|
3703
|
+
currentTime: this.scheduler.getCurrentTime(),
|
|
3704
|
+
playbackEndTime: this.scheduler.getPlaybackEndTime()
|
|
3705
|
+
};
|
|
3706
|
+
}
|
|
3707
|
+
// ---------------------------------------------------------------------------
|
|
3708
|
+
// Internal: Frame loop
|
|
3709
|
+
// ---------------------------------------------------------------------------
|
|
3710
|
+
startFrameLoop() {
|
|
3711
|
+
const updateFrame = () => {
|
|
3712
|
+
this.frameLoopCount++;
|
|
3713
|
+
const currentTime = this.scheduler.getCurrentTime();
|
|
3714
|
+
const lamFrame = this.processor.getFrameForTime(currentTime);
|
|
3715
|
+
if (lamFrame && lamFrame !== this.lastKnownLamFrame) {
|
|
3716
|
+
this.lastNewFrameTime = performance.now();
|
|
3717
|
+
this.lastKnownLamFrame = lamFrame;
|
|
3718
|
+
this.staleWarningEmitted = false;
|
|
3719
|
+
}
|
|
3720
|
+
if (this.playbackStarted && this.lastNewFrameTime > 0 && performance.now() - this.lastNewFrameTime > this.staleThresholdMs) {
|
|
3721
|
+
if (!this.staleWarningEmitted) {
|
|
3722
|
+
this.staleWarningEmitted = true;
|
|
3723
|
+
logger5.warn("A2E stalled \u2014 no new inference frames", {
|
|
3724
|
+
staleDurationMs: Math.round(performance.now() - this.lastNewFrameTime),
|
|
3725
|
+
queuedFrames: this.processor.queuedFrameCount
|
|
3726
|
+
});
|
|
3727
|
+
}
|
|
3728
|
+
}
|
|
3729
|
+
if (lamFrame) {
|
|
3730
|
+
const scaled = applyProfile(lamFrame, this.profile);
|
|
3731
|
+
this._currentFrame = scaled;
|
|
3732
|
+
this._currentRawFrame = lamFrame;
|
|
3733
|
+
const fullFrame = {
|
|
3734
|
+
blendshapes: scaled,
|
|
3735
|
+
rawBlendshapes: lamFrame,
|
|
3736
|
+
timestamp: currentTime
|
|
3737
|
+
};
|
|
3738
|
+
this.emit("frame", fullFrame);
|
|
3739
|
+
this.emit("frame:raw", lamFrame);
|
|
3740
|
+
this.emit("full_frame_ready", fullFrame);
|
|
3741
|
+
this.emit("lam_frame_ready", lamFrame);
|
|
3742
|
+
}
|
|
3743
|
+
this.frameAnimationId = requestAnimationFrame(updateFrame);
|
|
3744
|
+
};
|
|
3745
|
+
this.frameAnimationId = requestAnimationFrame(updateFrame);
|
|
3746
|
+
}
|
|
3747
|
+
// ---------------------------------------------------------------------------
|
|
3748
|
+
// Internal: Playback monitoring
|
|
3749
|
+
// ---------------------------------------------------------------------------
|
|
3750
|
+
startMonitoring() {
|
|
3751
|
+
if (this.monitorInterval) {
|
|
3752
|
+
clearInterval(this.monitorInterval);
|
|
3753
|
+
}
|
|
3754
|
+
this.monitorInterval = setInterval(() => {
|
|
3755
|
+
if (this.scheduler.isComplete() && this.processor.queuedFrameCount === 0) {
|
|
3756
|
+
this.onPlaybackComplete();
|
|
3757
|
+
}
|
|
3758
|
+
}, 100);
|
|
3759
|
+
}
|
|
3760
|
+
onPlaybackComplete() {
|
|
3761
|
+
this.stopInternal(false);
|
|
3762
|
+
this.playbackStarted = false;
|
|
3763
|
+
this.emit("playback:complete", void 0);
|
|
3764
|
+
this.emit("playback_complete", void 0);
|
|
3765
|
+
if (this.neutralTransitionEnabled && this._currentFrame) {
|
|
3766
|
+
this.startNeutralTransition(this._currentFrame);
|
|
3767
|
+
} else {
|
|
3768
|
+
this.setState("idle");
|
|
3769
|
+
}
|
|
3770
|
+
}
|
|
3771
|
+
// ---------------------------------------------------------------------------
|
|
3772
|
+
// Internal: Neutral transition (opt-in)
|
|
3773
|
+
// ---------------------------------------------------------------------------
|
|
3774
|
+
startNeutralTransition(fromFrame) {
|
|
3775
|
+
this.neutralTransitionFrame = new Float32Array(fromFrame);
|
|
3776
|
+
this.neutralTransitionStart = performance.now();
|
|
3777
|
+
const animate = () => {
|
|
3778
|
+
const elapsed = performance.now() - this.neutralTransitionStart;
|
|
3779
|
+
const t = Math.min(1, elapsed / this.neutralTransitionMs);
|
|
3780
|
+
const eased = 1 - Math.pow(1 - t, 3);
|
|
3781
|
+
const blendshapes = new Float32Array(52);
|
|
3782
|
+
for (let i = 0; i < 52; i++) {
|
|
3783
|
+
blendshapes[i] = this.neutralTransitionFrame[i] * (1 - eased);
|
|
3784
|
+
}
|
|
3785
|
+
this._currentFrame = blendshapes;
|
|
3786
|
+
const frame = {
|
|
3787
|
+
blendshapes,
|
|
3788
|
+
rawBlendshapes: blendshapes,
|
|
3789
|
+
// raw = scaled during transition
|
|
3790
|
+
timestamp: performance.now() / 1e3
|
|
3791
|
+
};
|
|
3792
|
+
this.emit("frame", frame);
|
|
3793
|
+
this.emit("full_frame_ready", frame);
|
|
3794
|
+
if (t >= 1) {
|
|
3795
|
+
this.neutralTransitionFrame = null;
|
|
3796
|
+
this._currentFrame = null;
|
|
3797
|
+
this._currentRawFrame = null;
|
|
3798
|
+
this.setState("idle");
|
|
3799
|
+
return;
|
|
3800
|
+
}
|
|
3801
|
+
this.neutralAnimationId = requestAnimationFrame(animate);
|
|
3802
|
+
};
|
|
3803
|
+
this.neutralAnimationId = requestAnimationFrame(animate);
|
|
3804
|
+
}
|
|
3805
|
+
cancelNeutralTransition() {
|
|
3806
|
+
if (this.neutralAnimationId) {
|
|
3807
|
+
cancelAnimationFrame(this.neutralAnimationId);
|
|
3808
|
+
this.neutralAnimationId = null;
|
|
3809
|
+
}
|
|
3810
|
+
this.neutralTransitionFrame = null;
|
|
3811
|
+
}
|
|
3812
|
+
// ---------------------------------------------------------------------------
|
|
3813
|
+
// Internal: Helpers
|
|
3814
|
+
// ---------------------------------------------------------------------------
|
|
3815
|
+
stopInternal(emitEvents) {
|
|
3816
|
+
if (this.monitorInterval) {
|
|
3817
|
+
clearInterval(this.monitorInterval);
|
|
3818
|
+
this.monitorInterval = null;
|
|
3819
|
+
}
|
|
3820
|
+
if (this.frameAnimationId) {
|
|
3821
|
+
cancelAnimationFrame(this.frameAnimationId);
|
|
3822
|
+
this.frameAnimationId = null;
|
|
3823
|
+
}
|
|
3824
|
+
}
|
|
3825
|
+
setState(state) {
|
|
3826
|
+
if (this._state === state) return;
|
|
3827
|
+
this._state = state;
|
|
3828
|
+
this.emit("state", state);
|
|
3829
|
+
}
|
|
3830
|
+
};
|
|
3831
|
+
|
|
3522
3832
|
// src/audio/InterruptionHandler.ts
|
|
3523
3833
|
var InterruptionHandler = class extends EventEmitter {
|
|
3524
3834
|
constructor(config = {}) {
|
|
@@ -3906,7 +4216,7 @@ function ctcGreedyDecode(logits, seqLen, vocabSize, tokenMap) {
|
|
|
3906
4216
|
}
|
|
3907
4217
|
|
|
3908
4218
|
// src/inference/SenseVoiceInference.ts
|
|
3909
|
-
var
|
|
4219
|
+
var logger6 = createLogger("SenseVoice");
|
|
3910
4220
|
var _SenseVoiceInference = class _SenseVoiceInference {
|
|
3911
4221
|
constructor(config) {
|
|
3912
4222
|
this.session = null;
|
|
@@ -3959,26 +4269,26 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
3959
4269
|
"model.backend_requested": this.config.backend
|
|
3960
4270
|
});
|
|
3961
4271
|
try {
|
|
3962
|
-
|
|
4272
|
+
logger6.info("Loading ONNX Runtime...", { preference: this.config.backend });
|
|
3963
4273
|
const { ort, backend } = await getOnnxRuntimeForPreference(this.config.backend);
|
|
3964
4274
|
this.ort = ort;
|
|
3965
4275
|
this._backend = backend;
|
|
3966
|
-
|
|
3967
|
-
|
|
4276
|
+
logger6.info("ONNX Runtime loaded", { backend: this._backend });
|
|
4277
|
+
logger6.debug("Fetching tokens vocabulary", { tokensUrl: this.config.tokensUrl });
|
|
3968
4278
|
const tokensResponse = await fetch(this.config.tokensUrl);
|
|
3969
4279
|
if (!tokensResponse.ok) {
|
|
3970
4280
|
throw new Error(`Failed to fetch tokens.txt: ${tokensResponse.status} ${tokensResponse.statusText}`);
|
|
3971
4281
|
}
|
|
3972
4282
|
const tokensText = await tokensResponse.text();
|
|
3973
4283
|
this.tokenMap = parseTokensFile(tokensText);
|
|
3974
|
-
|
|
4284
|
+
logger6.debug("Tokens loaded", { vocabSize: this.tokenMap.size });
|
|
3975
4285
|
const sessionOptions = getSessionOptions(this._backend);
|
|
3976
4286
|
if (this._backend === "webgpu") {
|
|
3977
4287
|
sessionOptions.graphOptimizationLevel = "basic";
|
|
3978
4288
|
}
|
|
3979
4289
|
let isCached = false;
|
|
3980
4290
|
if (isIOS()) {
|
|
3981
|
-
|
|
4291
|
+
logger6.info("iOS: passing model URL directly to ORT (low-memory path)", {
|
|
3982
4292
|
modelUrl: this.config.modelUrl
|
|
3983
4293
|
});
|
|
3984
4294
|
this.session = await withTimeout(
|
|
@@ -3991,14 +4301,14 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
3991
4301
|
isCached = await cache.has(this.config.modelUrl);
|
|
3992
4302
|
let modelBuffer;
|
|
3993
4303
|
if (isCached) {
|
|
3994
|
-
|
|
4304
|
+
logger6.debug("Loading model from cache", { modelUrl: this.config.modelUrl });
|
|
3995
4305
|
modelBuffer = await cache.get(this.config.modelUrl);
|
|
3996
4306
|
onProgress?.(modelBuffer.byteLength, modelBuffer.byteLength);
|
|
3997
4307
|
} else {
|
|
3998
|
-
|
|
4308
|
+
logger6.debug("Fetching and caching model", { modelUrl: this.config.modelUrl });
|
|
3999
4309
|
modelBuffer = await fetchWithCache(this.config.modelUrl, onProgress);
|
|
4000
4310
|
}
|
|
4001
|
-
|
|
4311
|
+
logger6.debug("Creating ONNX session", {
|
|
4002
4312
|
size: formatBytes(modelBuffer.byteLength),
|
|
4003
4313
|
backend: this._backend
|
|
4004
4314
|
});
|
|
@@ -4011,15 +4321,15 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
4011
4321
|
const cmvn = parseCMVNFromMetadata(metadata.neg_mean, metadata.inv_stddev);
|
|
4012
4322
|
this.negMean = cmvn.negMean;
|
|
4013
4323
|
this.invStddev = cmvn.invStddev;
|
|
4014
|
-
|
|
4324
|
+
logger6.debug("CMVN loaded from model metadata", { dim: this.negMean.length });
|
|
4015
4325
|
} else {
|
|
4016
|
-
|
|
4326
|
+
logger6.warn("CMVN not found in model metadata \u2014 features will not be normalized");
|
|
4017
4327
|
}
|
|
4018
4328
|
} catch (cmvnErr) {
|
|
4019
|
-
|
|
4329
|
+
logger6.warn("Failed to read CMVN from model metadata", { error: cmvnErr });
|
|
4020
4330
|
}
|
|
4021
4331
|
const loadTimeMs = performance.now() - startTime;
|
|
4022
|
-
|
|
4332
|
+
logger6.info("SenseVoice model loaded", {
|
|
4023
4333
|
backend: this._backend,
|
|
4024
4334
|
loadTimeMs: Math.round(loadTimeMs),
|
|
4025
4335
|
vocabSize: this.tokenMap.size,
|
|
@@ -4130,7 +4440,7 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
4130
4440
|
const vocabSize = logitsDims[2];
|
|
4131
4441
|
const decoded = ctcGreedyDecode(logitsData, seqLen, vocabSize, this.tokenMap);
|
|
4132
4442
|
const inferenceTimeMs = performance.now() - startTime;
|
|
4133
|
-
|
|
4443
|
+
logger6.trace("Transcription complete", {
|
|
4134
4444
|
text: decoded.text.substring(0, 50),
|
|
4135
4445
|
language: decoded.language,
|
|
4136
4446
|
emotion: decoded.emotion,
|
|
@@ -4168,7 +4478,7 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
4168
4478
|
const errMsg = err instanceof Error ? err.message : String(err);
|
|
4169
4479
|
if (errMsg.includes("timed out")) {
|
|
4170
4480
|
this.poisoned = true;
|
|
4171
|
-
|
|
4481
|
+
logger6.error("CRITICAL: Inference session timed out \u2014 SenseVoice is dead. Page reload required.", {
|
|
4172
4482
|
backend: this._backend,
|
|
4173
4483
|
timeoutMs: _SenseVoiceInference.INFERENCE_TIMEOUT_MS
|
|
4174
4484
|
});
|
|
@@ -4176,7 +4486,7 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
4176
4486
|
const oomError = new Error(
|
|
4177
4487
|
`SenseVoice inference failed with raw C++ exception pointer (0x${err.toString(16)}). This is likely an OOM crash in WASM. Try reloading the page.`
|
|
4178
4488
|
);
|
|
4179
|
-
|
|
4489
|
+
logger6.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
|
|
4180
4490
|
pointer: `0x${err.toString(16)}`,
|
|
4181
4491
|
backend: this._backend
|
|
4182
4492
|
});
|
|
@@ -4189,7 +4499,7 @@ var _SenseVoiceInference = class _SenseVoiceInference {
|
|
|
4189
4499
|
reject(oomError);
|
|
4190
4500
|
return;
|
|
4191
4501
|
} else {
|
|
4192
|
-
|
|
4502
|
+
logger6.error("Inference failed", { error: errMsg, backend: this._backend });
|
|
4193
4503
|
}
|
|
4194
4504
|
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
4195
4505
|
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
@@ -4218,7 +4528,7 @@ _SenseVoiceInference.INFERENCE_TIMEOUT_MS = 1e4;
|
|
|
4218
4528
|
var SenseVoiceInference = _SenseVoiceInference;
|
|
4219
4529
|
|
|
4220
4530
|
// src/inference/SenseVoiceWorker.ts
|
|
4221
|
-
var
|
|
4531
|
+
var logger7 = createLogger("SenseVoiceWorker");
|
|
4222
4532
|
var WASM_CDN_PATH2 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
|
|
4223
4533
|
var LOAD_TIMEOUT_MS = 3e5;
|
|
4224
4534
|
var INFERENCE_TIMEOUT_MS = 1e4;
|
|
@@ -4957,7 +5267,7 @@ var SenseVoiceWorker = class {
|
|
|
4957
5267
|
this.handleWorkerMessage(event.data);
|
|
4958
5268
|
};
|
|
4959
5269
|
worker.onerror = (error) => {
|
|
4960
|
-
|
|
5270
|
+
logger7.error("Worker error", { error: error.message });
|
|
4961
5271
|
for (const [, resolver] of this.pendingResolvers) {
|
|
4962
5272
|
resolver.reject(new Error(`Worker error: ${error.message}`));
|
|
4963
5273
|
}
|
|
@@ -5037,9 +5347,9 @@ var SenseVoiceWorker = class {
|
|
|
5037
5347
|
"model.language": this.config.language
|
|
5038
5348
|
});
|
|
5039
5349
|
try {
|
|
5040
|
-
|
|
5350
|
+
logger7.info("Creating SenseVoice worker...");
|
|
5041
5351
|
this.worker = this.createWorker();
|
|
5042
|
-
|
|
5352
|
+
logger7.info("Loading model in worker...", {
|
|
5043
5353
|
modelUrl: this.config.modelUrl,
|
|
5044
5354
|
tokensUrl: this.config.tokensUrl,
|
|
5045
5355
|
language: this.config.language,
|
|
@@ -5061,7 +5371,7 @@ var SenseVoiceWorker = class {
|
|
|
5061
5371
|
this._isLoaded = true;
|
|
5062
5372
|
const loadTimeMs = performance.now() - startTime;
|
|
5063
5373
|
onProgress?.(1, 1);
|
|
5064
|
-
|
|
5374
|
+
logger7.info("SenseVoice worker loaded successfully", {
|
|
5065
5375
|
backend: "wasm",
|
|
5066
5376
|
loadTimeMs: Math.round(loadTimeMs),
|
|
5067
5377
|
workerLoadTimeMs: Math.round(result.loadTimeMs),
|
|
@@ -5140,7 +5450,7 @@ var SenseVoiceWorker = class {
|
|
|
5140
5450
|
INFERENCE_TIMEOUT_MS
|
|
5141
5451
|
);
|
|
5142
5452
|
const totalTimeMs = performance.now() - startTime;
|
|
5143
|
-
|
|
5453
|
+
logger7.trace("Worker transcription complete", {
|
|
5144
5454
|
text: result.text.substring(0, 50),
|
|
5145
5455
|
language: result.language,
|
|
5146
5456
|
emotion: result.emotion,
|
|
@@ -5176,11 +5486,11 @@ var SenseVoiceWorker = class {
|
|
|
5176
5486
|
} catch (err) {
|
|
5177
5487
|
const errMsg = err instanceof Error ? err.message : String(err);
|
|
5178
5488
|
if (errMsg.includes("timed out")) {
|
|
5179
|
-
|
|
5489
|
+
logger7.error("CRITICAL: Worker inference timed out \u2014 SenseVoice worker is dead. Page reload required.", {
|
|
5180
5490
|
timeoutMs: INFERENCE_TIMEOUT_MS
|
|
5181
5491
|
});
|
|
5182
5492
|
} else {
|
|
5183
|
-
|
|
5493
|
+
logger7.error("Worker inference failed", { error: errMsg });
|
|
5184
5494
|
}
|
|
5185
5495
|
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
5186
5496
|
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
@@ -5217,8 +5527,53 @@ var SenseVoiceWorker = class {
|
|
|
5217
5527
|
}
|
|
5218
5528
|
};
|
|
5219
5529
|
|
|
5530
|
+
// src/inference/defaultModelUrls.ts
|
|
5531
|
+
var HF = "https://huggingface.co";
|
|
5532
|
+
var HF_MODEL_URLS = {
|
|
5533
|
+
/** LAM A2E model — fp16 external data (385KB graph + 192MB weights, WebGPU) — 52 ARKit blendshapes */
|
|
5534
|
+
lam: `${HF}/omote-ai/lam-a2e/resolve/main/model_fp16.onnx`,
|
|
5535
|
+
/** wav2arkit_cpu A2E model graph (1.86MB, WASM) — Safari/iOS fallback */
|
|
5536
|
+
wav2arkitCpu: `${HF}/myned-ai/wav2arkit_cpu/resolve/main/wav2arkit_cpu.onnx`,
|
|
5537
|
+
/** SenseVoice ASR model (228MB int8, WASM) — speech recognition + emotion + language */
|
|
5538
|
+
senseVoice: `${HF}/omote-ai/sensevoice-asr/resolve/main/model.int8.onnx`,
|
|
5539
|
+
/** Silero VAD model (~2MB, WASM) — voice activity detection */
|
|
5540
|
+
sileroVad: `${HF}/deepghs/silero-vad-onnx/resolve/main/silero_vad.onnx`
|
|
5541
|
+
};
|
|
5542
|
+
var _overrides = {};
|
|
5543
|
+
var DEFAULT_MODEL_URLS = new Proxy(
|
|
5544
|
+
{},
|
|
5545
|
+
{
|
|
5546
|
+
get(_target, prop) {
|
|
5547
|
+
const key = prop;
|
|
5548
|
+
return _overrides[key] ?? HF_MODEL_URLS[key];
|
|
5549
|
+
},
|
|
5550
|
+
ownKeys() {
|
|
5551
|
+
return Object.keys(HF_MODEL_URLS);
|
|
5552
|
+
},
|
|
5553
|
+
getOwnPropertyDescriptor(_target, prop) {
|
|
5554
|
+
if (prop in HF_MODEL_URLS) {
|
|
5555
|
+
return { configurable: true, enumerable: true, value: this.get(_target, prop, _target) };
|
|
5556
|
+
}
|
|
5557
|
+
return void 0;
|
|
5558
|
+
}
|
|
5559
|
+
}
|
|
5560
|
+
);
|
|
5561
|
+
function configureModelUrls(urls) {
|
|
5562
|
+
for (const [key, url] of Object.entries(urls)) {
|
|
5563
|
+
if (key in HF_MODEL_URLS && typeof url === "string") {
|
|
5564
|
+
_overrides[key] = url;
|
|
5565
|
+
}
|
|
5566
|
+
}
|
|
5567
|
+
}
|
|
5568
|
+
function resetModelUrls() {
|
|
5569
|
+
for (const key of Object.keys(_overrides)) {
|
|
5570
|
+
delete _overrides[key];
|
|
5571
|
+
}
|
|
5572
|
+
}
|
|
5573
|
+
var HF_CDN_URLS = HF_MODEL_URLS;
|
|
5574
|
+
|
|
5220
5575
|
// src/inference/UnifiedInferenceWorker.ts
|
|
5221
|
-
var
|
|
5576
|
+
var logger8 = createLogger("UnifiedInferenceWorker");
|
|
5222
5577
|
var WASM_CDN_PATH3 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
|
|
5223
5578
|
var INIT_TIMEOUT_MS = 6e4;
|
|
5224
5579
|
var SV_LOAD_TIMEOUT_MS = 3e5;
|
|
@@ -5920,7 +6275,7 @@ var UnifiedInferenceWorker = class {
|
|
|
5920
6275
|
const telemetry = getTelemetry();
|
|
5921
6276
|
const span = telemetry?.startSpan("UnifiedInferenceWorker.init");
|
|
5922
6277
|
try {
|
|
5923
|
-
|
|
6278
|
+
logger8.info("Creating unified inference worker...");
|
|
5924
6279
|
this.worker = this.createWorker();
|
|
5925
6280
|
await this.sendMessage(
|
|
5926
6281
|
{ type: "init", wasmPaths: WASM_CDN_PATH3, isIOS: isIOS() },
|
|
@@ -5929,7 +6284,7 @@ var UnifiedInferenceWorker = class {
|
|
|
5929
6284
|
);
|
|
5930
6285
|
this.initialized = true;
|
|
5931
6286
|
const loadTimeMs = performance.now() - startTime;
|
|
5932
|
-
|
|
6287
|
+
logger8.info("Unified worker initialized", { loadTimeMs: Math.round(loadTimeMs) });
|
|
5933
6288
|
span?.setAttributes({ "worker.init_time_ms": loadTimeMs });
|
|
5934
6289
|
span?.end();
|
|
5935
6290
|
} catch (error) {
|
|
@@ -6103,7 +6458,7 @@ var UnifiedInferenceWorker = class {
|
|
|
6103
6458
|
this.handleWorkerMessage(event.data);
|
|
6104
6459
|
};
|
|
6105
6460
|
worker.onerror = (error) => {
|
|
6106
|
-
|
|
6461
|
+
logger8.error("Unified worker error", { error: error.message });
|
|
6107
6462
|
this.rejectAllPending(`Worker error: ${error.message}`);
|
|
6108
6463
|
};
|
|
6109
6464
|
return worker;
|
|
@@ -6117,7 +6472,7 @@ var UnifiedInferenceWorker = class {
|
|
|
6117
6472
|
this.pendingRequests.delete(requestId);
|
|
6118
6473
|
pending.reject(new Error(data.error));
|
|
6119
6474
|
} else {
|
|
6120
|
-
|
|
6475
|
+
logger8.error("Worker broadcast error", { error: data.error });
|
|
6121
6476
|
this.rejectAllPending(data.error);
|
|
6122
6477
|
}
|
|
6123
6478
|
return;
|
|
@@ -6139,7 +6494,7 @@ var UnifiedInferenceWorker = class {
|
|
|
6139
6494
|
const timeout = setTimeout(() => {
|
|
6140
6495
|
this.pendingRequests.delete(requestId);
|
|
6141
6496
|
this.poisoned = true;
|
|
6142
|
-
|
|
6497
|
+
logger8.error("CRITICAL: Worker operation timed out \u2014 worker is dead", {
|
|
6143
6498
|
type: message.type,
|
|
6144
6499
|
timeoutMs
|
|
6145
6500
|
});
|
|
@@ -6205,7 +6560,7 @@ var SenseVoiceUnifiedAdapter = class {
|
|
|
6205
6560
|
});
|
|
6206
6561
|
this._isLoaded = true;
|
|
6207
6562
|
onProgress?.(1, 1);
|
|
6208
|
-
|
|
6563
|
+
logger8.info("SenseVoice loaded via unified worker", {
|
|
6209
6564
|
backend: "wasm",
|
|
6210
6565
|
loadTimeMs: Math.round(result.loadTimeMs),
|
|
6211
6566
|
vocabSize: result.vocabSize
|
|
@@ -6270,7 +6625,7 @@ var Wav2ArkitCpuUnifiedAdapter = class {
|
|
|
6270
6625
|
externalDataUrl: externalDataUrl || null
|
|
6271
6626
|
});
|
|
6272
6627
|
this._isLoaded = true;
|
|
6273
|
-
|
|
6628
|
+
logger8.info("Wav2ArkitCpu loaded via unified worker", {
|
|
6274
6629
|
backend: "wasm",
|
|
6275
6630
|
loadTimeMs: Math.round(result.loadTimeMs)
|
|
6276
6631
|
});
|
|
@@ -6376,7 +6731,7 @@ var SileroVADUnifiedAdapter = class {
|
|
|
6376
6731
|
sampleRate: this.config.sampleRate
|
|
6377
6732
|
});
|
|
6378
6733
|
this._isLoaded = true;
|
|
6379
|
-
|
|
6734
|
+
logger8.info("SileroVAD loaded via unified worker", {
|
|
6380
6735
|
backend: "wasm",
|
|
6381
6736
|
loadTimeMs: Math.round(result.loadTimeMs),
|
|
6382
6737
|
sampleRate: this.config.sampleRate,
|
|
@@ -6457,12 +6812,13 @@ var SileroVADUnifiedAdapter = class {
|
|
|
6457
6812
|
};
|
|
6458
6813
|
|
|
6459
6814
|
// src/inference/createSenseVoice.ts
|
|
6460
|
-
var
|
|
6461
|
-
function createSenseVoice(config) {
|
|
6815
|
+
var logger9 = createLogger("createSenseVoice");
|
|
6816
|
+
function createSenseVoice(config = {}) {
|
|
6817
|
+
const modelUrl = config.modelUrl ?? DEFAULT_MODEL_URLS.senseVoice;
|
|
6462
6818
|
if (config.unifiedWorker) {
|
|
6463
|
-
|
|
6819
|
+
logger9.info("Creating SenseVoiceUnifiedAdapter (shared unified worker)");
|
|
6464
6820
|
return new SenseVoiceUnifiedAdapter(config.unifiedWorker, {
|
|
6465
|
-
modelUrl
|
|
6821
|
+
modelUrl,
|
|
6466
6822
|
tokensUrl: config.tokensUrl,
|
|
6467
6823
|
language: config.language,
|
|
6468
6824
|
textNorm: config.textNorm
|
|
@@ -6473,37 +6829,37 @@ function createSenseVoice(config) {
|
|
|
6473
6829
|
if (!SenseVoiceWorker.isSupported()) {
|
|
6474
6830
|
throw new Error("Web Workers are not supported in this environment");
|
|
6475
6831
|
}
|
|
6476
|
-
|
|
6832
|
+
logger9.info("Creating SenseVoiceWorker (off-main-thread)");
|
|
6477
6833
|
return new SenseVoiceWorker({
|
|
6478
|
-
modelUrl
|
|
6834
|
+
modelUrl,
|
|
6479
6835
|
tokensUrl: config.tokensUrl,
|
|
6480
6836
|
language: config.language,
|
|
6481
6837
|
textNorm: config.textNorm
|
|
6482
6838
|
});
|
|
6483
6839
|
}
|
|
6484
6840
|
if (useWorker === false) {
|
|
6485
|
-
|
|
6841
|
+
logger9.info("Creating SenseVoiceInference (main thread)");
|
|
6486
6842
|
return new SenseVoiceInference({
|
|
6487
|
-
modelUrl
|
|
6843
|
+
modelUrl,
|
|
6488
6844
|
tokensUrl: config.tokensUrl,
|
|
6489
6845
|
language: config.language,
|
|
6490
6846
|
textNorm: config.textNorm
|
|
6491
6847
|
});
|
|
6492
6848
|
}
|
|
6493
6849
|
if (SenseVoiceWorker.isSupported() && !isIOS()) {
|
|
6494
|
-
|
|
6850
|
+
logger9.info("Auto-detected: creating SenseVoiceWorker (off-main-thread)");
|
|
6495
6851
|
return new SenseVoiceWorker({
|
|
6496
|
-
modelUrl
|
|
6852
|
+
modelUrl,
|
|
6497
6853
|
tokensUrl: config.tokensUrl,
|
|
6498
6854
|
language: config.language,
|
|
6499
6855
|
textNorm: config.textNorm
|
|
6500
6856
|
});
|
|
6501
6857
|
}
|
|
6502
|
-
|
|
6858
|
+
logger9.info("Auto-detected: creating SenseVoiceInference (main thread)", {
|
|
6503
6859
|
reason: isIOS() ? "iOS (shared ORT instance)" : "Worker unsupported"
|
|
6504
6860
|
});
|
|
6505
6861
|
return new SenseVoiceInference({
|
|
6506
|
-
modelUrl
|
|
6862
|
+
modelUrl,
|
|
6507
6863
|
tokensUrl: config.tokensUrl,
|
|
6508
6864
|
language: config.language,
|
|
6509
6865
|
textNorm: config.textNorm
|
|
@@ -6511,7 +6867,7 @@ function createSenseVoice(config) {
|
|
|
6511
6867
|
}
|
|
6512
6868
|
|
|
6513
6869
|
// src/inference/Wav2ArkitCpuInference.ts
|
|
6514
|
-
var
|
|
6870
|
+
var logger10 = createLogger("Wav2ArkitCpu");
|
|
6515
6871
|
var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
6516
6872
|
constructor(config) {
|
|
6517
6873
|
this.modelId = "wav2arkit_cpu";
|
|
@@ -6553,16 +6909,16 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6553
6909
|
});
|
|
6554
6910
|
try {
|
|
6555
6911
|
const preference = this.config.backend || "wasm";
|
|
6556
|
-
|
|
6912
|
+
logger10.info("Loading ONNX Runtime...", { preference });
|
|
6557
6913
|
const { ort, backend } = await getOnnxRuntimeForPreference(preference);
|
|
6558
6914
|
this.ort = ort;
|
|
6559
6915
|
this._backend = backend;
|
|
6560
|
-
|
|
6916
|
+
logger10.info("ONNX Runtime loaded", { backend: this._backend });
|
|
6561
6917
|
const modelUrl = this.config.modelUrl;
|
|
6562
6918
|
const dataUrl = this.config.externalDataUrl !== false ? this.config.externalDataUrl || `${modelUrl}.data` : null;
|
|
6563
6919
|
const sessionOptions = getSessionOptions(this._backend);
|
|
6564
6920
|
if (isIOS()) {
|
|
6565
|
-
|
|
6921
|
+
logger10.info("iOS: passing model URLs directly to ORT (low-memory path)", {
|
|
6566
6922
|
modelUrl,
|
|
6567
6923
|
dataUrl
|
|
6568
6924
|
});
|
|
@@ -6584,15 +6940,15 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6584
6940
|
const isCached = await cache.has(modelUrl);
|
|
6585
6941
|
let modelBuffer;
|
|
6586
6942
|
if (isCached) {
|
|
6587
|
-
|
|
6943
|
+
logger10.debug("Loading model from cache", { modelUrl });
|
|
6588
6944
|
modelBuffer = await cache.get(modelUrl);
|
|
6589
6945
|
if (!modelBuffer) {
|
|
6590
|
-
|
|
6946
|
+
logger10.warn("Cache corruption detected, clearing and retrying", { modelUrl });
|
|
6591
6947
|
await cache.delete(modelUrl);
|
|
6592
6948
|
modelBuffer = await fetchWithCache(modelUrl);
|
|
6593
6949
|
}
|
|
6594
6950
|
} else {
|
|
6595
|
-
|
|
6951
|
+
logger10.debug("Fetching and caching model graph", { modelUrl });
|
|
6596
6952
|
modelBuffer = await fetchWithCache(modelUrl);
|
|
6597
6953
|
}
|
|
6598
6954
|
if (!modelBuffer) {
|
|
@@ -6603,31 +6959,31 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6603
6959
|
try {
|
|
6604
6960
|
const isDataCached = await cache.has(dataUrl);
|
|
6605
6961
|
if (isDataCached) {
|
|
6606
|
-
|
|
6962
|
+
logger10.debug("Loading external data from cache", { dataUrl });
|
|
6607
6963
|
externalDataBuffer = await cache.get(dataUrl);
|
|
6608
6964
|
if (!externalDataBuffer) {
|
|
6609
|
-
|
|
6965
|
+
logger10.warn("Cache corruption for external data, retrying", { dataUrl });
|
|
6610
6966
|
await cache.delete(dataUrl);
|
|
6611
6967
|
externalDataBuffer = await fetchWithCache(dataUrl);
|
|
6612
6968
|
}
|
|
6613
6969
|
} else {
|
|
6614
|
-
|
|
6970
|
+
logger10.info("Fetching external model data", {
|
|
6615
6971
|
dataUrl,
|
|
6616
6972
|
note: "This may be a large download (400MB+)"
|
|
6617
6973
|
});
|
|
6618
6974
|
externalDataBuffer = await fetchWithCache(dataUrl);
|
|
6619
6975
|
}
|
|
6620
|
-
|
|
6976
|
+
logger10.info("External data loaded", {
|
|
6621
6977
|
size: formatBytes(externalDataBuffer.byteLength)
|
|
6622
6978
|
});
|
|
6623
6979
|
} catch (err) {
|
|
6624
|
-
|
|
6980
|
+
logger10.debug("No external data file found (single-file model)", {
|
|
6625
6981
|
dataUrl,
|
|
6626
6982
|
error: err.message
|
|
6627
6983
|
});
|
|
6628
6984
|
}
|
|
6629
6985
|
}
|
|
6630
|
-
|
|
6986
|
+
logger10.debug("Creating ONNX session", {
|
|
6631
6987
|
graphSize: formatBytes(modelBuffer.byteLength),
|
|
6632
6988
|
externalDataSize: externalDataBuffer ? formatBytes(externalDataBuffer.byteLength) : "none",
|
|
6633
6989
|
backend: this._backend
|
|
@@ -6643,7 +6999,7 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6643
6999
|
this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
|
|
6644
7000
|
}
|
|
6645
7001
|
const loadTimeMs = performance.now() - startTime;
|
|
6646
|
-
|
|
7002
|
+
logger10.info("Model loaded successfully", {
|
|
6647
7003
|
backend: this._backend,
|
|
6648
7004
|
loadTimeMs: Math.round(loadTimeMs),
|
|
6649
7005
|
inputs: this.session.inputNames,
|
|
@@ -6659,12 +7015,12 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6659
7015
|
model: "wav2arkit_cpu",
|
|
6660
7016
|
backend: this._backend
|
|
6661
7017
|
});
|
|
6662
|
-
|
|
7018
|
+
logger10.debug("Running warmup inference");
|
|
6663
7019
|
const warmupStart = performance.now();
|
|
6664
7020
|
const silentAudio = new Float32Array(16e3);
|
|
6665
7021
|
await this.infer(silentAudio);
|
|
6666
7022
|
const warmupTimeMs = performance.now() - warmupStart;
|
|
6667
|
-
|
|
7023
|
+
logger10.info("Warmup inference complete", {
|
|
6668
7024
|
warmupTimeMs: Math.round(warmupTimeMs),
|
|
6669
7025
|
backend: this._backend
|
|
6670
7026
|
});
|
|
@@ -6751,7 +7107,7 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6751
7107
|
const symmetrized = symmetrizeBlendshapes(rawFrame);
|
|
6752
7108
|
blendshapes.push(symmetrized);
|
|
6753
7109
|
}
|
|
6754
|
-
|
|
7110
|
+
logger10.trace("Inference completed", {
|
|
6755
7111
|
inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
|
|
6756
7112
|
numFrames,
|
|
6757
7113
|
inputSamples
|
|
@@ -6779,7 +7135,7 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6779
7135
|
const errMsg = err instanceof Error ? err.message : String(err);
|
|
6780
7136
|
if (errMsg.includes("timed out")) {
|
|
6781
7137
|
this.poisoned = true;
|
|
6782
|
-
|
|
7138
|
+
logger10.error("CRITICAL: Inference session timed out \u2014 Wav2ArkitCpu is dead. Page reload required.", {
|
|
6783
7139
|
backend: this._backend,
|
|
6784
7140
|
timeoutMs: _Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS
|
|
6785
7141
|
});
|
|
@@ -6787,7 +7143,7 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6787
7143
|
const oomError = new Error(
|
|
6788
7144
|
`Wav2ArkitCpu inference failed with raw C++ exception pointer (0x${err.toString(16)}). This is likely an OOM crash in WASM. Try reloading the page.`
|
|
6789
7145
|
);
|
|
6790
|
-
|
|
7146
|
+
logger10.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
|
|
6791
7147
|
pointer: `0x${err.toString(16)}`,
|
|
6792
7148
|
backend: this._backend
|
|
6793
7149
|
});
|
|
@@ -6800,7 +7156,7 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
|
6800
7156
|
reject(oomError);
|
|
6801
7157
|
return;
|
|
6802
7158
|
} else {
|
|
6803
|
-
|
|
7159
|
+
logger10.error("Inference failed", { error: errMsg, backend: this._backend });
|
|
6804
7160
|
}
|
|
6805
7161
|
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
6806
7162
|
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
@@ -6827,7 +7183,7 @@ _Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS = 5e3;
|
|
|
6827
7183
|
var Wav2ArkitCpuInference = _Wav2ArkitCpuInference;
|
|
6828
7184
|
|
|
6829
7185
|
// src/inference/Wav2ArkitCpuWorker.ts
|
|
6830
|
-
var
|
|
7186
|
+
var logger11 = createLogger("Wav2ArkitCpuWorker");
|
|
6831
7187
|
var WASM_CDN_PATH4 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
|
|
6832
7188
|
var LOAD_TIMEOUT_MS2 = 42e4;
|
|
6833
7189
|
var INFERENCE_TIMEOUT_MS2 = 5e3;
|
|
@@ -7114,7 +7470,7 @@ var Wav2ArkitCpuWorker = class {
|
|
|
7114
7470
|
this.handleWorkerMessage(event.data);
|
|
7115
7471
|
};
|
|
7116
7472
|
worker.onerror = (error) => {
|
|
7117
|
-
|
|
7473
|
+
logger11.error("Worker error", { error: error.message });
|
|
7118
7474
|
for (const [, resolver] of this.pendingResolvers) {
|
|
7119
7475
|
resolver.reject(new Error(`Worker error: ${error.message}`));
|
|
7120
7476
|
}
|
|
@@ -7190,10 +7546,10 @@ var Wav2ArkitCpuWorker = class {
|
|
|
7190
7546
|
"model.backend_requested": "wasm"
|
|
7191
7547
|
});
|
|
7192
7548
|
try {
|
|
7193
|
-
|
|
7549
|
+
logger11.info("Creating wav2arkit_cpu worker...");
|
|
7194
7550
|
this.worker = this.createWorker();
|
|
7195
7551
|
const externalDataUrl = this.config.externalDataUrl !== false ? this.config.externalDataUrl || `${this.config.modelUrl}.data` : null;
|
|
7196
|
-
|
|
7552
|
+
logger11.info("Loading model in worker...", {
|
|
7197
7553
|
modelUrl: this.config.modelUrl,
|
|
7198
7554
|
externalDataUrl,
|
|
7199
7555
|
isIOS: isIOS()
|
|
@@ -7211,7 +7567,7 @@ var Wav2ArkitCpuWorker = class {
|
|
|
7211
7567
|
);
|
|
7212
7568
|
this._isLoaded = true;
|
|
7213
7569
|
const loadTimeMs = performance.now() - startTime;
|
|
7214
|
-
|
|
7570
|
+
logger11.info("Wav2ArkitCpu worker loaded successfully", {
|
|
7215
7571
|
backend: "wasm",
|
|
7216
7572
|
loadTimeMs: Math.round(loadTimeMs),
|
|
7217
7573
|
workerLoadTimeMs: Math.round(result.loadTimeMs),
|
|
@@ -7296,7 +7652,7 @@ var Wav2ArkitCpuWorker = class {
|
|
|
7296
7652
|
for (let f = 0; f < numFrames; f++) {
|
|
7297
7653
|
blendshapes.push(flatBuffer.slice(f * numBlendshapes, (f + 1) * numBlendshapes));
|
|
7298
7654
|
}
|
|
7299
|
-
|
|
7655
|
+
logger11.trace("Worker inference completed", {
|
|
7300
7656
|
inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
|
|
7301
7657
|
workerTimeMs: Math.round(result.inferenceTimeMs * 100) / 100,
|
|
7302
7658
|
numFrames,
|
|
@@ -7326,12 +7682,12 @@ var Wav2ArkitCpuWorker = class {
|
|
|
7326
7682
|
const errMsg = err instanceof Error ? err.message : String(err);
|
|
7327
7683
|
if (errMsg.includes("timed out")) {
|
|
7328
7684
|
this.poisoned = true;
|
|
7329
|
-
|
|
7685
|
+
logger11.error("CRITICAL: Worker inference timed out \u2014 Wav2ArkitCpu worker is dead. Page reload required.", {
|
|
7330
7686
|
backend: "wasm",
|
|
7331
7687
|
timeoutMs: INFERENCE_TIMEOUT_MS2
|
|
7332
7688
|
});
|
|
7333
7689
|
} else {
|
|
7334
|
-
|
|
7690
|
+
logger11.error("Worker inference failed", { error: errMsg, backend: "wasm" });
|
|
7335
7691
|
}
|
|
7336
7692
|
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
7337
7693
|
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
@@ -7369,53 +7725,56 @@ var Wav2ArkitCpuWorker = class {
|
|
|
7369
7725
|
};
|
|
7370
7726
|
|
|
7371
7727
|
// src/inference/createA2E.ts
|
|
7372
|
-
var
|
|
7373
|
-
function createA2E(config) {
|
|
7728
|
+
var logger12 = createLogger("createA2E");
|
|
7729
|
+
function createA2E(config = {}) {
|
|
7374
7730
|
const mode = config.mode ?? "auto";
|
|
7375
7731
|
const fallbackOnError = config.fallbackOnError ?? true;
|
|
7732
|
+
const gpuModelUrl = config.gpuModelUrl ?? DEFAULT_MODEL_URLS.lam;
|
|
7733
|
+
const cpuModelUrl = config.cpuModelUrl ?? DEFAULT_MODEL_URLS.wav2arkitCpu;
|
|
7376
7734
|
let useCpu;
|
|
7377
7735
|
if (mode === "cpu") {
|
|
7378
7736
|
useCpu = true;
|
|
7379
|
-
|
|
7737
|
+
logger12.info("Forcing CPU A2E model (wav2arkit_cpu)");
|
|
7380
7738
|
} else if (mode === "gpu") {
|
|
7381
7739
|
useCpu = false;
|
|
7382
|
-
|
|
7740
|
+
logger12.info("Forcing GPU A2E model (Wav2Vec2)");
|
|
7383
7741
|
} else {
|
|
7384
7742
|
useCpu = shouldUseCpuA2E();
|
|
7385
|
-
|
|
7743
|
+
logger12.info("Auto-detected A2E model", {
|
|
7386
7744
|
useCpu,
|
|
7387
7745
|
isSafari: isSafari()
|
|
7388
7746
|
});
|
|
7389
7747
|
}
|
|
7390
7748
|
if (useCpu) {
|
|
7391
7749
|
if (config.unifiedWorker) {
|
|
7392
|
-
|
|
7750
|
+
logger12.info("Creating Wav2ArkitCpuUnifiedAdapter (404MB, WASM, shared unified worker)");
|
|
7393
7751
|
return new Wav2ArkitCpuUnifiedAdapter(config.unifiedWorker, {
|
|
7394
|
-
modelUrl:
|
|
7752
|
+
modelUrl: cpuModelUrl
|
|
7395
7753
|
});
|
|
7396
7754
|
}
|
|
7397
7755
|
if (config.useWorker && Wav2ArkitCpuWorker.isSupported() && !isIOS()) {
|
|
7398
|
-
|
|
7756
|
+
logger12.info("Creating Wav2ArkitCpuWorker (404MB, WASM, off-main-thread)");
|
|
7399
7757
|
return new Wav2ArkitCpuWorker({
|
|
7400
|
-
modelUrl:
|
|
7758
|
+
modelUrl: cpuModelUrl
|
|
7401
7759
|
});
|
|
7402
7760
|
}
|
|
7403
|
-
|
|
7761
|
+
logger12.info("Creating Wav2ArkitCpuInference (404MB, WASM)");
|
|
7404
7762
|
return new Wav2ArkitCpuInference({
|
|
7405
|
-
modelUrl:
|
|
7763
|
+
modelUrl: cpuModelUrl
|
|
7406
7764
|
});
|
|
7407
7765
|
}
|
|
7766
|
+
const gpuExternalDataUrl = config.gpuExternalDataUrl !== void 0 ? config.gpuExternalDataUrl : void 0;
|
|
7408
7767
|
const gpuInstance = new Wav2Vec2Inference({
|
|
7409
|
-
modelUrl:
|
|
7410
|
-
externalDataUrl:
|
|
7768
|
+
modelUrl: gpuModelUrl,
|
|
7769
|
+
externalDataUrl: gpuExternalDataUrl,
|
|
7411
7770
|
backend: config.gpuBackend ?? "auto",
|
|
7412
7771
|
numIdentityClasses: config.numIdentityClasses
|
|
7413
7772
|
});
|
|
7414
7773
|
if (fallbackOnError) {
|
|
7415
|
-
|
|
7774
|
+
logger12.info("Creating Wav2Vec2Inference with CPU fallback");
|
|
7416
7775
|
return new A2EWithFallback(gpuInstance, config);
|
|
7417
7776
|
}
|
|
7418
|
-
|
|
7777
|
+
logger12.info("Creating Wav2Vec2Inference (no fallback)");
|
|
7419
7778
|
return gpuInstance;
|
|
7420
7779
|
}
|
|
7421
7780
|
var A2EWithFallback = class {
|
|
@@ -7423,6 +7782,7 @@ var A2EWithFallback = class {
|
|
|
7423
7782
|
this.hasFallenBack = false;
|
|
7424
7783
|
this.implementation = gpuInstance;
|
|
7425
7784
|
this.config = config;
|
|
7785
|
+
this.resolvedCpuModelUrl = config.cpuModelUrl ?? DEFAULT_MODEL_URLS.wav2arkitCpu;
|
|
7426
7786
|
}
|
|
7427
7787
|
get modelId() {
|
|
7428
7788
|
return this.implementation.modelId;
|
|
@@ -7444,26 +7804,26 @@ var A2EWithFallback = class {
|
|
|
7444
7804
|
}
|
|
7445
7805
|
}
|
|
7446
7806
|
async fallbackToCpu(reason) {
|
|
7447
|
-
|
|
7807
|
+
logger12.warn("GPU model load failed, falling back to CPU model", { reason });
|
|
7448
7808
|
try {
|
|
7449
7809
|
await this.implementation.dispose();
|
|
7450
7810
|
} catch {
|
|
7451
7811
|
}
|
|
7452
7812
|
if (this.config.unifiedWorker) {
|
|
7453
7813
|
this.implementation = new Wav2ArkitCpuUnifiedAdapter(this.config.unifiedWorker, {
|
|
7454
|
-
modelUrl: this.
|
|
7814
|
+
modelUrl: this.resolvedCpuModelUrl
|
|
7455
7815
|
});
|
|
7456
|
-
|
|
7816
|
+
logger12.info("Fallback to Wav2ArkitCpuUnifiedAdapter successful");
|
|
7457
7817
|
} else if (this.config.useWorker && Wav2ArkitCpuWorker.isSupported() && !isIOS()) {
|
|
7458
7818
|
this.implementation = new Wav2ArkitCpuWorker({
|
|
7459
|
-
modelUrl: this.
|
|
7819
|
+
modelUrl: this.resolvedCpuModelUrl
|
|
7460
7820
|
});
|
|
7461
|
-
|
|
7821
|
+
logger12.info("Fallback to Wav2ArkitCpuWorker successful");
|
|
7462
7822
|
} else {
|
|
7463
7823
|
this.implementation = new Wav2ArkitCpuInference({
|
|
7464
|
-
modelUrl: this.
|
|
7824
|
+
modelUrl: this.resolvedCpuModelUrl
|
|
7465
7825
|
});
|
|
7466
|
-
|
|
7826
|
+
logger12.info("Fallback to Wav2ArkitCpuInference successful");
|
|
7467
7827
|
}
|
|
7468
7828
|
this.hasFallenBack = true;
|
|
7469
7829
|
return await this.implementation.load();
|
|
@@ -7667,7 +8027,7 @@ var EmphasisDetector = class {
|
|
|
7667
8027
|
};
|
|
7668
8028
|
|
|
7669
8029
|
// src/inference/SileroVADInference.ts
|
|
7670
|
-
var
|
|
8030
|
+
var logger13 = createLogger("SileroVAD");
|
|
7671
8031
|
var SileroVADInference = class {
|
|
7672
8032
|
constructor(config) {
|
|
7673
8033
|
this.session = null;
|
|
@@ -7741,23 +8101,23 @@ var SileroVADInference = class {
|
|
|
7741
8101
|
"model.sample_rate": this.config.sampleRate
|
|
7742
8102
|
});
|
|
7743
8103
|
try {
|
|
7744
|
-
|
|
8104
|
+
logger13.info("Loading ONNX Runtime...", { preference: this.config.backend });
|
|
7745
8105
|
const { ort, backend } = await getOnnxRuntimeForPreference(this.config.backend);
|
|
7746
8106
|
this.ort = ort;
|
|
7747
8107
|
this._backend = backend;
|
|
7748
|
-
|
|
8108
|
+
logger13.info("ONNX Runtime loaded", { backend: this._backend });
|
|
7749
8109
|
const cache = getModelCache();
|
|
7750
8110
|
const modelUrl = this.config.modelUrl;
|
|
7751
8111
|
const isCached = await cache.has(modelUrl);
|
|
7752
8112
|
let modelBuffer;
|
|
7753
8113
|
if (isCached) {
|
|
7754
|
-
|
|
8114
|
+
logger13.debug("Loading model from cache", { modelUrl });
|
|
7755
8115
|
modelBuffer = await cache.get(modelUrl);
|
|
7756
8116
|
} else {
|
|
7757
|
-
|
|
8117
|
+
logger13.debug("Fetching and caching model", { modelUrl });
|
|
7758
8118
|
modelBuffer = await fetchWithCache(modelUrl);
|
|
7759
8119
|
}
|
|
7760
|
-
|
|
8120
|
+
logger13.debug("Creating ONNX session", {
|
|
7761
8121
|
size: formatBytes(modelBuffer.byteLength),
|
|
7762
8122
|
backend: this._backend
|
|
7763
8123
|
});
|
|
@@ -7766,7 +8126,7 @@ var SileroVADInference = class {
|
|
|
7766
8126
|
this.session = await ort.InferenceSession.create(modelData, sessionOptions);
|
|
7767
8127
|
this.reset();
|
|
7768
8128
|
const loadTimeMs = performance.now() - startTime;
|
|
7769
|
-
|
|
8129
|
+
logger13.info("Model loaded successfully", {
|
|
7770
8130
|
backend: this._backend,
|
|
7771
8131
|
loadTimeMs: Math.round(loadTimeMs),
|
|
7772
8132
|
sampleRate: this.config.sampleRate,
|
|
@@ -7821,7 +8181,7 @@ var SileroVADInference = class {
|
|
|
7821
8181
|
[]
|
|
7822
8182
|
);
|
|
7823
8183
|
} catch (e) {
|
|
7824
|
-
|
|
8184
|
+
logger13.warn("BigInt64Array not available, using bigint array fallback", {
|
|
7825
8185
|
error: e instanceof Error ? e.message : String(e)
|
|
7826
8186
|
});
|
|
7827
8187
|
this.srTensor = new this.ort.Tensor(
|
|
@@ -7927,7 +8287,7 @@ var SileroVADInference = class {
|
|
|
7927
8287
|
this.preSpeechBuffer.shift();
|
|
7928
8288
|
}
|
|
7929
8289
|
}
|
|
7930
|
-
|
|
8290
|
+
logger13.trace("Skipping VAD inference - audio too quiet", {
|
|
7931
8291
|
rms: Math.round(rms * 1e4) / 1e4,
|
|
7932
8292
|
threshold: MIN_ENERGY_THRESHOLD
|
|
7933
8293
|
});
|
|
@@ -7981,7 +8341,7 @@ var SileroVADInference = class {
|
|
|
7981
8341
|
if (isSpeech && !this.wasSpeaking) {
|
|
7982
8342
|
preSpeechChunks = [...this.preSpeechBuffer];
|
|
7983
8343
|
this.preSpeechBuffer = [];
|
|
7984
|
-
|
|
8344
|
+
logger13.debug("Speech started with pre-speech buffer", {
|
|
7985
8345
|
preSpeechChunks: preSpeechChunks.length,
|
|
7986
8346
|
durationMs: Math.round(preSpeechChunks.length * this.getChunkDurationMs())
|
|
7987
8347
|
});
|
|
@@ -7994,7 +8354,7 @@ var SileroVADInference = class {
|
|
|
7994
8354
|
this.preSpeechBuffer = [];
|
|
7995
8355
|
}
|
|
7996
8356
|
this.wasSpeaking = isSpeech;
|
|
7997
|
-
|
|
8357
|
+
logger13.trace("VAD inference completed", {
|
|
7998
8358
|
probability: Math.round(probability * 1e3) / 1e3,
|
|
7999
8359
|
isSpeech,
|
|
8000
8360
|
inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100
|
|
@@ -8025,7 +8385,7 @@ var SileroVADInference = class {
|
|
|
8025
8385
|
const oomError = new Error(
|
|
8026
8386
|
`SileroVAD inference failed with raw C++ exception pointer (0x${err.toString(16)}). This is likely an OOM crash in WASM. Try reducing concurrent model sessions or reloading the page.`
|
|
8027
8387
|
);
|
|
8028
|
-
|
|
8388
|
+
logger13.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
|
|
8029
8389
|
pointer: `0x${err.toString(16)}`,
|
|
8030
8390
|
backend: this._backend
|
|
8031
8391
|
});
|
|
@@ -8068,7 +8428,7 @@ var SileroVADInference = class {
|
|
|
8068
8428
|
SileroVADInference.isWebGPUAvailable = isWebGPUAvailable;
|
|
8069
8429
|
|
|
8070
8430
|
// src/inference/SileroVADWorker.ts
|
|
8071
|
-
var
|
|
8431
|
+
var logger14 = createLogger("SileroVADWorker");
|
|
8072
8432
|
var WASM_CDN_PATH5 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
|
|
8073
8433
|
var LOAD_TIMEOUT_MS3 = 12e4;
|
|
8074
8434
|
var INFERENCE_TIMEOUT_MS3 = 1e3;
|
|
@@ -8353,7 +8713,7 @@ var SileroVADWorker = class {
|
|
|
8353
8713
|
this.handleWorkerMessage(event.data);
|
|
8354
8714
|
};
|
|
8355
8715
|
worker.onerror = (error) => {
|
|
8356
|
-
|
|
8716
|
+
logger14.error("Worker error", { error: error.message });
|
|
8357
8717
|
for (const [, resolver] of this.pendingResolvers) {
|
|
8358
8718
|
resolver.reject(new Error(`Worker error: ${error.message}`));
|
|
8359
8719
|
}
|
|
@@ -8429,9 +8789,9 @@ var SileroVADWorker = class {
|
|
|
8429
8789
|
"model.sample_rate": this.config.sampleRate
|
|
8430
8790
|
});
|
|
8431
8791
|
try {
|
|
8432
|
-
|
|
8792
|
+
logger14.info("Creating VAD worker...");
|
|
8433
8793
|
this.worker = this.createWorker();
|
|
8434
|
-
|
|
8794
|
+
logger14.info("Loading model in worker...", {
|
|
8435
8795
|
modelUrl: this.config.modelUrl,
|
|
8436
8796
|
sampleRate: this.config.sampleRate
|
|
8437
8797
|
});
|
|
@@ -8447,7 +8807,7 @@ var SileroVADWorker = class {
|
|
|
8447
8807
|
);
|
|
8448
8808
|
this._isLoaded = true;
|
|
8449
8809
|
const loadTimeMs = performance.now() - startTime;
|
|
8450
|
-
|
|
8810
|
+
logger14.info("VAD worker loaded successfully", {
|
|
8451
8811
|
backend: "wasm",
|
|
8452
8812
|
loadTimeMs: Math.round(loadTimeMs),
|
|
8453
8813
|
workerLoadTimeMs: Math.round(result.loadTimeMs),
|
|
@@ -8554,7 +8914,7 @@ var SileroVADWorker = class {
|
|
|
8554
8914
|
if (isSpeech && !this.wasSpeaking) {
|
|
8555
8915
|
preSpeechChunks = [...this.preSpeechBuffer];
|
|
8556
8916
|
this.preSpeechBuffer = [];
|
|
8557
|
-
|
|
8917
|
+
logger14.debug("Speech started with pre-speech buffer", {
|
|
8558
8918
|
preSpeechChunks: preSpeechChunks.length,
|
|
8559
8919
|
durationMs: Math.round(preSpeechChunks.length * this.getChunkDurationMs())
|
|
8560
8920
|
});
|
|
@@ -8567,7 +8927,7 @@ var SileroVADWorker = class {
|
|
|
8567
8927
|
this.preSpeechBuffer = [];
|
|
8568
8928
|
}
|
|
8569
8929
|
this.wasSpeaking = isSpeech;
|
|
8570
|
-
|
|
8930
|
+
logger14.trace("VAD worker inference completed", {
|
|
8571
8931
|
probability: Math.round(result.probability * 1e3) / 1e3,
|
|
8572
8932
|
isSpeech,
|
|
8573
8933
|
inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
|
|
@@ -8635,63 +8995,65 @@ var SileroVADWorker = class {
|
|
|
8635
8995
|
};
|
|
8636
8996
|
|
|
8637
8997
|
// src/inference/createSileroVAD.ts
|
|
8638
|
-
var
|
|
8998
|
+
var logger15 = createLogger("createSileroVAD");
|
|
8639
8999
|
function supportsVADWorker() {
|
|
8640
9000
|
if (typeof Worker === "undefined") {
|
|
8641
|
-
|
|
9001
|
+
logger15.debug("Worker not supported: Worker constructor undefined");
|
|
8642
9002
|
return false;
|
|
8643
9003
|
}
|
|
8644
9004
|
if (typeof URL === "undefined" || typeof URL.createObjectURL === "undefined") {
|
|
8645
|
-
|
|
9005
|
+
logger15.debug("Worker not supported: URL.createObjectURL unavailable");
|
|
8646
9006
|
return false;
|
|
8647
9007
|
}
|
|
8648
9008
|
if (typeof Blob === "undefined") {
|
|
8649
|
-
|
|
9009
|
+
logger15.debug("Worker not supported: Blob constructor unavailable");
|
|
8650
9010
|
return false;
|
|
8651
9011
|
}
|
|
8652
9012
|
return true;
|
|
8653
9013
|
}
|
|
8654
|
-
function createSileroVAD(config) {
|
|
9014
|
+
function createSileroVAD(config = {}) {
|
|
9015
|
+
const modelUrl = config.modelUrl ?? DEFAULT_MODEL_URLS.sileroVad;
|
|
9016
|
+
const resolvedConfig = { ...config, modelUrl };
|
|
8655
9017
|
if (config.unifiedWorker) {
|
|
8656
|
-
|
|
8657
|
-
return new SileroVADUnifiedAdapter(config.unifiedWorker,
|
|
9018
|
+
logger15.info("Creating SileroVADUnifiedAdapter (shared unified worker)");
|
|
9019
|
+
return new SileroVADUnifiedAdapter(config.unifiedWorker, resolvedConfig);
|
|
8658
9020
|
}
|
|
8659
9021
|
const fallbackOnError = config.fallbackOnError ?? true;
|
|
8660
9022
|
let useWorker;
|
|
8661
9023
|
if (config.useWorker !== void 0) {
|
|
8662
9024
|
useWorker = config.useWorker;
|
|
8663
|
-
|
|
9025
|
+
logger15.debug("Worker preference explicitly set", { useWorker });
|
|
8664
9026
|
} else {
|
|
8665
9027
|
const workerSupported = supportsVADWorker();
|
|
8666
9028
|
const onMobile = isMobile();
|
|
8667
9029
|
useWorker = workerSupported && !onMobile;
|
|
8668
|
-
|
|
9030
|
+
logger15.debug("Auto-detected Worker preference", {
|
|
8669
9031
|
useWorker,
|
|
8670
9032
|
workerSupported,
|
|
8671
9033
|
onMobile
|
|
8672
9034
|
});
|
|
8673
9035
|
}
|
|
8674
9036
|
if (useWorker) {
|
|
8675
|
-
|
|
9037
|
+
logger15.info("Creating SileroVADWorker (off-main-thread)");
|
|
8676
9038
|
const worker = new SileroVADWorker({
|
|
8677
|
-
modelUrl
|
|
9039
|
+
modelUrl,
|
|
8678
9040
|
sampleRate: config.sampleRate,
|
|
8679
9041
|
threshold: config.threshold,
|
|
8680
9042
|
preSpeechBufferChunks: config.preSpeechBufferChunks
|
|
8681
9043
|
});
|
|
8682
9044
|
if (fallbackOnError) {
|
|
8683
|
-
return new VADWorkerWithFallback(worker,
|
|
9045
|
+
return new VADWorkerWithFallback(worker, resolvedConfig);
|
|
8684
9046
|
}
|
|
8685
9047
|
return worker;
|
|
8686
9048
|
}
|
|
8687
|
-
|
|
8688
|
-
return new SileroVADInference(
|
|
9049
|
+
logger15.info("Creating SileroVADInference (main thread)");
|
|
9050
|
+
return new SileroVADInference(resolvedConfig);
|
|
8689
9051
|
}
|
|
8690
9052
|
var VADWorkerWithFallback = class {
|
|
8691
|
-
constructor(worker,
|
|
9053
|
+
constructor(worker, resolvedConfig) {
|
|
8692
9054
|
this.hasFallenBack = false;
|
|
8693
9055
|
this.implementation = worker;
|
|
8694
|
-
this.
|
|
9056
|
+
this.resolvedConfig = resolvedConfig;
|
|
8695
9057
|
}
|
|
8696
9058
|
get backend() {
|
|
8697
9059
|
if (!this.isLoaded) return null;
|
|
@@ -8710,16 +9072,16 @@ var VADWorkerWithFallback = class {
|
|
|
8710
9072
|
try {
|
|
8711
9073
|
return await this.implementation.load();
|
|
8712
9074
|
} catch (error) {
|
|
8713
|
-
|
|
9075
|
+
logger15.warn("Worker load failed, falling back to main thread", {
|
|
8714
9076
|
error: error instanceof Error ? error.message : String(error)
|
|
8715
9077
|
});
|
|
8716
9078
|
try {
|
|
8717
9079
|
await this.implementation.dispose();
|
|
8718
9080
|
} catch {
|
|
8719
9081
|
}
|
|
8720
|
-
this.implementation = new SileroVADInference(this.
|
|
9082
|
+
this.implementation = new SileroVADInference(this.resolvedConfig);
|
|
8721
9083
|
this.hasFallenBack = true;
|
|
8722
|
-
|
|
9084
|
+
logger15.info("Fallback to SileroVADInference successful");
|
|
8723
9085
|
return await this.implementation.load();
|
|
8724
9086
|
}
|
|
8725
9087
|
}
|
|
@@ -8741,7 +9103,7 @@ var VADWorkerWithFallback = class {
|
|
|
8741
9103
|
};
|
|
8742
9104
|
|
|
8743
9105
|
// src/inference/A2EOrchestrator.ts
|
|
8744
|
-
var
|
|
9106
|
+
var logger16 = createLogger("A2EOrchestrator");
|
|
8745
9107
|
var A2EOrchestrator = class {
|
|
8746
9108
|
constructor(config) {
|
|
8747
9109
|
this.a2e = null;
|
|
@@ -8782,7 +9144,7 @@ var A2EOrchestrator = class {
|
|
|
8782
9144
|
*/
|
|
8783
9145
|
async load() {
|
|
8784
9146
|
if (this.disposed) throw new Error("A2EOrchestrator has been disposed");
|
|
8785
|
-
|
|
9147
|
+
logger16.info("Loading A2E model...");
|
|
8786
9148
|
this.a2e = createA2E({
|
|
8787
9149
|
gpuModelUrl: this.config.gpuModelUrl,
|
|
8788
9150
|
gpuExternalDataUrl: this.config.gpuExternalDataUrl,
|
|
@@ -8799,7 +9161,7 @@ var A2EOrchestrator = class {
|
|
|
8799
9161
|
onError: this.config.onError
|
|
8800
9162
|
});
|
|
8801
9163
|
this._isReady = true;
|
|
8802
|
-
|
|
9164
|
+
logger16.info("A2E model loaded", {
|
|
8803
9165
|
backend: info.backend,
|
|
8804
9166
|
loadTimeMs: info.loadTimeMs,
|
|
8805
9167
|
modelId: this.a2e.modelId
|
|
@@ -8854,10 +9216,10 @@ var A2EOrchestrator = class {
|
|
|
8854
9216
|
this.scriptProcessor.connect(this.audioContext.destination);
|
|
8855
9217
|
this._isStreaming = true;
|
|
8856
9218
|
this.processor.startDrip();
|
|
8857
|
-
|
|
9219
|
+
logger16.info("Mic capture started", { sampleRate: this.nativeSampleRate });
|
|
8858
9220
|
} catch (err) {
|
|
8859
9221
|
const error = err instanceof Error ? err : new Error(String(err));
|
|
8860
|
-
|
|
9222
|
+
logger16.error("Failed to start mic capture", { error: error.message });
|
|
8861
9223
|
this.config.onError?.(error);
|
|
8862
9224
|
throw error;
|
|
8863
9225
|
}
|
|
@@ -8885,7 +9247,7 @@ var A2EOrchestrator = class {
|
|
|
8885
9247
|
});
|
|
8886
9248
|
this.audioContext = null;
|
|
8887
9249
|
}
|
|
8888
|
-
|
|
9250
|
+
logger16.info("Mic capture stopped");
|
|
8889
9251
|
}
|
|
8890
9252
|
/**
|
|
8891
9253
|
* Dispose of all resources
|
|
@@ -8908,7 +9270,7 @@ var A2EOrchestrator = class {
|
|
|
8908
9270
|
};
|
|
8909
9271
|
|
|
8910
9272
|
// src/inference/SafariSpeechRecognition.ts
|
|
8911
|
-
var
|
|
9273
|
+
var logger17 = createLogger("SafariSpeech");
|
|
8912
9274
|
var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
8913
9275
|
constructor(config = {}) {
|
|
8914
9276
|
this.recognition = null;
|
|
@@ -8927,7 +9289,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
8927
9289
|
interimResults: config.interimResults ?? true,
|
|
8928
9290
|
maxAlternatives: config.maxAlternatives ?? 1
|
|
8929
9291
|
};
|
|
8930
|
-
|
|
9292
|
+
logger17.debug("SafariSpeechRecognition created", {
|
|
8931
9293
|
language: this.config.language,
|
|
8932
9294
|
continuous: this.config.continuous
|
|
8933
9295
|
});
|
|
@@ -8988,7 +9350,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
8988
9350
|
*/
|
|
8989
9351
|
async start() {
|
|
8990
9352
|
if (this.isListening) {
|
|
8991
|
-
|
|
9353
|
+
logger17.warn("Already listening");
|
|
8992
9354
|
return;
|
|
8993
9355
|
}
|
|
8994
9356
|
if (!_SafariSpeechRecognition.isAvailable()) {
|
|
@@ -9018,7 +9380,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
9018
9380
|
this.isListening = true;
|
|
9019
9381
|
this.startTime = performance.now();
|
|
9020
9382
|
this.accumulatedText = "";
|
|
9021
|
-
|
|
9383
|
+
logger17.info("Speech recognition started", {
|
|
9022
9384
|
language: this.config.language
|
|
9023
9385
|
});
|
|
9024
9386
|
span?.end();
|
|
@@ -9033,7 +9395,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
9033
9395
|
*/
|
|
9034
9396
|
async stop() {
|
|
9035
9397
|
if (!this.isListening || !this.recognition) {
|
|
9036
|
-
|
|
9398
|
+
logger17.warn("Not currently listening");
|
|
9037
9399
|
return {
|
|
9038
9400
|
text: this.accumulatedText,
|
|
9039
9401
|
language: this.config.language,
|
|
@@ -9062,7 +9424,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
9062
9424
|
if (this.recognition && this.isListening) {
|
|
9063
9425
|
this.recognition.abort();
|
|
9064
9426
|
this.isListening = false;
|
|
9065
|
-
|
|
9427
|
+
logger17.info("Speech recognition aborted");
|
|
9066
9428
|
}
|
|
9067
9429
|
}
|
|
9068
9430
|
/**
|
|
@@ -9093,7 +9455,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
9093
9455
|
this.isListening = false;
|
|
9094
9456
|
this.resultCallbacks = [];
|
|
9095
9457
|
this.errorCallbacks = [];
|
|
9096
|
-
|
|
9458
|
+
logger17.debug("SafariSpeechRecognition disposed");
|
|
9097
9459
|
}
|
|
9098
9460
|
/**
|
|
9099
9461
|
* Set up event handlers for the recognition instance
|
|
@@ -9121,7 +9483,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
9121
9483
|
confidence: alternative.confidence
|
|
9122
9484
|
};
|
|
9123
9485
|
this.emitResult(speechResult);
|
|
9124
|
-
|
|
9486
|
+
logger17.trace("Speech result", {
|
|
9125
9487
|
text: text.substring(0, 50),
|
|
9126
9488
|
isFinal,
|
|
9127
9489
|
confidence: alternative.confidence
|
|
@@ -9131,12 +9493,12 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
9131
9493
|
span?.end();
|
|
9132
9494
|
} catch (error) {
|
|
9133
9495
|
span?.endWithError(error instanceof Error ? error : new Error(String(error)));
|
|
9134
|
-
|
|
9496
|
+
logger17.error("Error processing speech result", { error });
|
|
9135
9497
|
}
|
|
9136
9498
|
};
|
|
9137
9499
|
this.recognition.onerror = (event) => {
|
|
9138
9500
|
const error = new Error(`Speech recognition error: ${event.error} - ${event.message}`);
|
|
9139
|
-
|
|
9501
|
+
logger17.error("Speech recognition error", { error: event.error, message: event.message });
|
|
9140
9502
|
this.emitError(error);
|
|
9141
9503
|
if (this.stopRejecter) {
|
|
9142
9504
|
this.stopRejecter(error);
|
|
@@ -9146,7 +9508,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
9146
9508
|
};
|
|
9147
9509
|
this.recognition.onend = () => {
|
|
9148
9510
|
this.isListening = false;
|
|
9149
|
-
|
|
9511
|
+
logger17.info("Speech recognition ended", {
|
|
9150
9512
|
totalText: this.accumulatedText.length,
|
|
9151
9513
|
durationMs: performance.now() - this.startTime
|
|
9152
9514
|
});
|
|
@@ -9163,13 +9525,13 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
9163
9525
|
}
|
|
9164
9526
|
};
|
|
9165
9527
|
this.recognition.onstart = () => {
|
|
9166
|
-
|
|
9528
|
+
logger17.debug("Speech recognition started by browser");
|
|
9167
9529
|
};
|
|
9168
9530
|
this.recognition.onspeechstart = () => {
|
|
9169
|
-
|
|
9531
|
+
logger17.debug("Speech detected");
|
|
9170
9532
|
};
|
|
9171
9533
|
this.recognition.onspeechend = () => {
|
|
9172
|
-
|
|
9534
|
+
logger17.debug("Speech ended");
|
|
9173
9535
|
};
|
|
9174
9536
|
}
|
|
9175
9537
|
/**
|
|
@@ -9180,7 +9542,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
9180
9542
|
try {
|
|
9181
9543
|
callback(result);
|
|
9182
9544
|
} catch (error) {
|
|
9183
|
-
|
|
9545
|
+
logger17.error("Error in result callback", { error });
|
|
9184
9546
|
}
|
|
9185
9547
|
}
|
|
9186
9548
|
}
|
|
@@ -9192,7 +9554,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
9192
9554
|
try {
|
|
9193
9555
|
callback(error);
|
|
9194
9556
|
} catch (callbackError) {
|
|
9195
|
-
|
|
9557
|
+
logger17.error("Error in error callback", { error: callbackError });
|
|
9196
9558
|
}
|
|
9197
9559
|
}
|
|
9198
9560
|
}
|
|
@@ -9762,338 +10124,32 @@ var AnimationGraph = class extends EventEmitter {
|
|
|
9762
10124
|
}
|
|
9763
10125
|
};
|
|
9764
10126
|
|
|
9765
|
-
// src/animation/simplex2d.ts
|
|
9766
|
-
var perm = new Uint8Array(512);
|
|
9767
|
-
var grad2 = [
|
|
9768
|
-
[1, 1],
|
|
9769
|
-
[-1, 1],
|
|
9770
|
-
[1, -1],
|
|
9771
|
-
[-1, -1],
|
|
9772
|
-
[1, 0],
|
|
9773
|
-
[-1, 0],
|
|
9774
|
-
[0, 1],
|
|
9775
|
-
[0, -1]
|
|
9776
|
-
];
|
|
9777
|
-
var p = [
|
|
9778
|
-
151,
|
|
9779
|
-
160,
|
|
9780
|
-
137,
|
|
9781
|
-
91,
|
|
9782
|
-
90,
|
|
9783
|
-
15,
|
|
9784
|
-
131,
|
|
9785
|
-
13,
|
|
9786
|
-
201,
|
|
9787
|
-
95,
|
|
9788
|
-
96,
|
|
9789
|
-
53,
|
|
9790
|
-
194,
|
|
9791
|
-
233,
|
|
9792
|
-
7,
|
|
9793
|
-
225,
|
|
9794
|
-
140,
|
|
9795
|
-
36,
|
|
9796
|
-
103,
|
|
9797
|
-
30,
|
|
9798
|
-
69,
|
|
9799
|
-
142,
|
|
9800
|
-
8,
|
|
9801
|
-
99,
|
|
9802
|
-
37,
|
|
9803
|
-
240,
|
|
9804
|
-
21,
|
|
9805
|
-
10,
|
|
9806
|
-
23,
|
|
9807
|
-
190,
|
|
9808
|
-
6,
|
|
9809
|
-
148,
|
|
9810
|
-
247,
|
|
9811
|
-
120,
|
|
9812
|
-
234,
|
|
9813
|
-
75,
|
|
9814
|
-
0,
|
|
9815
|
-
26,
|
|
9816
|
-
197,
|
|
9817
|
-
62,
|
|
9818
|
-
94,
|
|
9819
|
-
252,
|
|
9820
|
-
219,
|
|
9821
|
-
203,
|
|
9822
|
-
117,
|
|
9823
|
-
35,
|
|
9824
|
-
11,
|
|
9825
|
-
32,
|
|
9826
|
-
57,
|
|
9827
|
-
177,
|
|
9828
|
-
33,
|
|
9829
|
-
88,
|
|
9830
|
-
237,
|
|
9831
|
-
149,
|
|
9832
|
-
56,
|
|
9833
|
-
87,
|
|
9834
|
-
174,
|
|
9835
|
-
20,
|
|
9836
|
-
125,
|
|
9837
|
-
136,
|
|
9838
|
-
171,
|
|
9839
|
-
168,
|
|
9840
|
-
68,
|
|
9841
|
-
175,
|
|
9842
|
-
74,
|
|
9843
|
-
165,
|
|
9844
|
-
71,
|
|
9845
|
-
134,
|
|
9846
|
-
139,
|
|
9847
|
-
48,
|
|
9848
|
-
27,
|
|
9849
|
-
166,
|
|
9850
|
-
77,
|
|
9851
|
-
146,
|
|
9852
|
-
158,
|
|
9853
|
-
231,
|
|
9854
|
-
83,
|
|
9855
|
-
111,
|
|
9856
|
-
229,
|
|
9857
|
-
122,
|
|
9858
|
-
60,
|
|
9859
|
-
211,
|
|
9860
|
-
133,
|
|
9861
|
-
230,
|
|
9862
|
-
220,
|
|
9863
|
-
105,
|
|
9864
|
-
92,
|
|
9865
|
-
41,
|
|
9866
|
-
55,
|
|
9867
|
-
46,
|
|
9868
|
-
245,
|
|
9869
|
-
40,
|
|
9870
|
-
244,
|
|
9871
|
-
102,
|
|
9872
|
-
143,
|
|
9873
|
-
54,
|
|
9874
|
-
65,
|
|
9875
|
-
25,
|
|
9876
|
-
63,
|
|
9877
|
-
161,
|
|
9878
|
-
1,
|
|
9879
|
-
216,
|
|
9880
|
-
80,
|
|
9881
|
-
73,
|
|
9882
|
-
209,
|
|
9883
|
-
76,
|
|
9884
|
-
132,
|
|
9885
|
-
187,
|
|
9886
|
-
208,
|
|
9887
|
-
89,
|
|
9888
|
-
18,
|
|
9889
|
-
169,
|
|
9890
|
-
200,
|
|
9891
|
-
196,
|
|
9892
|
-
135,
|
|
9893
|
-
130,
|
|
9894
|
-
116,
|
|
9895
|
-
188,
|
|
9896
|
-
159,
|
|
9897
|
-
86,
|
|
9898
|
-
164,
|
|
9899
|
-
100,
|
|
9900
|
-
109,
|
|
9901
|
-
198,
|
|
9902
|
-
173,
|
|
9903
|
-
186,
|
|
9904
|
-
3,
|
|
9905
|
-
64,
|
|
9906
|
-
52,
|
|
9907
|
-
217,
|
|
9908
|
-
226,
|
|
9909
|
-
250,
|
|
9910
|
-
124,
|
|
9911
|
-
123,
|
|
9912
|
-
5,
|
|
9913
|
-
202,
|
|
9914
|
-
38,
|
|
9915
|
-
147,
|
|
9916
|
-
118,
|
|
9917
|
-
126,
|
|
9918
|
-
255,
|
|
9919
|
-
82,
|
|
9920
|
-
85,
|
|
9921
|
-
212,
|
|
9922
|
-
207,
|
|
9923
|
-
206,
|
|
9924
|
-
59,
|
|
9925
|
-
227,
|
|
9926
|
-
47,
|
|
9927
|
-
16,
|
|
9928
|
-
58,
|
|
9929
|
-
17,
|
|
9930
|
-
182,
|
|
9931
|
-
189,
|
|
9932
|
-
28,
|
|
9933
|
-
42,
|
|
9934
|
-
223,
|
|
9935
|
-
183,
|
|
9936
|
-
170,
|
|
9937
|
-
213,
|
|
9938
|
-
119,
|
|
9939
|
-
248,
|
|
9940
|
-
152,
|
|
9941
|
-
2,
|
|
9942
|
-
44,
|
|
9943
|
-
154,
|
|
9944
|
-
163,
|
|
9945
|
-
70,
|
|
9946
|
-
221,
|
|
9947
|
-
153,
|
|
9948
|
-
101,
|
|
9949
|
-
155,
|
|
9950
|
-
167,
|
|
9951
|
-
43,
|
|
9952
|
-
172,
|
|
9953
|
-
9,
|
|
9954
|
-
129,
|
|
9955
|
-
22,
|
|
9956
|
-
39,
|
|
9957
|
-
253,
|
|
9958
|
-
19,
|
|
9959
|
-
98,
|
|
9960
|
-
108,
|
|
9961
|
-
110,
|
|
9962
|
-
79,
|
|
9963
|
-
113,
|
|
9964
|
-
224,
|
|
9965
|
-
232,
|
|
9966
|
-
178,
|
|
9967
|
-
185,
|
|
9968
|
-
112,
|
|
9969
|
-
104,
|
|
9970
|
-
218,
|
|
9971
|
-
246,
|
|
9972
|
-
97,
|
|
9973
|
-
228,
|
|
9974
|
-
251,
|
|
9975
|
-
34,
|
|
9976
|
-
242,
|
|
9977
|
-
193,
|
|
9978
|
-
238,
|
|
9979
|
-
210,
|
|
9980
|
-
144,
|
|
9981
|
-
12,
|
|
9982
|
-
191,
|
|
9983
|
-
179,
|
|
9984
|
-
162,
|
|
9985
|
-
241,
|
|
9986
|
-
81,
|
|
9987
|
-
51,
|
|
9988
|
-
145,
|
|
9989
|
-
235,
|
|
9990
|
-
249,
|
|
9991
|
-
14,
|
|
9992
|
-
239,
|
|
9993
|
-
107,
|
|
9994
|
-
49,
|
|
9995
|
-
192,
|
|
9996
|
-
214,
|
|
9997
|
-
31,
|
|
9998
|
-
181,
|
|
9999
|
-
199,
|
|
10000
|
-
106,
|
|
10001
|
-
157,
|
|
10002
|
-
184,
|
|
10003
|
-
84,
|
|
10004
|
-
204,
|
|
10005
|
-
176,
|
|
10006
|
-
115,
|
|
10007
|
-
121,
|
|
10008
|
-
50,
|
|
10009
|
-
45,
|
|
10010
|
-
127,
|
|
10011
|
-
4,
|
|
10012
|
-
150,
|
|
10013
|
-
254,
|
|
10014
|
-
138,
|
|
10015
|
-
236,
|
|
10016
|
-
205,
|
|
10017
|
-
93,
|
|
10018
|
-
222,
|
|
10019
|
-
114,
|
|
10020
|
-
67,
|
|
10021
|
-
29,
|
|
10022
|
-
24,
|
|
10023
|
-
72,
|
|
10024
|
-
243,
|
|
10025
|
-
141,
|
|
10026
|
-
128,
|
|
10027
|
-
195,
|
|
10028
|
-
78,
|
|
10029
|
-
66,
|
|
10030
|
-
215,
|
|
10031
|
-
61,
|
|
10032
|
-
156,
|
|
10033
|
-
180
|
|
10034
|
-
];
|
|
10035
|
-
for (let i = 0; i < 256; i++) {
|
|
10036
|
-
perm[i] = p[i];
|
|
10037
|
-
perm[i + 256] = p[i];
|
|
10038
|
-
}
|
|
10039
|
-
var F2 = 0.5 * (Math.sqrt(3) - 1);
|
|
10040
|
-
var G2 = (3 - Math.sqrt(3)) / 6;
|
|
10041
|
-
function dot2(g, x, y) {
|
|
10042
|
-
return g[0] * x + g[1] * y;
|
|
10043
|
-
}
|
|
10044
|
-
function simplex2d(x, y) {
|
|
10045
|
-
const s = (x + y) * F2;
|
|
10046
|
-
const i = Math.floor(x + s);
|
|
10047
|
-
const j = Math.floor(y + s);
|
|
10048
|
-
const t = (i + j) * G2;
|
|
10049
|
-
const X0 = i - t;
|
|
10050
|
-
const Y0 = j - t;
|
|
10051
|
-
const x0 = x - X0;
|
|
10052
|
-
const y0 = y - Y0;
|
|
10053
|
-
const i1 = x0 > y0 ? 1 : 0;
|
|
10054
|
-
const j1 = x0 > y0 ? 0 : 1;
|
|
10055
|
-
const x1 = x0 - i1 + G2;
|
|
10056
|
-
const y1 = y0 - j1 + G2;
|
|
10057
|
-
const x2 = x0 - 1 + 2 * G2;
|
|
10058
|
-
const y2 = y0 - 1 + 2 * G2;
|
|
10059
|
-
const ii = i & 255;
|
|
10060
|
-
const jj = j & 255;
|
|
10061
|
-
const gi0 = perm[ii + perm[jj]] % 8;
|
|
10062
|
-
const gi1 = perm[ii + i1 + perm[jj + j1]] % 8;
|
|
10063
|
-
const gi2 = perm[ii + 1 + perm[jj + 1]] % 8;
|
|
10064
|
-
let n0 = 0;
|
|
10065
|
-
let t0 = 0.5 - x0 * x0 - y0 * y0;
|
|
10066
|
-
if (t0 >= 0) {
|
|
10067
|
-
t0 *= t0;
|
|
10068
|
-
n0 = t0 * t0 * dot2(grad2[gi0], x0, y0);
|
|
10069
|
-
}
|
|
10070
|
-
let n1 = 0;
|
|
10071
|
-
let t1 = 0.5 - x1 * x1 - y1 * y1;
|
|
10072
|
-
if (t1 >= 0) {
|
|
10073
|
-
t1 *= t1;
|
|
10074
|
-
n1 = t1 * t1 * dot2(grad2[gi1], x1, y1);
|
|
10075
|
-
}
|
|
10076
|
-
let n2 = 0;
|
|
10077
|
-
let t2 = 0.5 - x2 * x2 - y2 * y2;
|
|
10078
|
-
if (t2 >= 0) {
|
|
10079
|
-
t2 *= t2;
|
|
10080
|
-
n2 = t2 * t2 * dot2(grad2[gi2], x2, y2);
|
|
10081
|
-
}
|
|
10082
|
-
return 70 * (n0 + n1 + n2);
|
|
10083
|
-
}
|
|
10084
|
-
|
|
10085
10127
|
// src/animation/ProceduralLifeLayer.ts
|
|
10128
|
+
var import_simplex_noise = require("simplex-noise");
|
|
10129
|
+
var simplex2d = (0, import_simplex_noise.createNoise2D)();
|
|
10130
|
+
var LIFE_BS_INDEX = /* @__PURE__ */ new Map();
|
|
10131
|
+
for (let i = 0; i < LAM_BLENDSHAPES.length; i++) {
|
|
10132
|
+
LIFE_BS_INDEX.set(LAM_BLENDSHAPES[i], i);
|
|
10133
|
+
}
|
|
10086
10134
|
var PHASE_OPEN = 0;
|
|
10087
10135
|
var PHASE_CLOSING = 1;
|
|
10088
10136
|
var PHASE_CLOSED = 2;
|
|
10089
10137
|
var PHASE_OPENING = 3;
|
|
10090
|
-
var BLINK_CLOSE_DURATION = 0.
|
|
10138
|
+
var BLINK_CLOSE_DURATION = 0.092;
|
|
10091
10139
|
var BLINK_HOLD_DURATION = 0.04;
|
|
10092
|
-
var BLINK_OPEN_DURATION = 0.
|
|
10140
|
+
var BLINK_OPEN_DURATION = 0.242;
|
|
10093
10141
|
var BLINK_ASYMMETRY_DELAY = 8e-3;
|
|
10142
|
+
var BLINK_IBI_MU = Math.log(5.97);
|
|
10143
|
+
var BLINK_IBI_SIGMA = 0.89;
|
|
10094
10144
|
var GAZE_BREAK_DURATION = 0.12;
|
|
10095
10145
|
var GAZE_BREAK_HOLD_DURATION = 0.3;
|
|
10096
10146
|
var GAZE_BREAK_RETURN_DURATION = 0.15;
|
|
10147
|
+
var GAZE_STATE_PARAMS = {
|
|
10148
|
+
idle: { interval: [2, 5], amplitude: [0.15, 0.4] },
|
|
10149
|
+
listening: { interval: [4, 10], amplitude: [0.1, 0.25] },
|
|
10150
|
+
thinking: { interval: [1, 3], amplitude: [0.2, 0.5] },
|
|
10151
|
+
speaking: { interval: [2, 6], amplitude: [0.15, 0.35] }
|
|
10152
|
+
};
|
|
10097
10153
|
var EYE_NOISE_X_FREQ = 0.8;
|
|
10098
10154
|
var EYE_NOISE_Y_FREQ = 0.6;
|
|
10099
10155
|
var EYE_NOISE_X_PHASE = 73.1;
|
|
@@ -10121,6 +10177,12 @@ function smoothStep(t) {
|
|
|
10121
10177
|
function softClamp(v, max) {
|
|
10122
10178
|
return Math.tanh(v / max) * max;
|
|
10123
10179
|
}
|
|
10180
|
+
function sampleLogNormal(mu, sigma) {
|
|
10181
|
+
const u1 = Math.random();
|
|
10182
|
+
const u2 = Math.random();
|
|
10183
|
+
const z = Math.sqrt(-2 * Math.log(u1 || 1e-10)) * Math.cos(2 * Math.PI * u2);
|
|
10184
|
+
return Math.exp(mu + sigma * z);
|
|
10185
|
+
}
|
|
10124
10186
|
var ProceduralLifeLayer = class {
|
|
10125
10187
|
constructor(config) {
|
|
10126
10188
|
// Blink state
|
|
@@ -10133,7 +10195,7 @@ var ProceduralLifeLayer = class {
|
|
|
10133
10195
|
// Eye contact (smoothed)
|
|
10134
10196
|
this.smoothedEyeX = 0;
|
|
10135
10197
|
this.smoothedEyeY = 0;
|
|
10136
|
-
// Eye micro-motion
|
|
10198
|
+
// Eye micro-motion
|
|
10137
10199
|
this.eyeNoiseTime = 0;
|
|
10138
10200
|
// Gaze break state
|
|
10139
10201
|
this.gazeBreakTimer = 0;
|
|
@@ -10143,6 +10205,8 @@ var ProceduralLifeLayer = class {
|
|
|
10143
10205
|
this.gazeBreakTargetY = 0;
|
|
10144
10206
|
this.gazeBreakCurrentX = 0;
|
|
10145
10207
|
this.gazeBreakCurrentY = 0;
|
|
10208
|
+
// Conversational state for gaze
|
|
10209
|
+
this.currentState = null;
|
|
10146
10210
|
// Breathing / postural sway
|
|
10147
10211
|
this.microMotionTime = 0;
|
|
10148
10212
|
this.breathingPhase = 0;
|
|
@@ -10151,6 +10215,7 @@ var ProceduralLifeLayer = class {
|
|
|
10151
10215
|
this.previousEnergy = 0;
|
|
10152
10216
|
this.emphasisLevel = 0;
|
|
10153
10217
|
this.blinkIntervalRange = config?.blinkIntervalRange ?? [2.5, 6];
|
|
10218
|
+
this.useLogNormalBlinks = !config?.blinkIntervalRange;
|
|
10154
10219
|
this.gazeBreakIntervalRange = config?.gazeBreakIntervalRange ?? [3, 8];
|
|
10155
10220
|
this.gazeBreakAmplitudeRange = config?.gazeBreakAmplitudeRange ?? [0.15, 0.4];
|
|
10156
10221
|
this.eyeNoiseAmplitude = config?.eyeNoiseAmplitude ?? 0.06;
|
|
@@ -10160,7 +10225,7 @@ var ProceduralLifeLayer = class {
|
|
|
10160
10225
|
this.posturalSwayAmplitude = config?.posturalSwayAmplitude ?? 2e-3;
|
|
10161
10226
|
this.eyeMaxDeviation = config?.eyeMaxDeviation ?? 0.8;
|
|
10162
10227
|
this.eyeSmoothing = config?.eyeSmoothing ?? 15;
|
|
10163
|
-
this.blinkInterval =
|
|
10228
|
+
this.blinkInterval = this.nextBlinkInterval();
|
|
10164
10229
|
this.gazeBreakInterval = randomRange(...this.gazeBreakIntervalRange);
|
|
10165
10230
|
}
|
|
10166
10231
|
/**
|
|
@@ -10175,6 +10240,7 @@ var ProceduralLifeLayer = class {
|
|
|
10175
10240
|
const eyeTargetY = input?.eyeTargetY ?? 0;
|
|
10176
10241
|
const audioEnergy = input?.audioEnergy ?? 0;
|
|
10177
10242
|
const isSpeaking = input?.isSpeaking ?? false;
|
|
10243
|
+
this.currentState = input?.state ?? null;
|
|
10178
10244
|
const safeDelta = Math.min(delta, 0.1);
|
|
10179
10245
|
const blendshapes = {};
|
|
10180
10246
|
this.updateBlinks(delta);
|
|
@@ -10213,6 +10279,12 @@ var ProceduralLifeLayer = class {
|
|
|
10213
10279
|
const swayAmp = this.posturalSwayAmplitude;
|
|
10214
10280
|
const swayX = Math.sin(this.microMotionTime * 0.7) * swayAmp + Math.sin(this.microMotionTime * 1.3) * swayAmp * 0.5;
|
|
10215
10281
|
const swayY = Math.sin(this.microMotionTime * 0.5) * swayAmp * 0.75 + Math.sin(this.microMotionTime * 0.9) * swayAmp * 0.5;
|
|
10282
|
+
const breathVal = Math.sin(this.breathingPhase);
|
|
10283
|
+
if (breathVal > 0) {
|
|
10284
|
+
blendshapes["jawOpen"] = breathVal * 0.015;
|
|
10285
|
+
blendshapes["noseSneerLeft"] = breathVal * 8e-3;
|
|
10286
|
+
blendshapes["noseSneerRight"] = breathVal * 8e-3;
|
|
10287
|
+
}
|
|
10216
10288
|
return {
|
|
10217
10289
|
blendshapes,
|
|
10218
10290
|
headDelta: {
|
|
@@ -10221,12 +10293,35 @@ var ProceduralLifeLayer = class {
|
|
|
10221
10293
|
}
|
|
10222
10294
|
};
|
|
10223
10295
|
}
|
|
10296
|
+
/**
|
|
10297
|
+
* Write life layer output directly to a Float32Array[52] in LAM_BLENDSHAPES order.
|
|
10298
|
+
*
|
|
10299
|
+
* Includes micro-jitter (0.4% amplitude simplex noise on all channels) to
|
|
10300
|
+
* break uncanny stillness on undriven channels.
|
|
10301
|
+
*
|
|
10302
|
+
* @param delta - Time since last frame in seconds
|
|
10303
|
+
* @param input - Per-frame input
|
|
10304
|
+
* @param out - Pre-allocated Float32Array(52) to write into
|
|
10305
|
+
*/
|
|
10306
|
+
updateToArray(delta, input, out) {
|
|
10307
|
+
out.fill(0);
|
|
10308
|
+
const result = this.update(delta, input);
|
|
10309
|
+
for (const [name, value] of Object.entries(result.blendshapes)) {
|
|
10310
|
+
const idx = LIFE_BS_INDEX.get(name);
|
|
10311
|
+
if (idx !== void 0) {
|
|
10312
|
+
out[idx] = value;
|
|
10313
|
+
}
|
|
10314
|
+
}
|
|
10315
|
+
for (let i = 0; i < 52; i++) {
|
|
10316
|
+
out[i] += simplex2d(this.noiseTime * 0.3, i * 7.13) * 4e-3;
|
|
10317
|
+
}
|
|
10318
|
+
}
|
|
10224
10319
|
/**
|
|
10225
10320
|
* Reset all internal state to initial values.
|
|
10226
10321
|
*/
|
|
10227
10322
|
reset() {
|
|
10228
10323
|
this.blinkTimer = 0;
|
|
10229
|
-
this.blinkInterval =
|
|
10324
|
+
this.blinkInterval = this.nextBlinkInterval();
|
|
10230
10325
|
this.blinkPhase = PHASE_OPEN;
|
|
10231
10326
|
this.blinkProgress = 0;
|
|
10232
10327
|
this.asymmetryRight = 0.97;
|
|
@@ -10243,6 +10338,7 @@ var ProceduralLifeLayer = class {
|
|
|
10243
10338
|
this.gazeBreakTargetY = 0;
|
|
10244
10339
|
this.gazeBreakCurrentX = 0;
|
|
10245
10340
|
this.gazeBreakCurrentY = 0;
|
|
10341
|
+
this.currentState = null;
|
|
10246
10342
|
this.microMotionTime = 0;
|
|
10247
10343
|
this.breathingPhase = 0;
|
|
10248
10344
|
this.noiseTime = 0;
|
|
@@ -10250,6 +10346,21 @@ var ProceduralLifeLayer = class {
|
|
|
10250
10346
|
this.emphasisLevel = 0;
|
|
10251
10347
|
}
|
|
10252
10348
|
// =====================================================================
|
|
10349
|
+
// PRIVATE: Blink interval sampling
|
|
10350
|
+
// =====================================================================
|
|
10351
|
+
/**
|
|
10352
|
+
* Sample next blink interval.
|
|
10353
|
+
* Uses log-normal distribution (PMC3565584) when using default config,
|
|
10354
|
+
* or uniform random when custom blinkIntervalRange is provided.
|
|
10355
|
+
*/
|
|
10356
|
+
nextBlinkInterval() {
|
|
10357
|
+
if (this.useLogNormalBlinks) {
|
|
10358
|
+
const sample = sampleLogNormal(BLINK_IBI_MU, BLINK_IBI_SIGMA);
|
|
10359
|
+
return clamp(sample, 1.5, 12);
|
|
10360
|
+
}
|
|
10361
|
+
return randomRange(...this.blinkIntervalRange);
|
|
10362
|
+
}
|
|
10363
|
+
// =====================================================================
|
|
10253
10364
|
// PRIVATE: Blink system
|
|
10254
10365
|
// =====================================================================
|
|
10255
10366
|
updateBlinks(delta) {
|
|
@@ -10258,7 +10369,7 @@ var ProceduralLifeLayer = class {
|
|
|
10258
10369
|
this.blinkPhase = PHASE_CLOSING;
|
|
10259
10370
|
this.blinkProgress = 0;
|
|
10260
10371
|
this.blinkTimer = 0;
|
|
10261
|
-
this.blinkInterval =
|
|
10372
|
+
this.blinkInterval = this.nextBlinkInterval();
|
|
10262
10373
|
this.asymmetryRight = 0.95 + Math.random() * 0.08;
|
|
10263
10374
|
}
|
|
10264
10375
|
if (this.blinkPhase > PHASE_OPEN) {
|
|
@@ -10314,18 +10425,32 @@ var ProceduralLifeLayer = class {
|
|
|
10314
10425
|
return { x, y };
|
|
10315
10426
|
}
|
|
10316
10427
|
// =====================================================================
|
|
10317
|
-
// PRIVATE: Gaze breaks
|
|
10428
|
+
// PRIVATE: Gaze breaks (state-dependent)
|
|
10318
10429
|
// =====================================================================
|
|
10430
|
+
/**
|
|
10431
|
+
* Get active gaze parameters — uses state-dependent params when
|
|
10432
|
+
* conversational state is provided, otherwise falls back to config ranges.
|
|
10433
|
+
*/
|
|
10434
|
+
getActiveGazeParams() {
|
|
10435
|
+
if (this.currentState && GAZE_STATE_PARAMS[this.currentState]) {
|
|
10436
|
+
return GAZE_STATE_PARAMS[this.currentState];
|
|
10437
|
+
}
|
|
10438
|
+
return {
|
|
10439
|
+
interval: this.gazeBreakIntervalRange,
|
|
10440
|
+
amplitude: this.gazeBreakAmplitudeRange
|
|
10441
|
+
};
|
|
10442
|
+
}
|
|
10319
10443
|
updateGazeBreaks(delta) {
|
|
10320
10444
|
this.gazeBreakTimer += delta;
|
|
10321
10445
|
if (this.gazeBreakTimer >= this.gazeBreakInterval && this.gazeBreakPhase === PHASE_OPEN) {
|
|
10322
10446
|
this.gazeBreakPhase = PHASE_CLOSING;
|
|
10323
10447
|
this.gazeBreakProgress = 0;
|
|
10324
10448
|
this.gazeBreakTimer = 0;
|
|
10325
|
-
const
|
|
10449
|
+
const params = this.getActiveGazeParams();
|
|
10450
|
+
const amp = randomRange(...params.amplitude);
|
|
10326
10451
|
this.gazeBreakTargetX = (Math.random() - 0.5) * 2 * amp;
|
|
10327
10452
|
this.gazeBreakTargetY = (Math.random() - 0.5) * amp * 0.4;
|
|
10328
|
-
this.gazeBreakInterval = randomRange(...
|
|
10453
|
+
this.gazeBreakInterval = randomRange(...params.interval);
|
|
10329
10454
|
}
|
|
10330
10455
|
if (this.gazeBreakPhase > PHASE_OPEN) {
|
|
10331
10456
|
this.gazeBreakProgress += delta;
|
|
@@ -10390,6 +10515,971 @@ var ProceduralLifeLayer = class {
|
|
|
10390
10515
|
}
|
|
10391
10516
|
};
|
|
10392
10517
|
|
|
10518
|
+
// src/face/FACSMapping.ts
|
|
10519
|
+
var EMOTION_TO_AU = {
|
|
10520
|
+
joy: [
|
|
10521
|
+
{ au: "AU6", intensity: 0.7, region: "upper" },
|
|
10522
|
+
// cheek raise (Duchenne)
|
|
10523
|
+
{ au: "AU12", intensity: 0.8, region: "lower" }
|
|
10524
|
+
// lip corner pull (smile)
|
|
10525
|
+
],
|
|
10526
|
+
anger: [
|
|
10527
|
+
{ au: "AU4", intensity: 0.8, region: "upper" },
|
|
10528
|
+
// brow lower
|
|
10529
|
+
{ au: "AU5", intensity: 0.4, region: "upper" },
|
|
10530
|
+
// upper lid raise
|
|
10531
|
+
{ au: "AU7", intensity: 0.3, region: "upper" },
|
|
10532
|
+
// lid tighten
|
|
10533
|
+
{ au: "AU23", intensity: 0.6, region: "lower" }
|
|
10534
|
+
// lip tighten
|
|
10535
|
+
],
|
|
10536
|
+
sadness: [
|
|
10537
|
+
{ au: "AU1", intensity: 0.7, region: "upper" },
|
|
10538
|
+
// inner brow raise
|
|
10539
|
+
{ au: "AU4", intensity: 0.3, region: "upper" },
|
|
10540
|
+
// brow lower (furrow)
|
|
10541
|
+
{ au: "AU15", intensity: 0.5, region: "lower" }
|
|
10542
|
+
// lip corner depress
|
|
10543
|
+
],
|
|
10544
|
+
fear: [
|
|
10545
|
+
{ au: "AU1", intensity: 0.6, region: "upper" },
|
|
10546
|
+
// inner brow raise
|
|
10547
|
+
{ au: "AU2", intensity: 0.5, region: "upper" },
|
|
10548
|
+
// outer brow raise
|
|
10549
|
+
{ au: "AU4", intensity: 0.3, region: "upper" },
|
|
10550
|
+
// brow lower
|
|
10551
|
+
{ au: "AU5", intensity: 0.5, region: "upper" },
|
|
10552
|
+
// upper lid raise
|
|
10553
|
+
{ au: "AU20", intensity: 0.4, region: "lower" }
|
|
10554
|
+
// lip stretch
|
|
10555
|
+
],
|
|
10556
|
+
disgust: [
|
|
10557
|
+
{ au: "AU9", intensity: 0.7, region: "upper" },
|
|
10558
|
+
// nose wrinkle
|
|
10559
|
+
{ au: "AU10", intensity: 0.5, region: "lower" },
|
|
10560
|
+
// upper lip raise
|
|
10561
|
+
{ au: "AU15", intensity: 0.4, region: "lower" }
|
|
10562
|
+
// lip corner depress
|
|
10563
|
+
],
|
|
10564
|
+
amazement: [
|
|
10565
|
+
{ au: "AU1", intensity: 0.6, region: "upper" },
|
|
10566
|
+
// inner brow raise
|
|
10567
|
+
{ au: "AU2", intensity: 0.7, region: "upper" },
|
|
10568
|
+
// outer brow raise
|
|
10569
|
+
{ au: "AU5", intensity: 0.6, region: "upper" },
|
|
10570
|
+
// upper lid raise
|
|
10571
|
+
{ au: "AU26", intensity: 0.4, region: "lower" }
|
|
10572
|
+
// jaw drop
|
|
10573
|
+
],
|
|
10574
|
+
grief: [
|
|
10575
|
+
{ au: "AU1", intensity: 0.8, region: "upper" },
|
|
10576
|
+
// inner brow raise
|
|
10577
|
+
{ au: "AU4", intensity: 0.5, region: "upper" },
|
|
10578
|
+
// brow lower
|
|
10579
|
+
{ au: "AU6", intensity: 0.3, region: "upper" },
|
|
10580
|
+
// cheek raise (grief cry)
|
|
10581
|
+
{ au: "AU15", intensity: 0.6, region: "lower" }
|
|
10582
|
+
// lip corner depress
|
|
10583
|
+
],
|
|
10584
|
+
cheekiness: [
|
|
10585
|
+
{ au: "AU2", intensity: 0.4, region: "upper" },
|
|
10586
|
+
// outer brow raise
|
|
10587
|
+
{ au: "AU6", intensity: 0.4, region: "upper" },
|
|
10588
|
+
// cheek raise
|
|
10589
|
+
{ au: "AU12", intensity: 0.6, region: "lower" }
|
|
10590
|
+
// lip corner pull (smirk)
|
|
10591
|
+
],
|
|
10592
|
+
pain: [
|
|
10593
|
+
{ au: "AU4", intensity: 0.7, region: "upper" },
|
|
10594
|
+
// brow lower
|
|
10595
|
+
{ au: "AU6", intensity: 0.4, region: "upper" },
|
|
10596
|
+
// cheek raise (orbicularis)
|
|
10597
|
+
{ au: "AU7", intensity: 0.7, region: "upper" },
|
|
10598
|
+
// lid tighten (squint)
|
|
10599
|
+
{ au: "AU9", intensity: 0.5, region: "upper" }
|
|
10600
|
+
// nose wrinkle
|
|
10601
|
+
],
|
|
10602
|
+
outofbreath: [
|
|
10603
|
+
{ au: "AU1", intensity: 0.3, region: "upper" },
|
|
10604
|
+
// inner brow raise
|
|
10605
|
+
{ au: "AU25", intensity: 0.3, region: "lower" },
|
|
10606
|
+
// lips part
|
|
10607
|
+
{ au: "AU26", intensity: 0.5, region: "lower" }
|
|
10608
|
+
// jaw drop
|
|
10609
|
+
]
|
|
10610
|
+
};
|
|
10611
|
+
var AU_TO_ARKIT = {
|
|
10612
|
+
"AU1": [{ blendshape: "browInnerUp", weight: 1 }],
|
|
10613
|
+
"AU2": [{ blendshape: "browOuterUpLeft", weight: 1 }, { blendshape: "browOuterUpRight", weight: 1 }],
|
|
10614
|
+
"AU4": [{ blendshape: "browDownLeft", weight: 1 }, { blendshape: "browDownRight", weight: 1 }],
|
|
10615
|
+
"AU5": [{ blendshape: "eyeWideLeft", weight: 1 }, { blendshape: "eyeWideRight", weight: 1 }],
|
|
10616
|
+
"AU6": [{ blendshape: "cheekSquintLeft", weight: 1 }, { blendshape: "cheekSquintRight", weight: 1 }],
|
|
10617
|
+
"AU7": [{ blendshape: "eyeSquintLeft", weight: 1 }, { blendshape: "eyeSquintRight", weight: 1 }],
|
|
10618
|
+
"AU9": [{ blendshape: "noseSneerLeft", weight: 1 }, { blendshape: "noseSneerRight", weight: 1 }],
|
|
10619
|
+
"AU10": [{ blendshape: "mouthUpperUpLeft", weight: 1 }, { blendshape: "mouthUpperUpRight", weight: 1 }],
|
|
10620
|
+
"AU12": [{ blendshape: "mouthSmileLeft", weight: 1 }, { blendshape: "mouthSmileRight", weight: 1 }],
|
|
10621
|
+
"AU15": [{ blendshape: "mouthFrownLeft", weight: 1 }, { blendshape: "mouthFrownRight", weight: 1 }],
|
|
10622
|
+
"AU20": [{ blendshape: "mouthStretchLeft", weight: 1 }, { blendshape: "mouthStretchRight", weight: 1 }],
|
|
10623
|
+
"AU23": [{ blendshape: "mouthPressLeft", weight: 1 }, { blendshape: "mouthPressRight", weight: 1 }],
|
|
10624
|
+
"AU25": [{ blendshape: "jawOpen", weight: 0.3 }],
|
|
10625
|
+
"AU26": [{ blendshape: "jawOpen", weight: 1 }]
|
|
10626
|
+
};
|
|
10627
|
+
var ALL_AUS = [...new Set(
|
|
10628
|
+
Object.values(EMOTION_TO_AU).flatMap((activations) => activations.map((a) => a.au))
|
|
10629
|
+
)];
|
|
10630
|
+
|
|
10631
|
+
// src/face/EmotionResolver.ts
|
|
10632
|
+
var BS_INDEX = /* @__PURE__ */ new Map();
|
|
10633
|
+
for (let i = 0; i < LAM_BLENDSHAPES.length; i++) {
|
|
10634
|
+
BS_INDEX.set(LAM_BLENDSHAPES[i], i);
|
|
10635
|
+
}
|
|
10636
|
+
var EmotionResolver = class {
|
|
10637
|
+
constructor() {
|
|
10638
|
+
this.upperBuffer = new Float32Array(52);
|
|
10639
|
+
this.lowerBuffer = new Float32Array(52);
|
|
10640
|
+
}
|
|
10641
|
+
/**
|
|
10642
|
+
* Resolve emotion weights to upper/lower face blendshape contributions.
|
|
10643
|
+
*
|
|
10644
|
+
* @param weights - Emotion channel weights from EmotionController
|
|
10645
|
+
* @param intensity - Global intensity multiplier (0-2). Default: 1.0
|
|
10646
|
+
* @returns Upper and lower face blendshape arrays (52 channels each)
|
|
10647
|
+
*/
|
|
10648
|
+
resolve(weights, intensity = 1) {
|
|
10649
|
+
const upper = this.upperBuffer;
|
|
10650
|
+
const lower = this.lowerBuffer;
|
|
10651
|
+
upper.fill(0);
|
|
10652
|
+
lower.fill(0);
|
|
10653
|
+
for (const emotionName of EMOTION_NAMES) {
|
|
10654
|
+
const emotionWeight = weights[emotionName];
|
|
10655
|
+
if (!emotionWeight || emotionWeight < 0.01) continue;
|
|
10656
|
+
const auActivations = EMOTION_TO_AU[emotionName];
|
|
10657
|
+
if (!auActivations) continue;
|
|
10658
|
+
for (const activation of auActivations) {
|
|
10659
|
+
const arkitMappings = AU_TO_ARKIT[activation.au];
|
|
10660
|
+
if (!arkitMappings) continue;
|
|
10661
|
+
const target = activation.region === "upper" ? upper : lower;
|
|
10662
|
+
const scale = emotionWeight * activation.intensity * intensity;
|
|
10663
|
+
for (const mapping of arkitMappings) {
|
|
10664
|
+
const idx = BS_INDEX.get(mapping.blendshape);
|
|
10665
|
+
if (idx !== void 0) {
|
|
10666
|
+
target[idx] += mapping.weight * scale;
|
|
10667
|
+
}
|
|
10668
|
+
}
|
|
10669
|
+
}
|
|
10670
|
+
}
|
|
10671
|
+
for (let i = 0; i < 52; i++) {
|
|
10672
|
+
if (upper[i] > 1) upper[i] = 1;
|
|
10673
|
+
if (lower[i] > 1) lower[i] = 1;
|
|
10674
|
+
}
|
|
10675
|
+
return {
|
|
10676
|
+
upper: new Float32Array(upper),
|
|
10677
|
+
lower: new Float32Array(lower)
|
|
10678
|
+
};
|
|
10679
|
+
}
|
|
10680
|
+
};
|
|
10681
|
+
|
|
10682
|
+
// src/face/FaceCompositor.ts
|
|
10683
|
+
function smoothstep(t) {
|
|
10684
|
+
return t * t * (3 - 2 * t);
|
|
10685
|
+
}
|
|
10686
|
+
var BS_INDEX2 = /* @__PURE__ */ new Map();
|
|
10687
|
+
for (let i = 0; i < LAM_BLENDSHAPES.length; i++) {
|
|
10688
|
+
BS_INDEX2.set(LAM_BLENDSHAPES[i], i);
|
|
10689
|
+
}
|
|
10690
|
+
var IDX_MOUTH_CLOSE = BS_INDEX2.get("mouthClose");
|
|
10691
|
+
var IS_EYE_CHANNEL = new Array(52).fill(false);
|
|
10692
|
+
for (const name of LAM_BLENDSHAPES) {
|
|
10693
|
+
if (name.startsWith("eyeBlink") || name.startsWith("eyeLook")) {
|
|
10694
|
+
IS_EYE_CHANNEL[BS_INDEX2.get(name)] = true;
|
|
10695
|
+
}
|
|
10696
|
+
}
|
|
10697
|
+
var FaceCompositor = class {
|
|
10698
|
+
constructor(config) {
|
|
10699
|
+
this.emotionResolver = new EmotionResolver();
|
|
10700
|
+
// Pre-allocated buffers
|
|
10701
|
+
this.smoothedUpper = new Float32Array(52);
|
|
10702
|
+
this.smoothedLower = new Float32Array(52);
|
|
10703
|
+
this.lifeBuffer = new Float32Array(52);
|
|
10704
|
+
// Profile arrays (pre-expanded to 52 channels)
|
|
10705
|
+
this.multiplier = new Float32Array(52).fill(1);
|
|
10706
|
+
this.offset = new Float32Array(52);
|
|
10707
|
+
this.lifeLayer = config?.lifeLayer ?? new ProceduralLifeLayer();
|
|
10708
|
+
this.emotionSmoothing = config?.emotionSmoothing ?? 0.12;
|
|
10709
|
+
if (config?.profile) {
|
|
10710
|
+
this.applyProfileArrays(config.profile);
|
|
10711
|
+
}
|
|
10712
|
+
}
|
|
10713
|
+
/**
|
|
10714
|
+
* Compose a single output frame from the 5-stage signal chain.
|
|
10715
|
+
*
|
|
10716
|
+
* @param base - A2E raw output (Float32Array[52], LAM_BLENDSHAPES order)
|
|
10717
|
+
* @param input - Per-frame input (deltaTime, emotion, life layer params)
|
|
10718
|
+
* @returns Float32Array[52] with all values clamped to [0, 1]
|
|
10719
|
+
*/
|
|
10720
|
+
compose(base, input) {
|
|
10721
|
+
const out = new Float32Array(52);
|
|
10722
|
+
out.set(base);
|
|
10723
|
+
const emotion = input.emotion ?? this.stickyEmotion;
|
|
10724
|
+
if (emotion) {
|
|
10725
|
+
const resolved = this.emotionResolver.resolve(
|
|
10726
|
+
emotion,
|
|
10727
|
+
input.emotionIntensity ?? 1
|
|
10728
|
+
);
|
|
10729
|
+
const k = this.emotionSmoothing;
|
|
10730
|
+
for (let i = 0; i < 52; i++) {
|
|
10731
|
+
this.smoothedUpper[i] += (resolved.upper[i] - this.smoothedUpper[i]) * k;
|
|
10732
|
+
this.smoothedLower[i] += (resolved.lower[i] - this.smoothedLower[i]) * k;
|
|
10733
|
+
}
|
|
10734
|
+
const mc = base[IDX_MOUTH_CLOSE];
|
|
10735
|
+
const bilabialSuppress = mc <= 0.3 ? 1 : mc >= 0.7 ? 0.1 : 1 - 0.9 * smoothstep((mc - 0.3) * 2.5);
|
|
10736
|
+
for (let i = 0; i < 52; i++) {
|
|
10737
|
+
out[i] += this.smoothedUpper[i];
|
|
10738
|
+
}
|
|
10739
|
+
for (let i = 0; i < 52; i++) {
|
|
10740
|
+
out[i] *= 1 + this.smoothedLower[i] * bilabialSuppress;
|
|
10741
|
+
}
|
|
10742
|
+
}
|
|
10743
|
+
this.lifeLayer.updateToArray(input.deltaTime, input, this.lifeBuffer);
|
|
10744
|
+
for (let i = 0; i < 52; i++) {
|
|
10745
|
+
if (IS_EYE_CHANNEL[i]) {
|
|
10746
|
+
out[i] = this.lifeBuffer[i];
|
|
10747
|
+
} else {
|
|
10748
|
+
out[i] += this.lifeBuffer[i];
|
|
10749
|
+
}
|
|
10750
|
+
}
|
|
10751
|
+
for (let i = 0; i < 52; i++) {
|
|
10752
|
+
out[i] = out[i] * this.multiplier[i] + this.offset[i];
|
|
10753
|
+
}
|
|
10754
|
+
for (let i = 0; i < 52; i++) {
|
|
10755
|
+
if (out[i] < 0) out[i] = 0;
|
|
10756
|
+
else if (out[i] > 1) out[i] = 1;
|
|
10757
|
+
}
|
|
10758
|
+
return out;
|
|
10759
|
+
}
|
|
10760
|
+
/**
|
|
10761
|
+
* Set sticky emotion (used when input.emotion is not provided).
|
|
10762
|
+
*/
|
|
10763
|
+
setEmotion(weights) {
|
|
10764
|
+
this.stickyEmotion = weights;
|
|
10765
|
+
}
|
|
10766
|
+
/**
|
|
10767
|
+
* Update character profile at runtime.
|
|
10768
|
+
*/
|
|
10769
|
+
setProfile(profile) {
|
|
10770
|
+
this.multiplier.fill(1);
|
|
10771
|
+
this.offset.fill(0);
|
|
10772
|
+
this.applyProfileArrays(profile);
|
|
10773
|
+
}
|
|
10774
|
+
/**
|
|
10775
|
+
* Reset all smoothing state and life layer.
|
|
10776
|
+
*/
|
|
10777
|
+
reset() {
|
|
10778
|
+
this.smoothedUpper.fill(0);
|
|
10779
|
+
this.smoothedLower.fill(0);
|
|
10780
|
+
this.lifeBuffer.fill(0);
|
|
10781
|
+
this.stickyEmotion = void 0;
|
|
10782
|
+
this.lifeLayer.reset();
|
|
10783
|
+
}
|
|
10784
|
+
/** Expand partial profile maps into dense Float32Arrays */
|
|
10785
|
+
applyProfileArrays(profile) {
|
|
10786
|
+
if (profile.multiplier) {
|
|
10787
|
+
for (const [name, value] of Object.entries(profile.multiplier)) {
|
|
10788
|
+
const idx = BS_INDEX2.get(name);
|
|
10789
|
+
if (idx !== void 0 && value !== void 0) {
|
|
10790
|
+
this.multiplier[idx] = value;
|
|
10791
|
+
}
|
|
10792
|
+
}
|
|
10793
|
+
}
|
|
10794
|
+
if (profile.offset) {
|
|
10795
|
+
for (const [name, value] of Object.entries(profile.offset)) {
|
|
10796
|
+
const idx = BS_INDEX2.get(name);
|
|
10797
|
+
if (idx !== void 0 && value !== void 0) {
|
|
10798
|
+
this.offset[idx] = value;
|
|
10799
|
+
}
|
|
10800
|
+
}
|
|
10801
|
+
}
|
|
10802
|
+
}
|
|
10803
|
+
};
|
|
10804
|
+
|
|
10805
|
+
// src/orchestration/MicLipSync.ts
|
|
10806
|
+
var logger18 = createLogger("MicLipSync");
|
|
10807
|
+
var MicLipSync = class extends EventEmitter {
|
|
10808
|
+
constructor(config) {
|
|
10809
|
+
super();
|
|
10810
|
+
this.omoteEvents = new EventEmitter();
|
|
10811
|
+
this._state = "idle";
|
|
10812
|
+
this._isSpeaking = false;
|
|
10813
|
+
this._currentFrame = null;
|
|
10814
|
+
this._currentRawFrame = null;
|
|
10815
|
+
// VAD state
|
|
10816
|
+
this.speechStartTime = 0;
|
|
10817
|
+
this.vadChunkSize = 0;
|
|
10818
|
+
this.vadBuffer = null;
|
|
10819
|
+
this.vadBufferOffset = 0;
|
|
10820
|
+
this.profile = config.profile ?? {};
|
|
10821
|
+
this.vad = config.vad;
|
|
10822
|
+
this.mic = new MicrophoneCapture(this.omoteEvents, {
|
|
10823
|
+
sampleRate: config.sampleRate ?? 16e3,
|
|
10824
|
+
chunkSize: config.micChunkSize ?? 512
|
|
10825
|
+
});
|
|
10826
|
+
this.processor = new A2EProcessor({
|
|
10827
|
+
backend: config.lam,
|
|
10828
|
+
sampleRate: config.sampleRate ?? 16e3,
|
|
10829
|
+
identityIndex: config.identityIndex,
|
|
10830
|
+
onFrame: (raw) => {
|
|
10831
|
+
const scaled = applyProfile(raw, this.profile);
|
|
10832
|
+
this._currentFrame = scaled;
|
|
10833
|
+
this._currentRawFrame = raw;
|
|
10834
|
+
this.emit("frame", { blendshapes: scaled, rawBlendshapes: raw });
|
|
10835
|
+
},
|
|
10836
|
+
onError: (error) => {
|
|
10837
|
+
logger18.error("A2E inference error", { message: error.message });
|
|
10838
|
+
this.emit("error", error);
|
|
10839
|
+
}
|
|
10840
|
+
});
|
|
10841
|
+
this.omoteEvents.on("audio.chunk", ({ pcm }) => {
|
|
10842
|
+
const float32 = int16ToFloat32(pcm);
|
|
10843
|
+
this.processor.pushAudio(float32);
|
|
10844
|
+
if (this.vad) {
|
|
10845
|
+
this.processVAD(float32);
|
|
10846
|
+
}
|
|
10847
|
+
});
|
|
10848
|
+
this.omoteEvents.on("audio.level", (level) => {
|
|
10849
|
+
this.emit("audio:level", level);
|
|
10850
|
+
});
|
|
10851
|
+
if (this.vad) {
|
|
10852
|
+
this.vadChunkSize = this.vad.getChunkSize();
|
|
10853
|
+
this.vadBuffer = new Float32Array(this.vadChunkSize);
|
|
10854
|
+
this.vadBufferOffset = 0;
|
|
10855
|
+
}
|
|
10856
|
+
}
|
|
10857
|
+
/** Current state */
|
|
10858
|
+
get state() {
|
|
10859
|
+
return this._state;
|
|
10860
|
+
}
|
|
10861
|
+
/** Latest blendshape frame (null before first inference) */
|
|
10862
|
+
get currentFrame() {
|
|
10863
|
+
return this._currentFrame;
|
|
10864
|
+
}
|
|
10865
|
+
/** Whether speech is currently detected (requires VAD) */
|
|
10866
|
+
get isSpeaking() {
|
|
10867
|
+
return this._isSpeaking;
|
|
10868
|
+
}
|
|
10869
|
+
/** Current backend type */
|
|
10870
|
+
get backend() {
|
|
10871
|
+
return this.processor ? "active" : null;
|
|
10872
|
+
}
|
|
10873
|
+
// ---------------------------------------------------------------------------
|
|
10874
|
+
// Public API
|
|
10875
|
+
// ---------------------------------------------------------------------------
|
|
10876
|
+
/** Start microphone capture and inference loop */
|
|
10877
|
+
async start() {
|
|
10878
|
+
if (this._state === "active") return;
|
|
10879
|
+
await this.mic.start();
|
|
10880
|
+
this.processor.startDrip();
|
|
10881
|
+
this.emit("mic:start", void 0);
|
|
10882
|
+
this.setState("active");
|
|
10883
|
+
}
|
|
10884
|
+
/** Stop microphone and inference */
|
|
10885
|
+
stop() {
|
|
10886
|
+
if (this._state === "idle") return;
|
|
10887
|
+
this.processor.stopDrip();
|
|
10888
|
+
this.mic.stop();
|
|
10889
|
+
this._isSpeaking = false;
|
|
10890
|
+
this.emit("mic:stop", void 0);
|
|
10891
|
+
this.setState("idle");
|
|
10892
|
+
}
|
|
10893
|
+
/** Pause inference (mic stays open for faster resume) */
|
|
10894
|
+
pause() {
|
|
10895
|
+
if (this._state !== "active") return;
|
|
10896
|
+
this.processor.stopDrip();
|
|
10897
|
+
this.setState("paused");
|
|
10898
|
+
}
|
|
10899
|
+
/** Resume inference after pause */
|
|
10900
|
+
resume() {
|
|
10901
|
+
if (this._state !== "paused") return;
|
|
10902
|
+
this.processor.startDrip();
|
|
10903
|
+
this.setState("active");
|
|
10904
|
+
}
|
|
10905
|
+
/** Update ExpressionProfile at runtime */
|
|
10906
|
+
setProfile(profile) {
|
|
10907
|
+
this.profile = profile;
|
|
10908
|
+
}
|
|
10909
|
+
/** Dispose of all resources */
|
|
10910
|
+
async dispose() {
|
|
10911
|
+
this.stop();
|
|
10912
|
+
this.processor.dispose();
|
|
10913
|
+
}
|
|
10914
|
+
// ---------------------------------------------------------------------------
|
|
10915
|
+
// Internal: VAD processing
|
|
10916
|
+
// ---------------------------------------------------------------------------
|
|
10917
|
+
async processVAD(samples) {
|
|
10918
|
+
if (!this.vad || !this.vadBuffer) return;
|
|
10919
|
+
for (let i = 0; i < samples.length; i++) {
|
|
10920
|
+
this.vadBuffer[this.vadBufferOffset++] = samples[i];
|
|
10921
|
+
if (this.vadBufferOffset >= this.vadChunkSize) {
|
|
10922
|
+
try {
|
|
10923
|
+
const result = await this.vad.process(this.vadBuffer);
|
|
10924
|
+
const wasSpeaking = this._isSpeaking;
|
|
10925
|
+
this._isSpeaking = result.isSpeech;
|
|
10926
|
+
if (!wasSpeaking && result.isSpeech) {
|
|
10927
|
+
this.speechStartTime = performance.now();
|
|
10928
|
+
this.emit("speech:start", void 0);
|
|
10929
|
+
} else if (wasSpeaking && !result.isSpeech) {
|
|
10930
|
+
const durationMs = performance.now() - this.speechStartTime;
|
|
10931
|
+
this.emit("speech:end", { durationMs });
|
|
10932
|
+
}
|
|
10933
|
+
} catch (err) {
|
|
10934
|
+
logger18.warn("VAD process error", { error: String(err) });
|
|
10935
|
+
}
|
|
10936
|
+
this.vadBufferOffset = 0;
|
|
10937
|
+
}
|
|
10938
|
+
}
|
|
10939
|
+
}
|
|
10940
|
+
// ---------------------------------------------------------------------------
|
|
10941
|
+
// Internal: State management
|
|
10942
|
+
// ---------------------------------------------------------------------------
|
|
10943
|
+
setState(state) {
|
|
10944
|
+
if (this._state === state) return;
|
|
10945
|
+
this._state = state;
|
|
10946
|
+
this.emit("state", state);
|
|
10947
|
+
}
|
|
10948
|
+
};
|
|
10949
|
+
|
|
10950
|
+
// src/orchestration/VoicePipeline.ts
|
|
10951
|
+
var logger19 = createLogger("VoicePipeline");
|
|
10952
|
+
var VoicePipeline = class extends EventEmitter {
|
|
10953
|
+
constructor(config) {
|
|
10954
|
+
super();
|
|
10955
|
+
// State
|
|
10956
|
+
this._state = "idle";
|
|
10957
|
+
this.stopped = false;
|
|
10958
|
+
this.epoch = 0;
|
|
10959
|
+
this._sessionId = null;
|
|
10960
|
+
// Models
|
|
10961
|
+
this.asr = null;
|
|
10962
|
+
this.lam = null;
|
|
10963
|
+
this.vad = null;
|
|
10964
|
+
this.unifiedWorker = null;
|
|
10965
|
+
// Pipelines
|
|
10966
|
+
this.playback = null;
|
|
10967
|
+
this.interruption = null;
|
|
10968
|
+
this.omoteEvents = new EventEmitter();
|
|
10969
|
+
this.mic = null;
|
|
10970
|
+
// Audio accumulation
|
|
10971
|
+
this.audioBuffer = [];
|
|
10972
|
+
this.audioBufferSamples = 0;
|
|
10973
|
+
this.speechStartTime = 0;
|
|
10974
|
+
this.silenceTimer = null;
|
|
10975
|
+
this.isSpeaking = false;
|
|
10976
|
+
// Progressive transcription
|
|
10977
|
+
this.progressiveTimer = null;
|
|
10978
|
+
this.progressivePromise = null;
|
|
10979
|
+
this.lastProgressiveResult = null;
|
|
10980
|
+
this.lastProgressiveSamples = 0;
|
|
10981
|
+
// ASR error recovery
|
|
10982
|
+
this.asrErrorCount = 0;
|
|
10983
|
+
// Response abort
|
|
10984
|
+
this.responseAbortController = null;
|
|
10985
|
+
// Frame refs
|
|
10986
|
+
this._currentFrame = null;
|
|
10987
|
+
this.config = config;
|
|
10988
|
+
}
|
|
10989
|
+
/** Current pipeline state */
|
|
10990
|
+
get state() {
|
|
10991
|
+
return this._state;
|
|
10992
|
+
}
|
|
10993
|
+
/** Latest blendshape frame */
|
|
10994
|
+
get currentFrame() {
|
|
10995
|
+
return this._currentFrame;
|
|
10996
|
+
}
|
|
10997
|
+
/** Whether user is currently speaking */
|
|
10998
|
+
get isSpeechActive() {
|
|
10999
|
+
return this.isSpeaking;
|
|
11000
|
+
}
|
|
11001
|
+
/** Session ID (generated on start(), null before) */
|
|
11002
|
+
get sessionId() {
|
|
11003
|
+
return this._sessionId;
|
|
11004
|
+
}
|
|
11005
|
+
// ---------------------------------------------------------------------------
|
|
11006
|
+
// Model loading
|
|
11007
|
+
// ---------------------------------------------------------------------------
|
|
11008
|
+
async loadModels() {
|
|
11009
|
+
this.setState("loading");
|
|
11010
|
+
const timeoutMs = this.config.lamLoadTimeoutMs ?? 3e4;
|
|
11011
|
+
try {
|
|
11012
|
+
if (isIOS()) {
|
|
11013
|
+
this.unifiedWorker = new UnifiedInferenceWorker();
|
|
11014
|
+
await this.unifiedWorker.init();
|
|
11015
|
+
}
|
|
11016
|
+
this.emitProgress("Speech recognition", 0, 3, 0);
|
|
11017
|
+
this.asr = createSenseVoice({
|
|
11018
|
+
modelUrl: this.config.models.senseVoice.modelUrl,
|
|
11019
|
+
tokensUrl: this.config.models.senseVoice.tokensUrl,
|
|
11020
|
+
language: this.config.models.senseVoice.language,
|
|
11021
|
+
unifiedWorker: this.unifiedWorker ?? void 0
|
|
11022
|
+
});
|
|
11023
|
+
await this.asr.load();
|
|
11024
|
+
this.emitProgress("Speech recognition", 45, 3, 1);
|
|
11025
|
+
this.emitProgress("Lip sync", 45, 3, 1);
|
|
11026
|
+
let lam = createA2E({
|
|
11027
|
+
gpuModelUrl: this.config.models.lam.gpuModelUrl,
|
|
11028
|
+
gpuExternalDataUrl: this.config.models.lam.gpuExternalDataUrl,
|
|
11029
|
+
cpuModelUrl: this.config.models.lam.cpuModelUrl,
|
|
11030
|
+
mode: this.config.models.lam.mode,
|
|
11031
|
+
unifiedWorker: this.unifiedWorker ?? void 0
|
|
11032
|
+
});
|
|
11033
|
+
let lamProgress = 45;
|
|
11034
|
+
const lamTickInterval = setInterval(() => {
|
|
11035
|
+
const remaining = 85 - lamProgress;
|
|
11036
|
+
lamProgress += Math.max(0.5, remaining * 0.08);
|
|
11037
|
+
this.emitProgress("Lip sync", Math.round(lamProgress), 3, 1);
|
|
11038
|
+
}, 300);
|
|
11039
|
+
try {
|
|
11040
|
+
const lamLoadResult = await Promise.race([
|
|
11041
|
+
lam.load().then(() => "ok"),
|
|
11042
|
+
new Promise((r) => setTimeout(() => r("timeout"), timeoutMs))
|
|
11043
|
+
]);
|
|
11044
|
+
if (lamLoadResult === "timeout") {
|
|
11045
|
+
logger19.warn(`LAM GPU load timed out after ${timeoutMs}ms, falling back to CPU`);
|
|
11046
|
+
await lam.dispose();
|
|
11047
|
+
lam = createA2E({
|
|
11048
|
+
gpuModelUrl: this.config.models.lam.gpuModelUrl,
|
|
11049
|
+
cpuModelUrl: this.config.models.lam.cpuModelUrl,
|
|
11050
|
+
mode: "cpu",
|
|
11051
|
+
unifiedWorker: this.unifiedWorker ?? void 0
|
|
11052
|
+
});
|
|
11053
|
+
await lam.load();
|
|
11054
|
+
}
|
|
11055
|
+
} finally {
|
|
11056
|
+
clearInterval(lamTickInterval);
|
|
11057
|
+
}
|
|
11058
|
+
this.lam = lam;
|
|
11059
|
+
this.emitProgress("Lip sync", 85, 3, 2);
|
|
11060
|
+
this.emitProgress("Voice detection", 85, 3, 2);
|
|
11061
|
+
this.vad = createSileroVAD({
|
|
11062
|
+
modelUrl: this.config.models.vad.modelUrl,
|
|
11063
|
+
threshold: this.config.models.vad.threshold,
|
|
11064
|
+
unifiedWorker: this.unifiedWorker ?? void 0
|
|
11065
|
+
});
|
|
11066
|
+
await this.vad.load();
|
|
11067
|
+
this.emitProgress("Voice detection", 100, 3, 3);
|
|
11068
|
+
this.playback = new PlaybackPipeline({
|
|
11069
|
+
lam: this.lam,
|
|
11070
|
+
profile: this.config.profile,
|
|
11071
|
+
identityIndex: this.config.identityIndex,
|
|
11072
|
+
neutralTransitionEnabled: this.config.neutralTransitionEnabled ?? true,
|
|
11073
|
+
neutralTransitionMs: this.config.neutralTransitionMs,
|
|
11074
|
+
audioDelayMs: this.config.audioDelayMs,
|
|
11075
|
+
chunkTargetMs: this.config.chunkTargetMs
|
|
11076
|
+
});
|
|
11077
|
+
await this.playback.initialize();
|
|
11078
|
+
this.playback.on("frame", (f) => {
|
|
11079
|
+
this._currentFrame = f.blendshapes;
|
|
11080
|
+
this.emit("frame", f);
|
|
11081
|
+
});
|
|
11082
|
+
this.playback.on("frame:raw", (f) => this.emit("frame:raw", f));
|
|
11083
|
+
this.playback.on("playback:start", (t) => this.emit("playback:start", t));
|
|
11084
|
+
this.playback.on("playback:complete", () => {
|
|
11085
|
+
if (this.stopped) return;
|
|
11086
|
+
this.emit("playback:complete", void 0);
|
|
11087
|
+
this.vad?.reset();
|
|
11088
|
+
this.epoch++;
|
|
11089
|
+
this.setState("listening");
|
|
11090
|
+
});
|
|
11091
|
+
this.playback.on("error", (e) => this.emit("error", e));
|
|
11092
|
+
this.interruption = new InterruptionHandler({
|
|
11093
|
+
enabled: this.config.interruptionEnabled ?? true,
|
|
11094
|
+
minSpeechDurationMs: this.config.interruptionMinSpeechMs ?? 200
|
|
11095
|
+
});
|
|
11096
|
+
this.interruption.on("interruption.triggered", () => {
|
|
11097
|
+
this.handleInterruption();
|
|
11098
|
+
});
|
|
11099
|
+
this.setState("ready");
|
|
11100
|
+
} catch (error) {
|
|
11101
|
+
const err = error instanceof Error ? error : new Error(String(error));
|
|
11102
|
+
logger19.error("Model loading failed", { message: err.message });
|
|
11103
|
+
this.emit("error", err);
|
|
11104
|
+
this.setState("error");
|
|
11105
|
+
throw err;
|
|
11106
|
+
}
|
|
11107
|
+
}
|
|
11108
|
+
// ---------------------------------------------------------------------------
|
|
11109
|
+
// Conversation lifecycle
|
|
11110
|
+
// ---------------------------------------------------------------------------
|
|
11111
|
+
async start() {
|
|
11112
|
+
if (this._state !== "ready") {
|
|
11113
|
+
throw new Error(`Cannot start: state is '${this._state}', expected 'ready'`);
|
|
11114
|
+
}
|
|
11115
|
+
this.stopped = false;
|
|
11116
|
+
this.epoch++;
|
|
11117
|
+
this._sessionId = crypto.randomUUID();
|
|
11118
|
+
this.asrErrorCount = 0;
|
|
11119
|
+
this.mic = new MicrophoneCapture(this.omoteEvents, {
|
|
11120
|
+
sampleRate: 16e3,
|
|
11121
|
+
chunkSize: 512
|
|
11122
|
+
});
|
|
11123
|
+
this.omoteEvents.on("audio.chunk", ({ pcm }) => {
|
|
11124
|
+
const float32 = int16ToFloat32(pcm);
|
|
11125
|
+
this.processAudioChunk(float32);
|
|
11126
|
+
});
|
|
11127
|
+
this.omoteEvents.on("audio.level", (level) => {
|
|
11128
|
+
this.emit("audio:level", level);
|
|
11129
|
+
});
|
|
11130
|
+
await this.mic.start();
|
|
11131
|
+
this.setState("listening");
|
|
11132
|
+
}
|
|
11133
|
+
stop() {
|
|
11134
|
+
this.stopped = true;
|
|
11135
|
+
this.epoch++;
|
|
11136
|
+
this.clearSilenceTimer();
|
|
11137
|
+
this.stopProgressiveTranscription();
|
|
11138
|
+
this.responseAbortController?.abort();
|
|
11139
|
+
this.responseAbortController = null;
|
|
11140
|
+
this.vad?.reset();
|
|
11141
|
+
this.playback?.stop();
|
|
11142
|
+
this.mic?.stop();
|
|
11143
|
+
this.mic = null;
|
|
11144
|
+
this.isSpeaking = false;
|
|
11145
|
+
this.audioBuffer = [];
|
|
11146
|
+
this.audioBufferSamples = 0;
|
|
11147
|
+
this._currentFrame = null;
|
|
11148
|
+
this.interruption?.setAISpeaking(false);
|
|
11149
|
+
if (this._state !== "idle") {
|
|
11150
|
+
this.setState("ready");
|
|
11151
|
+
}
|
|
11152
|
+
}
|
|
11153
|
+
setProfile(profile) {
|
|
11154
|
+
this.config.profile = profile;
|
|
11155
|
+
this.playback?.setProfile(profile);
|
|
11156
|
+
}
|
|
11157
|
+
async dispose() {
|
|
11158
|
+
this.stop();
|
|
11159
|
+
this.epoch++;
|
|
11160
|
+
await this.playback?.dispose();
|
|
11161
|
+
await this.asr?.dispose();
|
|
11162
|
+
await this.lam?.dispose();
|
|
11163
|
+
await this.vad?.dispose();
|
|
11164
|
+
this.playback = null;
|
|
11165
|
+
this.asr = null;
|
|
11166
|
+
this.lam = null;
|
|
11167
|
+
this.vad = null;
|
|
11168
|
+
this._state = "idle";
|
|
11169
|
+
}
|
|
11170
|
+
// ---------------------------------------------------------------------------
|
|
11171
|
+
// Audio processing
|
|
11172
|
+
// ---------------------------------------------------------------------------
|
|
11173
|
+
async processAudioChunk(samples) {
|
|
11174
|
+
if (!this.vad) return;
|
|
11175
|
+
try {
|
|
11176
|
+
const result = await this.vad.process(samples);
|
|
11177
|
+
if (this._state === "speaking" && this.interruption) {
|
|
11178
|
+
this.interruption.processVADResult(result.probability);
|
|
11179
|
+
return;
|
|
11180
|
+
}
|
|
11181
|
+
if (this._state !== "listening" && this._state !== "thinking") return;
|
|
11182
|
+
const wasSpeaking = this.isSpeaking;
|
|
11183
|
+
if (result.isSpeech) {
|
|
11184
|
+
if (!wasSpeaking) {
|
|
11185
|
+
this.isSpeaking = true;
|
|
11186
|
+
this.speechStartTime = performance.now();
|
|
11187
|
+
this.audioBuffer = [];
|
|
11188
|
+
this.audioBufferSamples = 0;
|
|
11189
|
+
this.lastProgressiveResult = null;
|
|
11190
|
+
this.lastProgressiveSamples = 0;
|
|
11191
|
+
this.emit("speech:start", void 0);
|
|
11192
|
+
this.startProgressiveTranscription();
|
|
11193
|
+
}
|
|
11194
|
+
this.audioBuffer.push(new Float32Array(samples));
|
|
11195
|
+
this.audioBufferSamples += samples.length;
|
|
11196
|
+
this.clearSilenceTimer();
|
|
11197
|
+
} else if (wasSpeaking) {
|
|
11198
|
+
this.audioBuffer.push(new Float32Array(samples));
|
|
11199
|
+
this.audioBufferSamples += samples.length;
|
|
11200
|
+
if (!this.silenceTimer) {
|
|
11201
|
+
const timeoutMs = this.getSilenceTimeout();
|
|
11202
|
+
this.silenceTimer = setTimeout(() => {
|
|
11203
|
+
this.onSilenceDetected();
|
|
11204
|
+
}, timeoutMs);
|
|
11205
|
+
}
|
|
11206
|
+
}
|
|
11207
|
+
} catch (err) {
|
|
11208
|
+
logger19.warn("VAD error", { error: String(err) });
|
|
11209
|
+
}
|
|
11210
|
+
}
|
|
11211
|
+
// ---------------------------------------------------------------------------
|
|
11212
|
+
// Silence detection
|
|
11213
|
+
// ---------------------------------------------------------------------------
|
|
11214
|
+
getSilenceTimeout() {
|
|
11215
|
+
const base = this.config.silenceTimeoutMs ?? 500;
|
|
11216
|
+
const extended = this.config.silenceTimeoutExtendedMs ?? 700;
|
|
11217
|
+
const adaptive = this.config.adaptiveTimeout ?? true;
|
|
11218
|
+
if (!adaptive) return base;
|
|
11219
|
+
const speechDurationMs = performance.now() - this.speechStartTime;
|
|
11220
|
+
return speechDurationMs > 3e3 ? extended : base;
|
|
11221
|
+
}
|
|
11222
|
+
onSilenceDetected() {
|
|
11223
|
+
const capturedEpoch = this.epoch;
|
|
11224
|
+
this.isSpeaking = false;
|
|
11225
|
+
const durationMs = performance.now() - this.speechStartTime;
|
|
11226
|
+
this.emit("speech:end", { durationMs });
|
|
11227
|
+
this.clearSilenceTimer();
|
|
11228
|
+
this.processEndOfSpeech(capturedEpoch).catch((err) => {
|
|
11229
|
+
logger19.error("End of speech processing failed", { error: String(err) });
|
|
11230
|
+
if (this.epoch === capturedEpoch && !this.stopped) {
|
|
11231
|
+
this.emit("error", err instanceof Error ? err : new Error(String(err)));
|
|
11232
|
+
this.setState("listening");
|
|
11233
|
+
}
|
|
11234
|
+
});
|
|
11235
|
+
}
|
|
11236
|
+
// ---------------------------------------------------------------------------
|
|
11237
|
+
// End of speech → transcription → response
|
|
11238
|
+
// ---------------------------------------------------------------------------
|
|
11239
|
+
async processEndOfSpeech(capturedEpoch) {
|
|
11240
|
+
if (this.progressivePromise) {
|
|
11241
|
+
try {
|
|
11242
|
+
await this.progressivePromise;
|
|
11243
|
+
} catch {
|
|
11244
|
+
}
|
|
11245
|
+
}
|
|
11246
|
+
this.stopProgressiveTranscription();
|
|
11247
|
+
if (this.epoch !== capturedEpoch || this.stopped) return;
|
|
11248
|
+
const totalSamples = this.audioBufferSamples;
|
|
11249
|
+
const fullAudio = new Float32Array(totalSamples);
|
|
11250
|
+
let offset = 0;
|
|
11251
|
+
for (const chunk of this.audioBuffer) {
|
|
11252
|
+
fullAudio.set(chunk, offset);
|
|
11253
|
+
offset += chunk.length;
|
|
11254
|
+
}
|
|
11255
|
+
this.audioBuffer = [];
|
|
11256
|
+
this.audioBufferSamples = 0;
|
|
11257
|
+
const minDuration = this.config.minAudioDurationSec ?? 0.3;
|
|
11258
|
+
const minEnergy = this.config.minAudioEnergy ?? 0.02;
|
|
11259
|
+
const durationSec = totalSamples / 16e3;
|
|
11260
|
+
if (durationSec < minDuration) {
|
|
11261
|
+
logger19.info("Audio too short, discarding", { durationSec });
|
|
11262
|
+
this.setState("listening");
|
|
11263
|
+
return;
|
|
11264
|
+
}
|
|
11265
|
+
let maxAbs = 0;
|
|
11266
|
+
for (let i = 0; i < fullAudio.length; i++) {
|
|
11267
|
+
const abs = Math.abs(fullAudio[i]);
|
|
11268
|
+
if (abs > maxAbs) maxAbs = abs;
|
|
11269
|
+
}
|
|
11270
|
+
let rms = 0;
|
|
11271
|
+
for (let i = 0; i < fullAudio.length; i++) {
|
|
11272
|
+
rms += fullAudio[i] * fullAudio[i];
|
|
11273
|
+
}
|
|
11274
|
+
rms = Math.sqrt(rms / fullAudio.length);
|
|
11275
|
+
if (rms < minEnergy) {
|
|
11276
|
+
logger19.info("Audio too quiet, discarding", { rms });
|
|
11277
|
+
this.setState("listening");
|
|
11278
|
+
return;
|
|
11279
|
+
}
|
|
11280
|
+
const normalizedAudio = this.normalizeAudio(fullAudio);
|
|
11281
|
+
this.setState("thinking");
|
|
11282
|
+
let transcript = null;
|
|
11283
|
+
const coverageThreshold = this.config.progressiveCoverageThreshold ?? 0.8;
|
|
11284
|
+
if (this.lastProgressiveResult && this.lastProgressiveResult.text.trim().length > 0 && this.lastProgressiveSamples >= totalSamples * coverageThreshold) {
|
|
11285
|
+
transcript = { ...this.lastProgressiveResult, isFinal: true };
|
|
11286
|
+
logger19.info("Using progressive result", {
|
|
11287
|
+
coverage: (this.lastProgressiveSamples / totalSamples).toFixed(2),
|
|
11288
|
+
text: transcript.text
|
|
11289
|
+
});
|
|
11290
|
+
} else {
|
|
11291
|
+
this.lastProgressiveResult = null;
|
|
11292
|
+
transcript = await this.transcribeWithTimeout(normalizedAudio);
|
|
11293
|
+
if (transcript) {
|
|
11294
|
+
transcript.isFinal = true;
|
|
11295
|
+
}
|
|
11296
|
+
}
|
|
11297
|
+
if (this.epoch !== capturedEpoch || this.stopped) return;
|
|
11298
|
+
if (!transcript || !transcript.text.trim()) {
|
|
11299
|
+
logger19.info("No transcript, resuming listening");
|
|
11300
|
+
this.setState("listening");
|
|
11301
|
+
return;
|
|
11302
|
+
}
|
|
11303
|
+
this.emit("transcript", transcript);
|
|
11304
|
+
await this.callResponseHandler(transcript, capturedEpoch);
|
|
11305
|
+
}
|
|
11306
|
+
// ---------------------------------------------------------------------------
|
|
11307
|
+
// Response handler
|
|
11308
|
+
// ---------------------------------------------------------------------------
|
|
11309
|
+
// Drive the app-supplied onResponse handler for a final transcript.
// The AbortController stored on `responseAbortController` lets
// handleInterruption() cancel streaming mid-response; `capturedEpoch` drops
// the call entirely if the turn was superseded before it started.
async callResponseHandler(transcript, capturedEpoch) {
  if (this.epoch !== capturedEpoch || this.stopped) return;
  this.setState("speaking");
  this.interruption?.setAISpeaking(true);
  const abortController = new AbortController();
  this.responseAbortController = abortController;
  try {
    this.playback.start();
    await this.config.onResponse({
      text: transcript.text,
      emotion: transcript.emotion,
      event: transcript.event,
      // Streams one audio chunk into playback; silently dropped after abort.
      send: async (chunk) => {
        if (abortController.signal.aborted) return;
        await this.playback.onAudioChunk(chunk);
      },
      // Signals end of the response stream; flushes playback unless aborted.
      done: async () => {
        if (abortController.signal.aborted) return;
        await this.playback.end();
      },
      signal: abortController.signal,
      sessionId: this._sessionId
    });
  } catch (error) {
    // An abort is an expected interruption, not an error.
    if (abortController.signal.aborted) return;
    const err = error instanceof Error ? error : new Error(String(error));
    logger19.error("Response handler error", { message: err.message });
    this.emit("error", err);
    // Only resume listening when this turn is still the current one.
    if (this.epoch === capturedEpoch && !this.stopped) {
      this.interruption?.setAISpeaking(false);
      this.setState("listening");
    }
  } finally {
    this.responseAbortController = null;
  }
}
|
|
11345
|
+
// ---------------------------------------------------------------------------
|
|
11346
|
+
// Interruption handling
|
|
11347
|
+
// ---------------------------------------------------------------------------
|
|
11348
|
+
handleInterruption() {
|
|
11349
|
+
if (this._state !== "speaking") return;
|
|
11350
|
+
logger19.info("Interruption triggered");
|
|
11351
|
+
this.epoch++;
|
|
11352
|
+
this.responseAbortController?.abort();
|
|
11353
|
+
this.playback?.stop();
|
|
11354
|
+
this.interruption?.setAISpeaking(false);
|
|
11355
|
+
this.emit("interruption", void 0);
|
|
11356
|
+
if (!this.stopped) {
|
|
11357
|
+
this.setState("listening");
|
|
11358
|
+
}
|
|
11359
|
+
}
|
|
11360
|
+
// ---------------------------------------------------------------------------
|
|
11361
|
+
// Progressive transcription
|
|
11362
|
+
// ---------------------------------------------------------------------------
|
|
11363
|
+
// Begin periodic partial ("progressive") transcription of the audio buffered
// so far, so a near-final result is often already available when speech ends.
startProgressiveTranscription() {
  this.stopProgressiveTranscription();
  // iOS uses a longer interval — presumably to reduce device load; confirm.
  const intervalMs = isIOS() ? this.config.progressiveIntervalIosMs ?? 800 : this.config.progressiveIntervalMs ?? 500;
  const minSamples = this.config.progressiveMinSamples ?? 8e3;
  this.progressiveTimer = setInterval(() => {
    // Skip until enough audio is buffered and an ASR engine is available.
    if (this.audioBufferSamples < minSamples) return;
    if (!this.asr) return;
    const capturedEpoch = this.epoch;
    // Snapshot the buffer so transcription runs on a stable copy while new
    // chunks keep arriving on the live buffer.
    const snapshot = new Float32Array(this.audioBufferSamples);
    let offset = 0;
    for (const chunk of this.audioBuffer) {
      snapshot.set(chunk, offset);
      offset += chunk.length;
    }
    const snapshotSamples = this.audioBufferSamples;
    // Track the in-flight pass; processEndOfSpeech() awaits this promise
    // before deciding whether the progressive result can serve as the final.
    this.progressivePromise = (async () => {
      try {
        const result = await this.transcribeWithTimeout(snapshot);
        // Discard results belonging to a superseded turn.
        if (this.epoch !== capturedEpoch) return;
        if (result && result.text.trim()) {
          this.lastProgressiveResult = result;
          this.lastProgressiveSamples = snapshotSamples;
          this.emit("transcript", { ...result, isFinal: false });
        }
      } catch {
        // Progressive transcription is best-effort; errors are ignored here.
      }
    })();
  }, intervalMs);
}
|
|
11392
|
+
stopProgressiveTranscription() {
|
|
11393
|
+
if (this.progressiveTimer) {
|
|
11394
|
+
clearInterval(this.progressiveTimer);
|
|
11395
|
+
this.progressiveTimer = null;
|
|
11396
|
+
}
|
|
11397
|
+
}
|
|
11398
|
+
// ---------------------------------------------------------------------------
|
|
11399
|
+
// Transcription with timeout + ASR error recovery
|
|
11400
|
+
// ---------------------------------------------------------------------------
|
|
11401
|
+
async transcribeWithTimeout(audio) {
|
|
11402
|
+
if (!this.asr) return null;
|
|
11403
|
+
const timeoutMs = this.config.transcriptionTimeoutMs ?? 1e4;
|
|
11404
|
+
const startTime = performance.now();
|
|
11405
|
+
try {
|
|
11406
|
+
const result = await Promise.race([
|
|
11407
|
+
this.asr.transcribe(audio),
|
|
11408
|
+
new Promise(
|
|
11409
|
+
(_, reject) => setTimeout(() => reject(new Error(`Transcription timed out after ${timeoutMs}ms`)), timeoutMs)
|
|
11410
|
+
)
|
|
11411
|
+
]);
|
|
11412
|
+
this.asrErrorCount = 0;
|
|
11413
|
+
return {
|
|
11414
|
+
text: result.text,
|
|
11415
|
+
emotion: result.emotion,
|
|
11416
|
+
language: result.language,
|
|
11417
|
+
isFinal: false,
|
|
11418
|
+
inferenceTimeMs: performance.now() - startTime
|
|
11419
|
+
};
|
|
11420
|
+
} catch (error) {
|
|
11421
|
+
this.asrErrorCount++;
|
|
11422
|
+
logger19.warn("Transcription failed", {
|
|
11423
|
+
attempt: this.asrErrorCount,
|
|
11424
|
+
error: String(error)
|
|
11425
|
+
});
|
|
11426
|
+
if (this.asrErrorCount >= 3) {
|
|
11427
|
+
logger19.warn("3 consecutive ASR errors, recreating session");
|
|
11428
|
+
try {
|
|
11429
|
+
await this.asr.dispose();
|
|
11430
|
+
this.asr = createSenseVoice({
|
|
11431
|
+
modelUrl: this.config.models.senseVoice.modelUrl,
|
|
11432
|
+
tokensUrl: this.config.models.senseVoice.tokensUrl,
|
|
11433
|
+
language: this.config.models.senseVoice.language,
|
|
11434
|
+
unifiedWorker: this.unifiedWorker ?? void 0
|
|
11435
|
+
});
|
|
11436
|
+
await this.asr.load();
|
|
11437
|
+
this.asrErrorCount = 0;
|
|
11438
|
+
} catch (recreateErr) {
|
|
11439
|
+
logger19.error("ASR session recreation failed", { error: String(recreateErr) });
|
|
11440
|
+
}
|
|
11441
|
+
}
|
|
11442
|
+
return null;
|
|
11443
|
+
}
|
|
11444
|
+
}
|
|
11445
|
+
// ---------------------------------------------------------------------------
|
|
11446
|
+
// Audio normalization
|
|
11447
|
+
// ---------------------------------------------------------------------------
|
|
11448
|
+
normalizeAudio(audio) {
|
|
11449
|
+
if (!(this.config.normalizeAudio ?? true)) return audio;
|
|
11450
|
+
let maxAbs = 0;
|
|
11451
|
+
for (let i = 0; i < audio.length; i++) {
|
|
11452
|
+
const abs = Math.abs(audio[i]);
|
|
11453
|
+
if (abs > maxAbs) maxAbs = abs;
|
|
11454
|
+
}
|
|
11455
|
+
if (maxAbs >= 0.1 || maxAbs === 0) return audio;
|
|
11456
|
+
const gain = 0.5 / maxAbs;
|
|
11457
|
+
const normalized = new Float32Array(audio.length);
|
|
11458
|
+
for (let i = 0; i < audio.length; i++) {
|
|
11459
|
+
normalized[i] = audio[i] * gain;
|
|
11460
|
+
}
|
|
11461
|
+
return normalized;
|
|
11462
|
+
}
|
|
11463
|
+
// ---------------------------------------------------------------------------
|
|
11464
|
+
// Helpers
|
|
11465
|
+
// ---------------------------------------------------------------------------
|
|
11466
|
+
setState(state) {
|
|
11467
|
+
if (this._state === state) return;
|
|
11468
|
+
logger19.info("State transition", { from: this._state, to: state });
|
|
11469
|
+
this._state = state;
|
|
11470
|
+
this.emit("state", state);
|
|
11471
|
+
}
|
|
11472
|
+
emitProgress(currentModel, progress, totalModels, modelsLoaded) {
|
|
11473
|
+
this.emit("loading:progress", { currentModel, progress, totalModels, modelsLoaded });
|
|
11474
|
+
}
|
|
11475
|
+
clearSilenceTimer() {
|
|
11476
|
+
if (this.silenceTimer) {
|
|
11477
|
+
clearTimeout(this.silenceTimer);
|
|
11478
|
+
this.silenceTimer = null;
|
|
11479
|
+
}
|
|
11480
|
+
}
|
|
11481
|
+
};
|
|
11482
|
+
|
|
10393
11483
|
// ../types/dist/index.mjs
// Protocol version constant bundled in from the ../types package.
var PROTOCOL_VERSION = 1;
|
|
10395
11485
|
function isProtocolEvent(obj) {
|