@omote/core 0.4.4 → 0.4.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +1165 -673
- package/dist/index.d.ts +1165 -673
- package/dist/index.js +3307 -337
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +3302 -332
- package/dist/index.mjs.map +1 -1
- package/package.json +1 -1
package/dist/index.mjs
CHANGED
|
@@ -312,7 +312,14 @@ var AudioScheduler = class {
|
|
|
312
312
|
source.connect(gainNode);
|
|
313
313
|
const scheduleTime = this.nextPlayTime;
|
|
314
314
|
source.start(scheduleTime);
|
|
315
|
-
|
|
315
|
+
const entry = { source, gainNode };
|
|
316
|
+
this.scheduledSources.push(entry);
|
|
317
|
+
source.onended = () => {
|
|
318
|
+
const idx = this.scheduledSources.indexOf(entry);
|
|
319
|
+
if (idx !== -1) {
|
|
320
|
+
this.scheduledSources.splice(idx, 1);
|
|
321
|
+
}
|
|
322
|
+
};
|
|
316
323
|
const duration = audioData.length / ctx.sampleRate;
|
|
317
324
|
this.nextPlayTime = scheduleTime + duration;
|
|
318
325
|
return scheduleTime;
|
|
@@ -668,7 +675,7 @@ var LAMPipeline = class {
|
|
|
668
675
|
}
|
|
669
676
|
};
|
|
670
677
|
|
|
671
|
-
// src/audio/
|
|
678
|
+
// src/audio/audioUtils.ts
|
|
672
679
|
function pcm16ToFloat32(buffer) {
|
|
673
680
|
const byteLen = buffer.byteLength & ~1;
|
|
674
681
|
const int16 = byteLen === buffer.byteLength ? new Int16Array(buffer) : new Int16Array(buffer, 0, byteLen / 2);
|
|
@@ -678,6 +685,15 @@ function pcm16ToFloat32(buffer) {
|
|
|
678
685
|
}
|
|
679
686
|
return float32;
|
|
680
687
|
}
|
|
688
|
+
function int16ToFloat32(int16) {
|
|
689
|
+
const float32 = new Float32Array(int16.length);
|
|
690
|
+
for (let i = 0; i < int16.length; i++) {
|
|
691
|
+
float32[i] = int16[i] / 32768;
|
|
692
|
+
}
|
|
693
|
+
return float32;
|
|
694
|
+
}
|
|
695
|
+
|
|
696
|
+
// src/audio/SyncedAudioPipeline.ts
|
|
681
697
|
var SyncedAudioPipeline = class extends EventEmitter {
|
|
682
698
|
constructor(options) {
|
|
683
699
|
super();
|
|
@@ -2385,7 +2401,7 @@ function isIOSSafari() {
|
|
|
2385
2401
|
function isIOS() {
|
|
2386
2402
|
if (typeof navigator === "undefined") return false;
|
|
2387
2403
|
const ua = navigator.userAgent.toLowerCase();
|
|
2388
|
-
return /iphone|ipad|ipod/.test(ua);
|
|
2404
|
+
return /iphone|ipad|ipod/.test(ua) || /macintosh/.test(ua) && navigator.maxTouchPoints > 1;
|
|
2389
2405
|
}
|
|
2390
2406
|
function isAndroid() {
|
|
2391
2407
|
if (typeof navigator === "undefined") return false;
|
|
@@ -3006,10 +3022,16 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
3006
3022
|
});
|
|
3007
3023
|
logger2.debug("Running warmup inference to initialize GPU context");
|
|
3008
3024
|
const warmupStart = performance.now();
|
|
3009
|
-
const
|
|
3025
|
+
const warmupAudio = new Float32Array(16e3);
|
|
3026
|
+
const warmupIdentity = new Float32Array(this.numIdentityClasses);
|
|
3027
|
+
warmupIdentity[0] = 1;
|
|
3028
|
+
const warmupFeeds = {
|
|
3029
|
+
"audio": new this.ort.Tensor("float32", warmupAudio, [1, 16e3]),
|
|
3030
|
+
"identity": new this.ort.Tensor("float32", warmupIdentity, [1, this.numIdentityClasses])
|
|
3031
|
+
};
|
|
3010
3032
|
const WARMUP_TIMEOUT_MS = 15e3;
|
|
3011
3033
|
const warmupResult = await Promise.race([
|
|
3012
|
-
this.
|
|
3034
|
+
this.session.run(warmupFeeds).then(() => "ok"),
|
|
3013
3035
|
new Promise((r) => setTimeout(() => r("timeout"), WARMUP_TIMEOUT_MS))
|
|
3014
3036
|
]);
|
|
3015
3037
|
const warmupTimeMs = performance.now() - warmupStart;
|
|
@@ -3115,14 +3137,18 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
|
3115
3137
|
});
|
|
3116
3138
|
try {
|
|
3117
3139
|
const startTime = performance.now();
|
|
3140
|
+
let timeoutId;
|
|
3118
3141
|
const results = await Promise.race([
|
|
3119
|
-
this.session.run(feeds)
|
|
3120
|
-
|
|
3121
|
-
|
|
3142
|
+
this.session.run(feeds).then((r) => {
|
|
3143
|
+
clearTimeout(timeoutId);
|
|
3144
|
+
return r;
|
|
3145
|
+
}),
|
|
3146
|
+
new Promise((_, rej) => {
|
|
3147
|
+
timeoutId = setTimeout(
|
|
3122
3148
|
() => rej(new Error(`Wav2Vec2 inference timed out after ${_Wav2Vec2Inference.INFERENCE_TIMEOUT_MS}ms`)),
|
|
3123
3149
|
_Wav2Vec2Inference.INFERENCE_TIMEOUT_MS
|
|
3124
|
-
)
|
|
3125
|
-
)
|
|
3150
|
+
);
|
|
3151
|
+
})
|
|
3126
3152
|
]);
|
|
3127
3153
|
const inferenceTimeMs = performance.now() - startTime;
|
|
3128
3154
|
const asrOutput = results["asr_logits"];
|
|
@@ -3228,15 +3254,6 @@ var Wav2Vec2Inference = _Wav2Vec2Inference;
|
|
|
3228
3254
|
|
|
3229
3255
|
// src/audio/FullFacePipeline.ts
|
|
3230
3256
|
var logger3 = createLogger("FullFacePipeline");
|
|
3231
|
-
function pcm16ToFloat322(buffer) {
|
|
3232
|
-
const byteLen = buffer.byteLength & ~1;
|
|
3233
|
-
const int16 = byteLen === buffer.byteLength ? new Int16Array(buffer) : new Int16Array(buffer, 0, byteLen / 2);
|
|
3234
|
-
const float32 = new Float32Array(int16.length);
|
|
3235
|
-
for (let i = 0; i < int16.length; i++) {
|
|
3236
|
-
float32[i] = int16[i] / 32768;
|
|
3237
|
-
}
|
|
3238
|
-
return float32;
|
|
3239
|
-
}
|
|
3240
3257
|
var BLENDSHAPE_INDEX_MAP = /* @__PURE__ */ new Map();
|
|
3241
3258
|
LAM_BLENDSHAPES.forEach((name, index) => {
|
|
3242
3259
|
BLENDSHAPE_INDEX_MAP.set(name, index);
|
|
@@ -3386,7 +3403,7 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
|
|
|
3386
3403
|
if (!combined) {
|
|
3387
3404
|
return;
|
|
3388
3405
|
}
|
|
3389
|
-
const float32 =
|
|
3406
|
+
const float32 = pcm16ToFloat32(combined);
|
|
3390
3407
|
const scheduleTime = await this.scheduler.schedule(float32);
|
|
3391
3408
|
if (!this.playbackStarted) {
|
|
3392
3409
|
this.playbackStarted = true;
|
|
@@ -3869,13 +3886,18 @@ function ctcGreedyDecode(logits, seqLen, vocabSize, tokenMap) {
|
|
|
3869
3886
|
|
|
3870
3887
|
// src/inference/SenseVoiceInference.ts
|
|
3871
3888
|
var logger4 = createLogger("SenseVoice");
|
|
3872
|
-
var
|
|
3889
|
+
var _SenseVoiceInference = class _SenseVoiceInference {
|
|
3873
3890
|
constructor(config) {
|
|
3874
3891
|
this.session = null;
|
|
3875
3892
|
this.ort = null;
|
|
3876
3893
|
this._backend = "wasm";
|
|
3877
3894
|
this.isLoading = false;
|
|
3878
3895
|
this.inferenceQueue = Promise.resolve();
|
|
3896
|
+
// Session health: set to true if session.run() times out.
|
|
3897
|
+
// A timed-out session may have a zombie WASM dispatch still running,
|
|
3898
|
+
// so all future transcribe() calls reject immediately to prevent concurrent access.
|
|
3899
|
+
this.poisoned = false;
|
|
3900
|
+
// 10s for SenseVoice (heavier preprocessing)
|
|
3879
3901
|
// Preprocessing state (loaded once)
|
|
3880
3902
|
this.tokenMap = null;
|
|
3881
3903
|
this.negMean = null;
|
|
@@ -4023,6 +4045,9 @@ var SenseVoiceInference = class {
|
|
|
4023
4045
|
if (!this.session || !this.ort || !this.tokenMap) {
|
|
4024
4046
|
throw new Error("Model not loaded. Call load() first.");
|
|
4025
4047
|
}
|
|
4048
|
+
if (this.poisoned) {
|
|
4049
|
+
throw new Error("SenseVoice session timed out \u2014 inference unavailable until page reload");
|
|
4050
|
+
}
|
|
4026
4051
|
const audio = new Float32Array(audioSamples);
|
|
4027
4052
|
return this.queueInference(audio);
|
|
4028
4053
|
}
|
|
@@ -4060,7 +4085,19 @@ var SenseVoiceInference = class {
|
|
|
4060
4085
|
language: new ort.Tensor("int32", new Int32Array([this.languageId]), [1]),
|
|
4061
4086
|
text_norm: new ort.Tensor("int32", new Int32Array([this.textNormId]), [1])
|
|
4062
4087
|
};
|
|
4063
|
-
|
|
4088
|
+
let timeoutId;
|
|
4089
|
+
const results = await Promise.race([
|
|
4090
|
+
this.session.run(feeds).then((r) => {
|
|
4091
|
+
clearTimeout(timeoutId);
|
|
4092
|
+
return r;
|
|
4093
|
+
}),
|
|
4094
|
+
new Promise((_, rej) => {
|
|
4095
|
+
timeoutId = setTimeout(
|
|
4096
|
+
() => rej(new Error(`SenseVoice inference timed out after ${_SenseVoiceInference.INFERENCE_TIMEOUT_MS}ms`)),
|
|
4097
|
+
_SenseVoiceInference.INFERENCE_TIMEOUT_MS
|
|
4098
|
+
);
|
|
4099
|
+
})
|
|
4100
|
+
]);
|
|
4064
4101
|
const logitsOutput = results["logits"];
|
|
4065
4102
|
if (!logitsOutput) {
|
|
4066
4103
|
throw new Error('Model output missing "logits" tensor');
|
|
@@ -4106,6 +4143,32 @@ var SenseVoiceInference = class {
|
|
|
4106
4143
|
preprocessTimeMs
|
|
4107
4144
|
});
|
|
4108
4145
|
} catch (err) {
|
|
4146
|
+
const errMsg = err instanceof Error ? err.message : String(err);
|
|
4147
|
+
if (errMsg.includes("timed out")) {
|
|
4148
|
+
this.poisoned = true;
|
|
4149
|
+
logger4.error("CRITICAL: Inference session timed out \u2014 SenseVoice is dead. Page reload required.", {
|
|
4150
|
+
backend: this._backend,
|
|
4151
|
+
timeoutMs: _SenseVoiceInference.INFERENCE_TIMEOUT_MS
|
|
4152
|
+
});
|
|
4153
|
+
} else if (typeof err === "number") {
|
|
4154
|
+
const oomError = new Error(
|
|
4155
|
+
`SenseVoice inference failed with raw C++ exception pointer (0x${err.toString(16)}). This is likely an OOM crash in WASM. Try reloading the page.`
|
|
4156
|
+
);
|
|
4157
|
+
logger4.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
|
|
4158
|
+
pointer: `0x${err.toString(16)}`,
|
|
4159
|
+
backend: this._backend
|
|
4160
|
+
});
|
|
4161
|
+
span?.endWithError(oomError);
|
|
4162
|
+
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
4163
|
+
model: "sensevoice",
|
|
4164
|
+
backend: this._backend,
|
|
4165
|
+
status: "error"
|
|
4166
|
+
});
|
|
4167
|
+
reject(oomError);
|
|
4168
|
+
return;
|
|
4169
|
+
} else {
|
|
4170
|
+
logger4.error("Inference failed", { error: errMsg, backend: this._backend });
|
|
4171
|
+
}
|
|
4109
4172
|
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
4110
4173
|
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
4111
4174
|
model: "sensevoice",
|
|
@@ -4129,241 +4192,3082 @@ var SenseVoiceInference = class {
|
|
|
4129
4192
|
this.invStddev = null;
|
|
4130
4193
|
}
|
|
4131
4194
|
};
|
|
4195
|
+
_SenseVoiceInference.INFERENCE_TIMEOUT_MS = 1e4;
|
|
4196
|
+
var SenseVoiceInference = _SenseVoiceInference;
|
|
4132
4197
|
|
|
4133
|
-
// src/inference/
|
|
4134
|
-
var logger5 = createLogger("
|
|
4135
|
-
var
|
|
4136
|
-
|
|
4137
|
-
|
|
4138
|
-
|
|
4139
|
-
|
|
4140
|
-
|
|
4141
|
-
|
|
4142
|
-
|
|
4143
|
-
|
|
4144
|
-
this.config = config;
|
|
4145
|
-
}
|
|
4146
|
-
get backend() {
|
|
4147
|
-
return this.session ? this._backend : null;
|
|
4148
|
-
}
|
|
4149
|
-
get isLoaded() {
|
|
4150
|
-
return this.session !== null;
|
|
4198
|
+
// src/inference/SenseVoiceWorker.ts
|
|
4199
|
+
var logger5 = createLogger("SenseVoiceWorker");
|
|
4200
|
+
var WASM_CDN_PATH2 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
|
|
4201
|
+
var LOAD_TIMEOUT_MS = 3e4;
|
|
4202
|
+
var INFERENCE_TIMEOUT_MS = 1e4;
|
|
4203
|
+
function resolveUrl(url) {
|
|
4204
|
+
if (/^https?:\/\//i.test(url) || /^blob:/i.test(url)) return url;
|
|
4205
|
+
try {
|
|
4206
|
+
return new URL(url, globalThis.location?.origin ?? "https://localhost").href;
|
|
4207
|
+
} catch {
|
|
4208
|
+
return url;
|
|
4151
4209
|
}
|
|
4152
|
-
|
|
4153
|
-
|
|
4154
|
-
|
|
4155
|
-
|
|
4156
|
-
|
|
4157
|
-
|
|
4210
|
+
}
|
|
4211
|
+
var WORKER_SCRIPT = `
|
|
4212
|
+
// SenseVoice ASR Worker Script
|
|
4213
|
+
// Loaded via Blob URL - no separate file needed
|
|
4214
|
+
|
|
4215
|
+
var ort = null;
|
|
4216
|
+
var session = null;
|
|
4217
|
+
var tokenMap = null;
|
|
4218
|
+
var negMean = null;
|
|
4219
|
+
var invStddev = null;
|
|
4220
|
+
var languageId = 0;
|
|
4221
|
+
var textNormId = 14;
|
|
4222
|
+
var vocabSize = 0;
|
|
4223
|
+
|
|
4224
|
+
// \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
|
|
4225
|
+
// kaldiFbank.ts \u2014 inlined as plain JavaScript
|
|
4226
|
+
// \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
|
|
4227
|
+
|
|
4228
|
+
/**
|
|
4229
|
+
* In-place Radix-2 Cooley-Tukey FFT
|
|
4230
|
+
*/
|
|
4231
|
+
function fft(re, im) {
|
|
4232
|
+
var n = re.length;
|
|
4233
|
+
|
|
4234
|
+
// Bit-reversal permutation
|
|
4235
|
+
for (var i = 1, j = 0; i < n; i++) {
|
|
4236
|
+
var bit = n >> 1;
|
|
4237
|
+
while (j & bit) {
|
|
4238
|
+
j ^= bit;
|
|
4239
|
+
bit >>= 1;
|
|
4158
4240
|
}
|
|
4159
|
-
|
|
4160
|
-
|
|
4241
|
+
j ^= bit;
|
|
4242
|
+
if (i < j) {
|
|
4243
|
+
var tmp = re[i]; re[i] = re[j]; re[j] = tmp;
|
|
4244
|
+
tmp = im[i]; im[i] = im[j]; im[j] = tmp;
|
|
4161
4245
|
}
|
|
4162
|
-
|
|
4163
|
-
|
|
4164
|
-
|
|
4165
|
-
|
|
4166
|
-
|
|
4167
|
-
|
|
4168
|
-
|
|
4169
|
-
|
|
4170
|
-
|
|
4171
|
-
|
|
4172
|
-
|
|
4173
|
-
|
|
4174
|
-
|
|
4175
|
-
|
|
4176
|
-
|
|
4177
|
-
|
|
4178
|
-
|
|
4179
|
-
|
|
4180
|
-
|
|
4181
|
-
|
|
4182
|
-
|
|
4183
|
-
|
|
4184
|
-
|
|
4185
|
-
|
|
4186
|
-
sessionOptions.externalData = [{
|
|
4187
|
-
path: dataFilename,
|
|
4188
|
-
data: dataUrl
|
|
4189
|
-
// URL string — ORT fetches directly into WASM
|
|
4190
|
-
}];
|
|
4191
|
-
}
|
|
4192
|
-
this.session = await this.ort.InferenceSession.create(modelUrl, sessionOptions);
|
|
4193
|
-
} else {
|
|
4194
|
-
const cache = getModelCache();
|
|
4195
|
-
const isCached = await cache.has(modelUrl);
|
|
4196
|
-
let modelBuffer;
|
|
4197
|
-
if (isCached) {
|
|
4198
|
-
logger5.debug("Loading model from cache", { modelUrl });
|
|
4199
|
-
modelBuffer = await cache.get(modelUrl);
|
|
4200
|
-
if (!modelBuffer) {
|
|
4201
|
-
logger5.warn("Cache corruption detected, clearing and retrying", { modelUrl });
|
|
4202
|
-
await cache.delete(modelUrl);
|
|
4203
|
-
modelBuffer = await fetchWithCache(modelUrl);
|
|
4204
|
-
}
|
|
4205
|
-
} else {
|
|
4206
|
-
logger5.debug("Fetching and caching model graph", { modelUrl });
|
|
4207
|
-
modelBuffer = await fetchWithCache(modelUrl);
|
|
4208
|
-
}
|
|
4209
|
-
if (!modelBuffer) {
|
|
4210
|
-
throw new Error(`Failed to load model: ${modelUrl}`);
|
|
4211
|
-
}
|
|
4212
|
-
let externalDataBuffer = null;
|
|
4213
|
-
if (dataUrl) {
|
|
4214
|
-
try {
|
|
4215
|
-
const isDataCached = await cache.has(dataUrl);
|
|
4216
|
-
if (isDataCached) {
|
|
4217
|
-
logger5.debug("Loading external data from cache", { dataUrl });
|
|
4218
|
-
externalDataBuffer = await cache.get(dataUrl);
|
|
4219
|
-
if (!externalDataBuffer) {
|
|
4220
|
-
logger5.warn("Cache corruption for external data, retrying", { dataUrl });
|
|
4221
|
-
await cache.delete(dataUrl);
|
|
4222
|
-
externalDataBuffer = await fetchWithCache(dataUrl);
|
|
4223
|
-
}
|
|
4224
|
-
} else {
|
|
4225
|
-
logger5.info("Fetching external model data", {
|
|
4226
|
-
dataUrl,
|
|
4227
|
-
note: "This may be a large download (400MB+)"
|
|
4228
|
-
});
|
|
4229
|
-
externalDataBuffer = await fetchWithCache(dataUrl);
|
|
4230
|
-
}
|
|
4231
|
-
logger5.info("External data loaded", {
|
|
4232
|
-
size: formatBytes(externalDataBuffer.byteLength)
|
|
4233
|
-
});
|
|
4234
|
-
} catch (err) {
|
|
4235
|
-
logger5.debug("No external data file found (single-file model)", {
|
|
4236
|
-
dataUrl,
|
|
4237
|
-
error: err.message
|
|
4238
|
-
});
|
|
4239
|
-
}
|
|
4240
|
-
}
|
|
4241
|
-
logger5.debug("Creating ONNX session", {
|
|
4242
|
-
graphSize: formatBytes(modelBuffer.byteLength),
|
|
4243
|
-
externalDataSize: externalDataBuffer ? formatBytes(externalDataBuffer.byteLength) : "none",
|
|
4244
|
-
backend: this._backend
|
|
4245
|
-
});
|
|
4246
|
-
if (externalDataBuffer) {
|
|
4247
|
-
const dataFilename = dataUrl.split("/").pop();
|
|
4248
|
-
sessionOptions.externalData = [{
|
|
4249
|
-
path: dataFilename,
|
|
4250
|
-
data: new Uint8Array(externalDataBuffer)
|
|
4251
|
-
}];
|
|
4252
|
-
}
|
|
4253
|
-
const modelData = new Uint8Array(modelBuffer);
|
|
4254
|
-
this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
|
|
4246
|
+
}
|
|
4247
|
+
|
|
4248
|
+
// Butterfly passes
|
|
4249
|
+
for (var len = 2; len <= n; len *= 2) {
|
|
4250
|
+
var halfLen = len / 2;
|
|
4251
|
+
var angle = -2 * Math.PI / len;
|
|
4252
|
+
var wRe = Math.cos(angle);
|
|
4253
|
+
var wIm = Math.sin(angle);
|
|
4254
|
+
|
|
4255
|
+
for (var i = 0; i < n; i += len) {
|
|
4256
|
+
var curRe = 1;
|
|
4257
|
+
var curIm = 0;
|
|
4258
|
+
for (var j = 0; j < halfLen; j++) {
|
|
4259
|
+
var a = i + j;
|
|
4260
|
+
var b = a + halfLen;
|
|
4261
|
+
var tRe = curRe * re[b] - curIm * im[b];
|
|
4262
|
+
var tIm = curRe * im[b] + curIm * re[b];
|
|
4263
|
+
re[b] = re[a] - tRe;
|
|
4264
|
+
im[b] = im[a] - tIm;
|
|
4265
|
+
re[a] += tRe;
|
|
4266
|
+
im[a] += tIm;
|
|
4267
|
+
var nextRe = curRe * wRe - curIm * wIm;
|
|
4268
|
+
curIm = curRe * wIm + curIm * wRe;
|
|
4269
|
+
curRe = nextRe;
|
|
4255
4270
|
}
|
|
4256
|
-
const loadTimeMs = performance.now() - startTime;
|
|
4257
|
-
logger5.info("Model loaded successfully", {
|
|
4258
|
-
backend: this._backend,
|
|
4259
|
-
loadTimeMs: Math.round(loadTimeMs),
|
|
4260
|
-
inputs: this.session.inputNames,
|
|
4261
|
-
outputs: this.session.outputNames
|
|
4262
|
-
});
|
|
4263
|
-
span?.setAttributes({
|
|
4264
|
-
"model.backend": this._backend,
|
|
4265
|
-
"model.load_time_ms": loadTimeMs,
|
|
4266
|
-
"model.cached": !isIOS()
|
|
4267
|
-
});
|
|
4268
|
-
span?.end();
|
|
4269
|
-
telemetry?.recordHistogram("omote.model.load_time", loadTimeMs, {
|
|
4270
|
-
model: "wav2arkit_cpu",
|
|
4271
|
-
backend: this._backend
|
|
4272
|
-
});
|
|
4273
|
-
logger5.debug("Running warmup inference");
|
|
4274
|
-
const warmupStart = performance.now();
|
|
4275
|
-
const silentAudio = new Float32Array(16e3);
|
|
4276
|
-
await this.infer(silentAudio);
|
|
4277
|
-
const warmupTimeMs = performance.now() - warmupStart;
|
|
4278
|
-
logger5.info("Warmup inference complete", {
|
|
4279
|
-
warmupTimeMs: Math.round(warmupTimeMs),
|
|
4280
|
-
backend: this._backend
|
|
4281
|
-
});
|
|
4282
|
-
telemetry?.recordHistogram("omote.model.warmup_time", warmupTimeMs, {
|
|
4283
|
-
model: "wav2arkit_cpu",
|
|
4284
|
-
backend: this._backend
|
|
4285
|
-
});
|
|
4286
|
-
return {
|
|
4287
|
-
backend: this._backend,
|
|
4288
|
-
loadTimeMs,
|
|
4289
|
-
inputNames: [...this.session.inputNames],
|
|
4290
|
-
outputNames: [...this.session.outputNames]
|
|
4291
|
-
};
|
|
4292
|
-
} catch (error) {
|
|
4293
|
-
span?.endWithError(error instanceof Error ? error : new Error(String(error)));
|
|
4294
|
-
telemetry?.incrementCounter("omote.errors.total", 1, {
|
|
4295
|
-
model: "wav2arkit_cpu",
|
|
4296
|
-
error_type: "load_failed"
|
|
4297
|
-
});
|
|
4298
|
-
throw error;
|
|
4299
|
-
} finally {
|
|
4300
|
-
this.isLoading = false;
|
|
4301
4271
|
}
|
|
4302
4272
|
}
|
|
4303
|
-
|
|
4304
|
-
|
|
4305
|
-
|
|
4306
|
-
|
|
4307
|
-
|
|
4308
|
-
|
|
4309
|
-
|
|
4310
|
-
|
|
4311
|
-
|
|
4312
|
-
|
|
4313
|
-
|
|
4314
|
-
|
|
4273
|
+
}
|
|
4274
|
+
|
|
4275
|
+
/** HTK mel scale */
|
|
4276
|
+
function htkMel(freq) {
|
|
4277
|
+
return 1127.0 * Math.log(1.0 + freq / 700.0);
|
|
4278
|
+
}
|
|
4279
|
+
|
|
4280
|
+
function htkMelInverse(mel) {
|
|
4281
|
+
return 700.0 * (Math.exp(mel / 1127.0) - 1.0);
|
|
4282
|
+
}
|
|
4283
|
+
|
|
4284
|
+
/**
|
|
4285
|
+
* Build triangular mel filterbank matrix
|
|
4286
|
+
*/
|
|
4287
|
+
function buildMelFilterbank(numBins, fftSize, sampleRate, lowFreq, highFreq) {
|
|
4288
|
+
var numFftBins = fftSize / 2 + 1;
|
|
4289
|
+
var lowMel = htkMel(lowFreq);
|
|
4290
|
+
var highMel = htkMel(highFreq);
|
|
4291
|
+
|
|
4292
|
+
// numBins + 2 equally spaced points in mel space
|
|
4293
|
+
var melPoints = new Float64Array(numBins + 2);
|
|
4294
|
+
for (var i = 0; i < numBins + 2; i++) {
|
|
4295
|
+
melPoints[i] = lowMel + (highMel - lowMel) * i / (numBins + 1);
|
|
4296
|
+
}
|
|
4297
|
+
|
|
4298
|
+
// Convert mel points to FFT bin indices (float, not rounded)
|
|
4299
|
+
var binFreqs = new Float64Array(numBins + 2);
|
|
4300
|
+
for (var i = 0; i < numBins + 2; i++) {
|
|
4301
|
+
binFreqs[i] = htkMelInverse(melPoints[i]) * fftSize / sampleRate;
|
|
4302
|
+
}
|
|
4303
|
+
|
|
4304
|
+
var filters = [];
|
|
4305
|
+
|
|
4306
|
+
for (var m = 0; m < numBins; m++) {
|
|
4307
|
+
var left = binFreqs[m];
|
|
4308
|
+
var center = binFreqs[m + 1];
|
|
4309
|
+
var right = binFreqs[m + 2];
|
|
4310
|
+
|
|
4311
|
+
var startBin = Math.max(0, Math.ceil(left));
|
|
4312
|
+
var endBin = Math.min(numFftBins - 1, Math.floor(right));
|
|
4313
|
+
|
|
4314
|
+
var weights = new Float32Array(endBin - startBin + 1);
|
|
4315
|
+
for (var k = startBin; k <= endBin; k++) {
|
|
4316
|
+
if (k <= center) {
|
|
4317
|
+
weights[k - startBin] = (center - left) > 0 ? (k - left) / (center - left) : 0;
|
|
4318
|
+
} else {
|
|
4319
|
+
weights[k - startBin] = (right - center) > 0 ? (right - k) / (right - center) : 0;
|
|
4320
|
+
}
|
|
4315
4321
|
}
|
|
4316
|
-
|
|
4317
|
-
|
|
4318
|
-
|
|
4319
|
-
|
|
4320
|
-
|
|
4322
|
+
|
|
4323
|
+
filters.push({ startBin: startBin, weights: weights });
|
|
4324
|
+
}
|
|
4325
|
+
|
|
4326
|
+
return filters;
|
|
4327
|
+
}
|
|
4328
|
+
|
|
4329
|
+
/** Create Hamming window */
|
|
4330
|
+
function createHammingWindow(length) {
|
|
4331
|
+
var w = new Float32Array(length);
|
|
4332
|
+
for (var i = 0; i < length; i++) {
|
|
4333
|
+
w[i] = 0.54 - 0.46 * Math.cos(2 * Math.PI * i / (length - 1));
|
|
4334
|
+
}
|
|
4335
|
+
return w;
|
|
4336
|
+
}
|
|
4337
|
+
|
|
4338
|
+
/**
|
|
4339
|
+
* Compute Kaldi-compatible log mel filterbank features
|
|
4340
|
+
*/
|
|
4341
|
+
function computeKaldiFbank(audio, sampleRate, numMelBins, opts) {
|
|
4342
|
+
var frameLengthMs = (opts && opts.frameLengthMs !== undefined) ? opts.frameLengthMs : 25;
|
|
4343
|
+
var frameShiftMs = (opts && opts.frameShiftMs !== undefined) ? opts.frameShiftMs : 10;
|
|
4344
|
+
var lowFreq = (opts && opts.lowFreq !== undefined) ? opts.lowFreq : 20;
|
|
4345
|
+
var highFreq = (opts && opts.highFreq !== undefined) ? opts.highFreq : (sampleRate / 2);
|
|
4346
|
+
var dither = (opts && opts.dither !== undefined) ? opts.dither : 0;
|
|
4347
|
+
var preemphasis = (opts && opts.preemphasis !== undefined) ? opts.preemphasis : 0.97;
|
|
4348
|
+
|
|
4349
|
+
var frameLengthSamples = Math.round(sampleRate * frameLengthMs / 1000);
|
|
4350
|
+
var frameShiftSamples = Math.round(sampleRate * frameShiftMs / 1000);
|
|
4351
|
+
|
|
4352
|
+
// Kaldi signal scaling: float [-1,1] -> int16 range
|
|
4353
|
+
var scaled = new Float32Array(audio.length);
|
|
4354
|
+
for (var i = 0; i < audio.length; i++) {
|
|
4355
|
+
scaled[i] = audio[i] * 32768;
|
|
4356
|
+
}
|
|
4357
|
+
|
|
4358
|
+
// Optional dithering
|
|
4359
|
+
if (dither > 0) {
|
|
4360
|
+
for (var i = 0; i < scaled.length; i++) {
|
|
4361
|
+
var u1 = Math.random();
|
|
4362
|
+
var u2 = Math.random();
|
|
4363
|
+
scaled[i] += dither * Math.sqrt(-2 * Math.log(u1 + 1e-10)) * Math.cos(2 * Math.PI * u2);
|
|
4364
|
+
}
|
|
4365
|
+
}
|
|
4366
|
+
|
|
4367
|
+
// Number of frames (snip_edges=true: only complete frames)
|
|
4368
|
+
var numFrames = Math.max(0, Math.floor((scaled.length - frameLengthSamples) / frameShiftSamples) + 1);
|
|
4369
|
+
if (numFrames === 0) {
|
|
4370
|
+
return new Float32Array(0);
|
|
4371
|
+
}
|
|
4372
|
+
|
|
4373
|
+
// FFT size: next power of 2
|
|
4374
|
+
var fftSize = 1;
|
|
4375
|
+
while (fftSize < frameLengthSamples) fftSize *= 2;
|
|
4376
|
+
|
|
4377
|
+
var numFftBins = fftSize / 2 + 1;
|
|
4378
|
+
|
|
4379
|
+
// Pre-compute window and filterbank
|
|
4380
|
+
var window = createHammingWindow(frameLengthSamples);
|
|
4381
|
+
var filters = buildMelFilterbank(numMelBins, fftSize, sampleRate, lowFreq, highFreq);
|
|
4382
|
+
|
|
4383
|
+
// Allocate output
|
|
4384
|
+
var output = new Float32Array(numFrames * numMelBins);
|
|
4385
|
+
|
|
4386
|
+
// FFT buffers (reused per frame)
|
|
4387
|
+
var fftRe = new Float64Array(fftSize);
|
|
4388
|
+
var fftIm = new Float64Array(fftSize);
|
|
4389
|
+
|
|
4390
|
+
for (var f = 0; f < numFrames; f++) {
|
|
4391
|
+
var offset = f * frameShiftSamples;
|
|
4392
|
+
|
|
4393
|
+
// Clear FFT buffers
|
|
4394
|
+
fftRe.fill(0);
|
|
4395
|
+
fftIm.fill(0);
|
|
4396
|
+
|
|
4397
|
+
// Extract frame with preemphasis and windowing
|
|
4398
|
+
for (var i = 0; i < frameLengthSamples; i++) {
|
|
4399
|
+
var sample = scaled[offset + i];
|
|
4400
|
+
// Preemphasis: y[n] = x[n] - coeff * x[n-1]
|
|
4401
|
+
if (preemphasis > 0 && i > 0) {
|
|
4402
|
+
sample -= preemphasis * scaled[offset + i - 1];
|
|
4403
|
+
} else if (preemphasis > 0 && i === 0 && offset > 0) {
|
|
4404
|
+
sample -= preemphasis * scaled[offset - 1];
|
|
4405
|
+
}
|
|
4406
|
+
// Apply window
|
|
4407
|
+
fftRe[i] = sample * window[i];
|
|
4408
|
+
}
|
|
4409
|
+
|
|
4410
|
+
// FFT
|
|
4411
|
+
fft(fftRe, fftIm);
|
|
4412
|
+
|
|
4413
|
+
// Power spectrum -> mel filterbank -> log
|
|
4414
|
+
var outOffset = f * numMelBins;
|
|
4415
|
+
for (var m = 0; m < numMelBins; m++) {
|
|
4416
|
+
var filter = filters[m];
|
|
4417
|
+
var energy = 0;
|
|
4418
|
+
for (var k = 0; k < filter.weights.length; k++) {
|
|
4419
|
+
var bin = filter.startBin + k;
|
|
4420
|
+
if (bin < numFftBins) {
|
|
4421
|
+
var powerSpec = fftRe[bin] * fftRe[bin] + fftIm[bin] * fftIm[bin];
|
|
4422
|
+
energy += filter.weights[k] * powerSpec;
|
|
4423
|
+
}
|
|
4424
|
+
}
|
|
4425
|
+
output[outOffset + m] = Math.log(Math.max(energy, 1e-10));
|
|
4426
|
+
}
|
|
4427
|
+
}
|
|
4428
|
+
|
|
4429
|
+
return output;
|
|
4430
|
+
}
|
|
4431
|
+
|
|
4432
|
+
/**
|
|
4433
|
+
* Apply Low Frame Rate stacking for SenseVoice
|
|
4434
|
+
*/
|
|
4435
|
+
function applyLFR(features, featureDim, lfrM, lfrN) {
|
|
4436
|
+
var numFrames = features.length / featureDim;
|
|
4437
|
+
if (numFrames === 0) return new Float32Array(0);
|
|
4438
|
+
|
|
4439
|
+
var leftPad = Math.floor((lfrM - 1) / 2); // 3 for lfrM=7
|
|
4440
|
+
var paddedLen = numFrames + leftPad;
|
|
4441
|
+
var numOutputFrames = Math.ceil(paddedLen / lfrN);
|
|
4442
|
+
var outputDim = featureDim * lfrM;
|
|
4443
|
+
|
|
4444
|
+
var output = new Float32Array(numOutputFrames * outputDim);
|
|
4445
|
+
|
|
4446
|
+
for (var i = 0; i < numOutputFrames; i++) {
|
|
4447
|
+
var startFrame = i * lfrN - leftPad;
|
|
4448
|
+
|
|
4449
|
+
for (var j = 0; j < lfrM; j++) {
|
|
4450
|
+
var srcFrame = startFrame + j;
|
|
4451
|
+
// Clamp to valid range
|
|
4452
|
+
if (srcFrame < 0) srcFrame = 0;
|
|
4453
|
+
if (srcFrame >= numFrames) srcFrame = numFrames - 1;
|
|
4454
|
+
|
|
4455
|
+
var srcOffset = srcFrame * featureDim;
|
|
4456
|
+
var dstOffset = i * outputDim + j * featureDim;
|
|
4457
|
+
for (var k = 0; k < featureDim; k++) {
|
|
4458
|
+
output[dstOffset + k] = features[srcOffset + k];
|
|
4459
|
+
}
|
|
4460
|
+
}
|
|
4461
|
+
}
|
|
4462
|
+
|
|
4463
|
+
return output;
|
|
4464
|
+
}
|
|
4465
|
+
|
|
4466
|
+
/**
|
|
4467
|
+
* Apply CMVN normalization in-place
|
|
4468
|
+
*/
|
|
4469
|
+
function applyCMVN(features, dim, negMeanVec, invStddevVec) {
|
|
4470
|
+
for (var i = 0; i < features.length; i++) {
|
|
4471
|
+
var d = i % dim;
|
|
4472
|
+
features[i] = (features[i] + negMeanVec[d]) * invStddevVec[d];
|
|
4473
|
+
}
|
|
4474
|
+
return features;
|
|
4475
|
+
}
|
|
4476
|
+
|
|
4477
|
+
/**
|
|
4478
|
+
* Parse CMVN vectors from comma-separated strings (stored in ONNX metadata)
|
|
4479
|
+
*/
|
|
4480
|
+
function parseCMVNFromMetadata(negMeanStr, invStddevStr) {
|
|
4481
|
+
var negMeanArr = new Float32Array(
|
|
4482
|
+
negMeanStr.split(',').map(function(s) { return parseFloat(s.trim()); })
|
|
4483
|
+
);
|
|
4484
|
+
var invStddevArr = new Float32Array(
|
|
4485
|
+
invStddevStr.split(',').map(function(s) { return parseFloat(s.trim()); })
|
|
4486
|
+
);
|
|
4487
|
+
return { negMean: negMeanArr, invStddev: invStddevArr };
|
|
4488
|
+
}
|
|
4489
|
+
|
|
4490
|
+
// \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
|
|
4491
|
+
// ctcDecoder.ts \u2014 inlined as plain JavaScript
|
|
4492
|
+
// \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
|
|
4493
|
+
|
|
4494
|
+
/** SenseVoice language ID -> string mapping */
|
|
4495
|
+
var LANGUAGE_IDS = {
|
|
4496
|
+
0: 'auto',
|
|
4497
|
+
3: 'zh',
|
|
4498
|
+
4: 'en',
|
|
4499
|
+
7: 'yue',
|
|
4500
|
+
11: 'ja',
|
|
4501
|
+
12: 'ko',
|
|
4502
|
+
13: 'nospeech'
|
|
4503
|
+
};
|
|
4504
|
+
|
|
4505
|
+
/** SenseVoice text normalization ID -> string mapping */
|
|
4506
|
+
var TEXT_NORM_IDS = {
|
|
4507
|
+
14: 'with_itn',
|
|
4508
|
+
15: 'without_itn'
|
|
4509
|
+
};
|
|
4510
|
+
|
|
4511
|
+
/** Resolve language string to SenseVoice language ID */
|
|
4512
|
+
function resolveLanguageId(language) {
|
|
4513
|
+
var map = {
|
|
4514
|
+
auto: 0,
|
|
4515
|
+
zh: 3,
|
|
4516
|
+
en: 4,
|
|
4517
|
+
yue: 7,
|
|
4518
|
+
ja: 11,
|
|
4519
|
+
ko: 12
|
|
4520
|
+
};
|
|
4521
|
+
return map[language] !== undefined ? map[language] : 0;
|
|
4522
|
+
}
|
|
4523
|
+
|
|
4524
|
+
/** Resolve text norm string to SenseVoice text norm ID */
|
|
4525
|
+
function resolveTextNormId(textNorm) {
|
|
4526
|
+
return textNorm === 'without_itn' ? 15 : 14;
|
|
4527
|
+
}
|
|
4528
|
+
|
|
4529
|
+
/**
|
|
4530
|
+
* Parse tokens.txt into a token ID -> string map
|
|
4531
|
+
*/
|
|
4532
|
+
function parseTokensFile(content) {
|
|
4533
|
+
var map = new Map();
|
|
4534
|
+
var lines = content.split('\\n');
|
|
4535
|
+
for (var idx = 0; idx < lines.length; idx++) {
|
|
4536
|
+
var trimmed = lines[idx].trim();
|
|
4537
|
+
if (!trimmed) continue;
|
|
4538
|
+
// Find the last space - token string may contain spaces
|
|
4539
|
+
var lastSpace = trimmed.lastIndexOf(' ');
|
|
4540
|
+
if (lastSpace === -1) continue;
|
|
4541
|
+
var token = trimmed.substring(0, lastSpace);
|
|
4542
|
+
var id = parseInt(trimmed.substring(lastSpace + 1), 10);
|
|
4543
|
+
if (!isNaN(id)) {
|
|
4544
|
+
map.set(id, token);
|
|
4545
|
+
}
|
|
4546
|
+
}
|
|
4547
|
+
return map;
|
|
4548
|
+
}
|
|
4549
|
+
|
|
4550
|
+
/**
 * Classify a SenseVoice structured token of the form <|value|>.
 * Returns { type, value } for language / emotion / event / textnorm
 * tokens, or null for ordinary text tokens.
 */
function parseStructuredToken(token) {
  const match = token.match(/^<\\|(.+)\\|>$/);
  if (!match) return null;

  const value = match[1];

  // Language tags
  const languages = ['zh', 'en', 'ja', 'ko', 'yue', 'nospeech'];
  if (languages.indexOf(value) !== -1) {
    return { type: 'language', value: value };
  }

  // Emotion tags
  const emotions = ['HAPPY', 'SAD', 'ANGRY', 'NEUTRAL', 'FEARFUL', 'DISGUSTED', 'SURPRISED', 'EMO_UNKNOWN'];
  if (emotions.indexOf(value) !== -1) {
    return { type: 'emotion', value: value };
  }

  // Audio event tags
  const events = ['Speech', 'BGM', 'Applause', 'Laughter', 'Crying', 'Coughing', 'Sneezing', 'EVENT_UNKNOWN'];
  if (events.indexOf(value) !== -1) {
    return { type: 'event', value: value };
  }

  // Text normalization tags
  const norms = ['withitn', 'woitn', 'with_itn', 'without_itn'];
  if (norms.indexOf(value) !== -1) {
    return { type: 'textnorm', value: value };
  }

  return null;
}
|
|
4583
|
+
|
|
4584
|
+
/**
 * CTC greedy decode: argmax per frame, collapse runs of identical IDs,
 * drop blank (0) and special (<s>=1, </s>=2) tokens, then map IDs to
 * token strings and peel off SenseVoice structured metadata tags.
 * Returns { text, language, emotion, event }.
 */
function ctcGreedyDecode(logits, seqLen, vocabSz, tokenMapLocal) {
  // Per-frame argmax over the vocabulary dimension.
  const frameIds = [];
  for (let frame = 0; frame < seqLen; frame++) {
    const base = frame * vocabSz;
    let best = 0;
    let bestScore = logits[base];
    for (let cls = 1; cls < vocabSz; cls++) {
      const score = logits[base + cls];
      if (score > bestScore) {
        bestScore = score;
        best = cls;
      }
    }
    frameIds.push(best);
  }

  // Collapse consecutive duplicates, then drop blank/special tokens.
  // (Collapse must happen BEFORE blank removal — standard CTC order.)
  const kept = [];
  let last = -1;
  for (const cur of frameIds) {
    if (cur !== last) {
      last = cur;
      if (cur !== 0 && cur !== 1 && cur !== 2) kept.push(cur);
    }
  }

  // Map IDs to strings; structured tags become metadata, the rest text.
  let language;
  let emotion;
  let event;
  const pieces = [];
  for (const tokenId of kept) {
    const tokenStr = tokenMapLocal.get(tokenId);
    if (!tokenStr) continue;
    const meta = parseStructuredToken(tokenStr);
    if (!meta) {
      pieces.push(tokenStr);
    } else if (meta.type === 'language') {
      language = meta.value;
    } else if (meta.type === 'emotion') {
      emotion = meta.value;
    } else if (meta.type === 'event') {
      event = meta.value;
    }
    // textnorm tags are intentionally dropped
  }

  // Join tokens and convert SentencePiece word boundaries (U+2581) to spaces.
  const text = pieces.join('').replace(/\\u2581/g, ' ').trim();

  return { text: text, language: language, emotion: emotion, event: event };
}
|
|
4646
|
+
|
|
4647
|
+
// \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
|
|
4648
|
+
// Worker globals and message handler
|
|
4649
|
+
// \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
|
|
4650
|
+
|
|
4651
|
+
/**
 * Load ONNX Runtime into the worker global scope from the given
 * wasmPaths base URL. No-op if ort is already present.
 */
async function loadOrt(wasmPaths) {
  if (ort) return;

  // Fetch the ORT script text and execute it through a blob URL so the
  // worker does not rely on cross-origin importScripts being allowed.
  const ortScriptUrl = wasmPaths + 'ort.wasm.min.js';
  const scriptResponse = await fetch(ortScriptUrl);
  const scriptSource = await scriptResponse.text();

  const scriptBlob = new Blob([scriptSource], { type: 'application/javascript' });
  const scriptBlobUrl = URL.createObjectURL(scriptBlob);
  importScripts(scriptBlobUrl);
  URL.revokeObjectURL(scriptBlobUrl);

  // importScripts exposes ort as a worker global.
  ort = self.ort;

  // Single-threaded SIMD WASM; no proxy worker (we already are one).
  ort.env.wasm.wasmPaths = wasmPaths;
  ort.env.wasm.numThreads = 1;
  ort.env.wasm.simd = true;
  ort.env.wasm.proxy = false;
}
|
|
4681
|
+
|
|
4682
|
+
/**
 * Load the SenseVoice model and tokens.
 *
 * Side effects on worker globals: sets tokenMap, languageId, textNormId,
 * session, vocabSize, and (when present in model metadata) the CMVN
 * stats negMean/invStddev.
 *
 * @param modelUrl    Absolute URL of the ONNX model
 * @param tokensUrl   Absolute URL of tokens.txt
 * @param isIOSDevice When true, pass the URL straight to ORT instead of
 *                    fetching an ArrayBuffer (memory-pressure workaround)
 * @param lang        SenseVoice language ID (integer)
 * @param textNorm    SenseVoice text-norm ID (integer)
 */
async function loadModel(modelUrl, tokensUrl, isIOSDevice, lang, textNorm) {
  // 1. Fetch and parse tokens.txt
  var tokensResponse = await fetch(tokensUrl);
  if (!tokensResponse.ok) {
    throw new Error('Failed to fetch tokens.txt: ' + tokensResponse.status + ' ' + tokensResponse.statusText);
  }
  var tokensText = await tokensResponse.text();
  tokenMap = parseTokensFile(tokensText);

  // 2. Store language/textNorm IDs
  languageId = lang;
  textNormId = textNorm;

  // 3. Create inference session
  var sessionOptions = {
    executionProviders: ['wasm'],
    graphOptimizationLevel: 'all',
  };

  if (isIOSDevice) {
    // iOS: pass URL string directly to ORT to avoid 239MB JS heap allocation
    // ORT fetches into WASM memory, keeping JS heap at ~2MB
    session = await ort.InferenceSession.create(modelUrl, sessionOptions);
  } else {
    // Desktop: fetch ArrayBuffer for potential caching
    var modelResponse = await fetch(modelUrl);
    if (!modelResponse.ok) {
      throw new Error('Failed to fetch model: ' + modelResponse.status + ' ' + modelResponse.statusText);
    }
    var modelBuffer = await modelResponse.arrayBuffer();
    var modelData = new Uint8Array(modelBuffer);
    session = await ort.InferenceSession.create(modelData, sessionOptions);
  }

  // 4. Try to read CMVN from model metadata
  // NOTE(review): session.handler.metadata is an ORT internal, not public
  // API — may break across onnxruntime-web versions; the catch below
  // deliberately degrades to unnormalized features.
  try {
    var metadata = session.handler && session.handler.metadata;
    if (metadata && metadata.neg_mean && metadata.inv_stddev) {
      var cmvn = parseCMVNFromMetadata(metadata.neg_mean, metadata.inv_stddev);
      negMean = cmvn.negMean;
      invStddev = cmvn.invStddev;
    }
  } catch (cmvnErr) {
    // CMVN not available \u2014 features will not be normalized
  }

  // 5. Determine vocab size from tokenMap (highest token ID + 1)
  vocabSize = 0;
  tokenMap.forEach(function(val, key) {
    if (key >= vocabSize) vocabSize = key + 1;
  });

  return {
    vocabSize: vocabSize,
    inputNames: session.inputNames.slice(),
    outputNames: session.outputNames.slice(),
  };
}
|
|
4743
|
+
|
|
4744
|
+
/**
 * Run transcription on audio samples.
 *
 * Pipeline: fbank features -> LFR stacking -> CMVN -> ORT inference ->
 * CTC greedy decode.
 *
 * @param audio Float32Array of samples; assumed 16 kHz mono in [-1, 1]
 *              (the 16000 below hard-codes this — caller must resample).
 * @returns { text, language, emotion, event, inferenceTimeMs, preprocessTimeMs }
 *          NOTE: inferenceTimeMs is measured from preprocessStart, so it
 *          includes preprocessing time as well.
 */
async function runTranscription(audio) {
  var preprocessStart = performance.now();

  // 1. Compute Kaldi fbank features [T, 80]
  var fbank = computeKaldiFbank(audio, 16000, 80);
  var numFrames = fbank.length / 80;

  // Too little audio for even one frame: return an empty result.
  if (numFrames === 0) {
    return {
      text: '',
      language: undefined,
      emotion: undefined,
      event: undefined,
      inferenceTimeMs: performance.now() - preprocessStart,
      preprocessTimeMs: performance.now() - preprocessStart,
    };
  }

  // 2. Apply LFR stacking [T_reduced, 560] (7 frames x 80 bins, shift 6)
  var lfrFeatures = applyLFR(fbank, 80, 7, 6);
  var numLfrFrames = lfrFeatures.length / 560;

  // 3. Apply CMVN normalization (in-place); skipped when the model
  // metadata did not provide stats.
  if (negMean && invStddev) {
    applyCMVN(lfrFeatures, 560, negMean, invStddev);
  }

  var preprocessTimeMs = performance.now() - preprocessStart;

  // 4. Build ORT tensors
  var feeds = {
    x: new ort.Tensor('float32', lfrFeatures, [1, numLfrFrames, 560]),
    x_length: new ort.Tensor('int32', new Int32Array([numLfrFrames]), [1]),
    language: new ort.Tensor('int32', new Int32Array([languageId]), [1]),
    text_norm: new ort.Tensor('int32', new Int32Array([textNormId]), [1]),
  };

  // 5. Run inference
  var results = await session.run(feeds);

  var logitsOutput = results['logits'];
  if (!logitsOutput) {
    throw new Error('Model output missing "logits" tensor');
  }

  // logits dims are [1, seqLen, vocab] per the indexing below.
  var logitsData = logitsOutput.data;
  var logitsDims = logitsOutput.dims;
  var seqLen = logitsDims[1];
  var modelVocabSize = logitsDims[2];

  // 6. CTC decode
  var decoded = ctcGreedyDecode(logitsData, seqLen, modelVocabSize, tokenMap);

  var totalTimeMs = performance.now() - preprocessStart;

  return {
    text: decoded.text,
    language: decoded.language,
    emotion: decoded.emotion,
    event: decoded.event,
    inferenceTimeMs: totalTimeMs,
    preprocessTimeMs: preprocessTimeMs,
  };
}
|
|
4811
|
+
|
|
4812
|
+
// Message handler: dispatches 'load' / 'transcribe' / 'dispose' requests
// from the main thread and replies with 'loaded' / 'result' / 'disposed'
// or a single 'error' message on failure.
self.onmessage = async function(e) {
  var msg = e.data;

  try {
    switch (msg.type) {
      case 'load': {
        var startTime = performance.now();
        await loadOrt(msg.wasmPaths);
        var info = await loadModel(msg.modelUrl, msg.tokensUrl, msg.isIOS, msg.language, msg.textNorm);
        var loadTimeMs = performance.now() - startTime;

        self.postMessage({
          type: 'loaded',
          vocabSize: info.vocabSize,
          inputNames: info.inputNames,
          outputNames: info.outputNames,
          loadTimeMs: loadTimeMs,
        });
        break;
      }

      case 'transcribe': {
        var result = await runTranscription(msg.audio);

        self.postMessage({
          type: 'result',
          text: result.text,
          language: result.language,
          emotion: result.emotion,
          event: result.event,
          inferenceTimeMs: result.inferenceTimeMs,
          preprocessTimeMs: result.preprocessTimeMs,
        });
        break;
      }

      case 'dispose': {
        // Release the ORT session and drop all cached model state.
        if (session) {
          await session.release();
          session = null;
        }
        ort = null;
        tokenMap = null;
        negMean = null;
        invStddev = null;
        self.postMessage({ type: 'disposed' });
        break;
      }

      default:
        self.postMessage({
          type: 'error',
          error: 'Unknown message type: ' + msg.type,
        });
    }
  } catch (err) {
    var errorMsg = err.message || String(err);
    // Handle raw C++ exception pointers from ORT WASM
    if (typeof err === 'number') {
      errorMsg = 'Raw C++ exception pointer (0x' + err.toString(16) + '). Likely OOM in WASM.';
    }
    self.postMessage({
      type: 'error',
      error: errorMsg,
    });
  }
};
|
|
4880
|
+
|
|
4881
|
+
// Error handler: forwards uncaught worker errors to the main thread as
// an 'error' message so pending callers can be rejected.
self.onerror = function(errorEvent) {
  var description = 'Worker error: ' + (errorEvent.message || String(errorEvent));
  self.postMessage({
    type: 'error',
    error: description,
  });
};
|
|
4888
|
+
`;
|
|
4889
|
+
/**
 * Main-thread proxy for the SenseVoice ASR worker.
 *
 * Spins up a Web Worker from the inline WORKER_SCRIPT blob, exchanges
 * typed messages with it (load / transcribe / dispose), serializes
 * inference calls through a promise queue, and marks itself "poisoned"
 * (unusable) if any worker operation times out.
 */
var SenseVoiceWorker = class {
  constructor(config) {
    this.worker = null;
    this.isLoading = false;
    this._isLoaded = false;
    // Inference queue for serialization
    this.inferenceQueue = Promise.resolve();
    // Session health: set to true if worker operation times out
    this.poisoned = false;
    // Pending message handlers, keyed by expected response message type
    this.pendingResolvers = /* @__PURE__ */ new Map();
    // Default tokens.txt to the model's directory when not given.
    const modelDir = config.modelUrl.substring(0, config.modelUrl.lastIndexOf("/"));
    const tokensUrl = config.tokensUrl ?? `${modelDir}/tokens.txt`;
    this.config = {
      modelUrl: config.modelUrl,
      tokensUrl,
      language: config.language ?? "auto",
      textNorm: config.textNorm ?? "with_itn"
    };
    this.languageId = resolveLanguageId(this.config.language);
    this.textNormId = resolveTextNormId(this.config.textNorm);
  }
  get isLoaded() {
    return this._isLoaded;
  }
  /**
   * Backend type (always 'wasm' for Worker, WebGPU not supported in Workers)
   */
  get backend() {
    return this._isLoaded ? "wasm" : null;
  }
  /**
   * Create the worker from inline script.
   * NOTE(review): the blob URL is revoked immediately after the Worker
   * constructor; this works in practice because the script fetch starts
   * synchronously, but it is spec-gray — verify on target browsers.
   */
  createWorker() {
    const blob = new Blob([WORKER_SCRIPT], { type: "application/javascript" });
    const blobUrl = URL.createObjectURL(blob);
    const worker = new Worker(blobUrl);
    URL.revokeObjectURL(blobUrl);
    worker.onmessage = (event) => {
      this.handleWorkerMessage(event.data);
    };
    worker.onerror = (error) => {
      // Fatal worker error: fail every pending operation.
      logger5.error("Worker error", { error: error.message });
      for (const [, resolver] of this.pendingResolvers) {
        resolver.reject(new Error(`Worker error: ${error.message}`));
      }
      this.pendingResolvers.clear();
    };
    return worker;
  }
  /**
   * Handle messages from worker: route to the resolver registered for
   * this message type; 'error' messages reject instead of resolve.
   */
  handleWorkerMessage(result) {
    const resolver = this.pendingResolvers.get(result.type);
    if (resolver) {
      this.pendingResolvers.delete(result.type);
      if (result.type === "error") {
        resolver.reject(new Error(result.error));
      } else {
        resolver.resolve(result);
      }
    }
  }
  /**
   * Send message to worker and wait for response of the given type,
   * rejecting (and poisoning the session) after timeoutMs.
   * NOTE(review): the "error" resolver entry is overwritten on every
   * call; safe while operations are serialized, but overlapping calls
   * with different expectedTypes would share one error slot.
   */
  sendMessage(message, expectedType, timeoutMs) {
    return new Promise((resolve, reject) => {
      if (!this.worker) {
        reject(new Error("Worker not initialized"));
        return;
      }
      const timeoutId = setTimeout(() => {
        this.pendingResolvers.delete(expectedType);
        this.poisoned = true;
        reject(new Error(`Worker operation timed out after ${timeoutMs}ms`));
      }, timeoutMs);
      this.pendingResolvers.set(expectedType, {
        resolve: (value) => {
          clearTimeout(timeoutId);
          resolve(value);
        },
        reject: (error) => {
          clearTimeout(timeoutId);
          reject(error);
        }
      });
      this.pendingResolvers.set("error", {
        resolve: () => {
        },
        // Never called for errors
        reject: (error) => {
          clearTimeout(timeoutId);
          this.pendingResolvers.delete(expectedType);
          reject(error);
        }
      });
      this.worker.postMessage(message);
    });
  }
  /**
   * Load the ONNX model in the worker
   *
   * @param onProgress - Optional progress callback. Fires once at 100% when load completes
   * (the worker downloads and loads the model internally, so granular progress is not available).
   * @returns { backend, loadTimeMs, inputNames, outputNames, vocabSize }
   * @throws if already loading, already loaded, or the worker load fails/times out.
   */
  async load(onProgress) {
    if (this.isLoading) {
      throw new Error("Model is already loading");
    }
    if (this._isLoaded) {
      throw new Error("Model already loaded. Call dispose() first.");
    }
    this.isLoading = true;
    const startTime = performance.now();
    const telemetry = getTelemetry();
    const span = telemetry?.startSpan("SenseVoiceWorker.load", {
      "model.url": this.config.modelUrl,
      "model.language": this.config.language
    });
    try {
      logger5.info("Creating SenseVoice worker...");
      this.worker = this.createWorker();
      logger5.info("Loading model in worker...", {
        modelUrl: this.config.modelUrl,
        tokensUrl: this.config.tokensUrl,
        language: this.config.language,
        textNorm: this.config.textNorm
      });
      const result = await this.sendMessage(
        {
          type: "load",
          modelUrl: resolveUrl(this.config.modelUrl),
          tokensUrl: resolveUrl(this.config.tokensUrl),
          wasmPaths: WASM_CDN_PATH2,
          isIOS: isIOS(),
          language: this.languageId,
          textNorm: this.textNormId
        },
        "loaded",
        LOAD_TIMEOUT_MS
      );
      this._isLoaded = true;
      const loadTimeMs = performance.now() - startTime;
      onProgress?.(1, 1);
      logger5.info("SenseVoice worker loaded successfully", {
        backend: "wasm",
        loadTimeMs: Math.round(loadTimeMs),
        workerLoadTimeMs: Math.round(result.loadTimeMs),
        vocabSize: result.vocabSize,
        language: this.config.language,
        textNorm: this.config.textNorm
      });
      span?.setAttributes({
        "model.backend": "wasm",
        "model.load_time_ms": loadTimeMs,
        "model.worker_load_time_ms": result.loadTimeMs,
        "model.vocab_size": result.vocabSize
      });
      span?.end();
      telemetry?.recordHistogram("omote.model.load_time", loadTimeMs, {
        model: "sensevoice-worker",
        backend: "wasm"
      });
      return {
        backend: "wasm",
        loadTimeMs,
        inputNames: result.inputNames,
        outputNames: result.outputNames,
        vocabSize: result.vocabSize
      };
    } catch (error) {
      span?.endWithError(error instanceof Error ? error : new Error(String(error)));
      telemetry?.incrementCounter("omote.errors.total", 1, {
        model: "sensevoice-worker",
        error_type: "load_failed"
      });
      // Tear down the half-initialized worker before rethrowing.
      if (this.worker) {
        this.worker.terminate();
        this.worker = null;
      }
      throw error;
    } finally {
      this.isLoading = false;
    }
  }
  /**
   * Transcribe audio samples to text
   *
   * @param audioSamples Float32Array of audio samples at 16kHz, [-1, 1] range
   * @returns Transcription result with text, emotion, language, and event
   * @throws if not loaded, or if a previous operation timed out (poisoned).
   */
  async transcribe(audioSamples) {
    if (!this._isLoaded || !this.worker) {
      throw new Error("Worker not loaded. Call load() first.");
    }
    if (this.poisoned) {
      throw new Error("SenseVoice worker timed out \u2014 inference unavailable until page reload");
    }
    // Copy so the caller's buffer is never transferred or mutated.
    const audio = new Float32Array(audioSamples);
    return this.queueInference(audio);
  }
  /**
   * Queue inference to serialize worker calls (one in flight at a time).
   */
  queueInference(audio) {
    return new Promise((resolve, reject) => {
      this.inferenceQueue = this.inferenceQueue.then(async () => {
        const telemetry = getTelemetry();
        const span = telemetry?.startSpan("SenseVoiceWorker.transcribe", {
          "inference.backend": "wasm",
          "inference.input_samples": audio.length
        });
        try {
          const startTime = performance.now();
          const result = await this.sendMessage(
            {
              type: "transcribe",
              audio
            },
            "result",
            INFERENCE_TIMEOUT_MS
          );
          const totalTimeMs = performance.now() - startTime;
          logger5.trace("Worker transcription complete", {
            text: result.text.substring(0, 50),
            language: result.language,
            emotion: result.emotion,
            event: result.event,
            preprocessTimeMs: Math.round(result.preprocessTimeMs * 100) / 100,
            inferenceTimeMs: Math.round(result.inferenceTimeMs * 100) / 100,
            roundTripMs: Math.round(totalTimeMs * 100) / 100
          });
          span?.setAttributes({
            "inference.duration_ms": totalTimeMs,
            "inference.worker_duration_ms": result.inferenceTimeMs,
            "inference.preprocess_ms": result.preprocessTimeMs,
            "inference.text_length": result.text.length
          });
          span?.end();
          telemetry?.recordHistogram("omote.inference.latency", totalTimeMs, {
            model: "sensevoice-worker",
            backend: "wasm"
          });
          telemetry?.incrementCounter("omote.inference.total", 1, {
            model: "sensevoice-worker",
            backend: "wasm",
            status: "success"
          });
          resolve({
            text: result.text,
            language: result.language,
            emotion: result.emotion,
            event: result.event,
            inferenceTimeMs: result.inferenceTimeMs,
            preprocessTimeMs: result.preprocessTimeMs
          });
        } catch (err) {
          const errMsg = err instanceof Error ? err.message : String(err);
          if (errMsg.includes("timed out")) {
            logger5.error("CRITICAL: Worker inference timed out \u2014 SenseVoice worker is dead. Page reload required.", {
              timeoutMs: INFERENCE_TIMEOUT_MS
            });
          } else {
            logger5.error("Worker inference failed", { error: errMsg });
          }
          span?.endWithError(err instanceof Error ? err : new Error(String(err)));
          telemetry?.incrementCounter("omote.inference.total", 1, {
            model: "sensevoice-worker",
            backend: "wasm",
            status: "error"
          });
          reject(err);
        }
      });
    });
  }
  /**
   * Dispose of the worker and free resources. Best-effort: a failed or
   * timed-out 'dispose' round-trip is swallowed and the worker is
   * terminated regardless.
   */
  async dispose() {
    if (this.worker) {
      try {
        await this.sendMessage({ type: "dispose" }, "disposed", INFERENCE_TIMEOUT_MS);
      } catch {
      }
      this.worker.terminate();
      this.worker = null;
    }
    this._isLoaded = false;
    this.poisoned = false;
    this.pendingResolvers.clear();
  }
  /**
   * Check if Web Workers are supported
   */
  static isSupported() {
    return typeof Worker !== "undefined";
  }
};
|
|
5191
|
+
|
|
5192
|
+
// src/inference/UnifiedInferenceWorker.ts
var logger6 = createLogger("UnifiedInferenceWorker");
// Base URL the worker loads onnxruntime-web and its WASM binaries from.
var WASM_CDN_PATH3 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
// Per-operation timeouts (ms): SV = SenseVoice ASR, CPU = Wav2ArkitCpu,
// VAD = Silero VAD.
var INIT_TIMEOUT_MS = 15e3; // 15 s
var SV_LOAD_TIMEOUT_MS = 3e4; // 30 s
var SV_INFER_TIMEOUT_MS = 1e4; // 10 s
var CPU_LOAD_TIMEOUT_MS = 6e4; // 60 s
var CPU_INFER_TIMEOUT_MS = 5e3; // 5 s
var VAD_LOAD_TIMEOUT_MS = 1e4; // 10 s
var VAD_INFER_TIMEOUT_MS = 1e3; // 1 s
var DISPOSE_TIMEOUT_MS = 5e3; // 5 s
|
|
5203
|
+
/**
 * Normalize a possibly-relative URL to an absolute href.
 * http(s) and blob: URLs pass through untouched; anything else is
 * resolved against the page origin, falling back to https://localhost
 * when no location exists (e.g. non-browser environments). Returns the
 * input unchanged if URL construction fails.
 */
function resolveUrl2(url) {
  const isAbsolute = /^https?:\/\//i.test(url) || /^blob:/i.test(url);
  if (isAbsolute) {
    return url;
  }
  try {
    const base = globalThis.location?.origin ?? "https://localhost";
    return new URL(url, base).href;
  } catch {
    return url;
  }
}
|
|
5211
|
+
var requestCounter = 0;
/** Generate a unique request ID: monotonic counter plus wall-clock timestamp. */
function nextRequestId() {
  requestCounter += 1;
  return "req_" + requestCounter + "_" + Date.now();
}
|
|
5215
|
+
var WORKER_SCRIPT2 = `
|
|
5216
|
+
// Unified Inference Worker Script
// Hosts SenseVoice + Wav2ArkitCpu + Silero VAD in a single ORT instance

// Shared onnxruntime-web handle (set once ORT is loaded).
var ort = null;

// SenseVoice state
var svSession = null;
var svTokenMap = null;
var svNegMean = null;
var svInvStddev = null;
var svLanguageId = 0;
var svTextNormId = 14; // 14 = 'with_itn' (see TEXT_NORM_IDS)
var svVocabSize = 0;

// Wav2ArkitCpu state
var cpuSession = null;

// Silero VAD state
var vadSession = null;
var vadSampleRate = 16000;
var vadChunkSize = 512; // samples per VAD inference chunk — assumed model input size, TODO confirm
var vadContextSize = 64;
|
|
5239
|
+
// \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
|
|
5240
|
+
// kaldiFbank.ts \u2014 inlined as plain JavaScript
|
|
5241
|
+
// \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
|
|
5242
|
+
|
|
5243
|
+
/**
 * In-place iterative radix-2 Cooley-Tukey FFT.
 * re/im hold the real and imaginary parts; length must be a power of two.
 * Operation order matches the textbook bit-reversal + butterfly scheme,
 * so results are bit-identical to the reference implementation.
 */
function fft(re, im) {
  const size = re.length;

  // Bit-reversal permutation.
  let rev = 0;
  for (let idx = 1; idx < size; idx++) {
    let mask = size >> 1;
    while (rev & mask) {
      rev ^= mask;
      mask >>= 1;
    }
    rev ^= mask;
    if (idx < rev) {
      let t = re[idx]; re[idx] = re[rev]; re[rev] = t;
      t = im[idx]; im[idx] = im[rev]; im[rev] = t;
    }
  }

  // Butterfly passes with an incrementally rotated twiddle factor.
  for (let span = 2; span <= size; span *= 2) {
    const half = span / 2;
    const theta = -2 * Math.PI / span;
    const stepRe = Math.cos(theta);
    const stepIm = Math.sin(theta);
    for (let start = 0; start < size; start += span) {
      let twRe = 1;
      let twIm = 0;
      for (let k = 0; k < half; k++) {
        const top = start + k;
        const bot = top + half;
        const prodRe = twRe * re[bot] - twIm * im[bot];
        const prodIm = twRe * im[bot] + twIm * re[bot];
        re[bot] = re[top] - prodRe;
        im[bot] = im[top] - prodIm;
        re[top] += prodRe;
        im[top] += prodIm;
        const rotated = twRe * stepRe - twIm * stepIm;
        twIm = twRe * stepIm + twIm * stepRe;
        twRe = rotated;
      }
    }
  }
}
|
|
5274
|
+
|
|
5275
|
+
/** Hz -> HTK mel scale. */
function htkMel(freq) {
  const ratio = freq / 700.0;
  return 1127.0 * Math.log(1.0 + ratio);
}
/** HTK mel -> Hz (inverse of htkMel). */
function htkMelInverse(mel) {
  const scaled = Math.exp(mel / 1127.0) - 1.0;
  return 700.0 * scaled;
}
|
|
5277
|
+
|
|
5278
|
+
/**
 * Build triangular mel filterbank weights (HTK mel convention).
 * Returns one { startBin, weights } entry per mel bin; weights cover
 * FFT bins [startBin, startBin + weights.length).
 */
function buildMelFilterbank(numBins, fftSize, sampleRate, lowFreq, highFreq) {
  const numFftBins = fftSize / 2 + 1;
  const melLow = htkMel(lowFreq);
  const melHigh = htkMel(highFreq);

  // Filter edges equally spaced on the mel scale...
  const melPoints = new Float64Array(numBins + 2);
  for (let p = 0; p < numBins + 2; p++) {
    melPoints[p] = melLow + (melHigh - melLow) * p / (numBins + 1);
  }
  // ...converted to fractional FFT bin positions.
  const binPositions = new Float64Array(numBins + 2);
  for (let p = 0; p < numBins + 2; p++) {
    binPositions[p] = htkMelInverse(melPoints[p]) * fftSize / sampleRate;
  }

  const filters = [];
  for (let m = 0; m < numBins; m++) {
    const left = binPositions[m];
    const center = binPositions[m + 1];
    const right = binPositions[m + 2];
    const startBin = Math.max(0, Math.ceil(left));
    const endBin = Math.min(numFftBins - 1, Math.floor(right));
    const weights = new Float32Array(endBin - startBin + 1);
    for (let k = startBin; k <= endBin; k++) {
      // Rising slope up to the center, falling slope after it.
      if (k <= center) {
        weights[k - startBin] = (center - left) > 0 ? (k - left) / (center - left) : 0;
      } else {
        weights[k - startBin] = (right - center) > 0 ? (right - k) / (right - center) : 0;
      }
    }
    filters.push({ startBin: startBin, weights: weights });
  }
  return filters;
}
|
|
5307
|
+
|
|
5308
|
+
/** Symmetric Hamming window of the given length (0.54 - 0.46*cos form). */
function createHammingWindow(length) {
  const win = new Float32Array(length);
  const denom = length - 1;
  for (let i = 0; i < length; i++) {
    win[i] = 0.54 - 0.46 * Math.cos(2 * Math.PI * i / denom);
  }
  return win;
}
|
|
5315
|
+
|
|
5316
|
+
// Kaldi-style log-mel filterbank features.
//
// audio      - Float32Array of samples in [-1, 1].
// sampleRate - sampling rate in Hz.
// numMelBins - number of mel filters (feature dimension per frame).
// opts       - optional overrides: frameLengthMs, frameShiftMs, lowFreq,
//              highFreq, dither, preemphasis. Defaults mirror common Kaldi
//              fbank settings (25ms/10ms, 20Hz..Nyquist, no dither, 0.97).
//
// Returns a flat Float32Array of [numFrames * numMelBins] log-mel energies,
// or an empty array when the audio is shorter than one frame.
//
// Depends on sibling helpers: createHammingWindow, buildMelFilterbank, fft.
function computeKaldiFbank(audio, sampleRate, numMelBins, opts) {
  var frameLengthMs = (opts && opts.frameLengthMs !== undefined) ? opts.frameLengthMs : 25;
  var frameShiftMs = (opts && opts.frameShiftMs !== undefined) ? opts.frameShiftMs : 10;
  var lowFreq = (opts && opts.lowFreq !== undefined) ? opts.lowFreq : 20;
  var highFreq = (opts && opts.highFreq !== undefined) ? opts.highFreq : (sampleRate / 2);
  var dither = (opts && opts.dither !== undefined) ? opts.dither : 0;
  var preemphasis = (opts && opts.preemphasis !== undefined) ? opts.preemphasis : 0.97;

  var frameLengthSamples = Math.round(sampleRate * frameLengthMs / 1000);
  var frameShiftSamples = Math.round(sampleRate * frameShiftMs / 1000);

  // Scale to 16-bit integer range, matching Kaldi's expected input levels.
  var scaled = new Float32Array(audio.length);
  for (var i = 0; i < audio.length; i++) { scaled[i] = audio[i] * 32768; }

  if (dither > 0) {
    // Box-Muller Gaussian dither; the 1e-10 keeps log() away from -Infinity
    // when Math.random() returns 0. Non-deterministic by design.
    for (var i = 0; i < scaled.length; i++) {
      var u1 = Math.random(), u2 = Math.random();
      scaled[i] += dither * Math.sqrt(-2 * Math.log(u1 + 1e-10)) * Math.cos(2 * Math.PI * u2);
    }
  }

  var numFrames = Math.max(0, Math.floor((scaled.length - frameLengthSamples) / frameShiftSamples) + 1);
  if (numFrames === 0) return new Float32Array(0);

  // FFT size: next power of two >= frame length; only the first half of the
  // spectrum (plus DC/Nyquist) is meaningful for real input.
  var fftSize = 1;
  while (fftSize < frameLengthSamples) fftSize *= 2;
  var numFftBins = fftSize / 2 + 1;

  var window = createHammingWindow(frameLengthSamples);
  var filters = buildMelFilterbank(numMelBins, fftSize, sampleRate, lowFreq, highFreq);
  var output = new Float32Array(numFrames * numMelBins);
  // Scratch buffers reused across frames (fft operates in place).
  var fftRe = new Float64Array(fftSize);
  var fftIm = new Float64Array(fftSize);

  for (var f = 0; f < numFrames; f++) {
    var offset = f * frameShiftSamples;
    fftRe.fill(0); fftIm.fill(0);
    for (var i = 0; i < frameLengthSamples; i++) {
      var sample = scaled[offset + i];
      // Pre-emphasis uses the previous *global* sample at frame boundaries
      // (offset - 1), not just the previous sample within the frame.
      if (preemphasis > 0 && i > 0) {
        sample -= preemphasis * scaled[offset + i - 1];
      } else if (preemphasis > 0 && i === 0 && offset > 0) {
        sample -= preemphasis * scaled[offset - 1];
      }
      fftRe[i] = sample * window[i];
    }
    fft(fftRe, fftIm);
    var outOffset = f * numMelBins;
    for (var m = 0; m < numMelBins; m++) {
      var filter = filters[m];
      var energy = 0;
      // Triangular mel filter applied to the power spectrum.
      for (var k = 0; k < filter.weights.length; k++) {
        var bin = filter.startBin + k;
        if (bin < numFftBins) {
          var powerSpec = fftRe[bin] * fftRe[bin] + fftIm[bin] * fftIm[bin];
          energy += filter.weights[k] * powerSpec;
        }
      }
      // Floor at 1e-10 so silent bins give a finite log energy.
      output[outOffset + m] = Math.log(Math.max(energy, 1e-10));
    }
  }
  return output;
}
|
|
5379
|
+
|
|
5380
|
+
/**
 * Low Frame Rate (LFR) stacking: concatenate lfrM consecutive feature frames
 * into one wide frame, advancing lfrN source frames per output frame.
 * Source indices that fall outside the input are clamped to the first/last
 * frame, which also provides the left padding of floor((lfrM-1)/2) frames.
 *
 * @param {Float32Array} features Flat [numFrames * featureDim] feature matrix.
 * @param {number} featureDim Width of each source frame.
 * @param {number} lfrM Number of frames stacked per output frame.
 * @param {number} lfrN Hop (in source frames) between output frames.
 * @returns {Float32Array} Flat [numOutputFrames * featureDim * lfrM] matrix.
 */
function applyLFR(features, featureDim, lfrM, lfrN) {
  var totalFrames = features.length / featureDim;
  if (totalFrames === 0) return new Float32Array(0);
  var padLeft = Math.floor((lfrM - 1) / 2);
  var outFrames = Math.ceil((totalFrames + padLeft) / lfrN);
  var outDim = featureDim * lfrM;
  var stacked = new Float32Array(outFrames * outDim);
  for (var out = 0; out < outFrames; out++) {
    var base = out * lfrN - padLeft;
    for (var slot = 0; slot < lfrM; slot++) {
      // Clamp into [0, totalFrames - 1]: edges repeat the boundary frame.
      var src = base + slot;
      if (src < 0) src = 0;
      if (src > totalFrames - 1) src = totalFrames - 1;
      var from = src * featureDim;
      var to = out * outDim + slot * featureDim;
      for (var d = 0; d < featureDim; d++) {
        stacked[to + d] = features[from + d];
      }
    }
  }
  return stacked;
}
|
|
5403
|
+
|
|
5404
|
+
/**
 * In-place cepstral mean/variance normalization:
 * x := (x + negMean[d]) * invStddev[d] for each feature dimension d.
 * The per-dimension vectors are cycled with modulo over the flat array.
 * Mutates `features` and returns the same array for chaining.
 */
function applyCMVN(features, dim, negMeanVec, invStddevVec) {
  var total = features.length;
  for (var idx = 0; idx < total; idx++) {
    var d = idx % dim;
    features[idx] = (features[idx] + negMeanVec[d]) * invStddevVec[d];
  }
  return features;
}
|
|
5411
|
+
|
|
5412
|
+
/**
 * Parse the comma-separated CMVN vectors exported in the ONNX model
 * metadata into typed arrays ready for applyCMVN.
 *
 * @param {string} negMeanStr Comma-separated negated means.
 * @param {string} invStddevStr Comma-separated inverse standard deviations.
 * @returns {{negMean: Float32Array, invStddev: Float32Array}}
 */
function parseCMVNFromMetadata(negMeanStr, invStddevStr) {
  var toVector = function(csv) {
    var parts = csv.split(',');
    return new Float32Array(parts.map(function(s) { return parseFloat(s.trim()); }));
  };
  return {
    negMean: toVector(negMeanStr),
    invStddev: toVector(invStddevStr),
  };
}
|
|
5421
|
+
|
|
5422
|
+
// \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
|
|
5423
|
+
// ctcDecoder.ts \u2014 inlined as plain JavaScript
|
|
5424
|
+
// \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
|
|
5425
|
+
|
|
5426
|
+
// SenseVoice prompt/token id tables. The numeric ids are positions in the
// exported model's special-token vocabulary; keep in sync with
// resolveLanguageIdW / resolveTextNormIdW below.
var LANGUAGE_IDS = { 0: 'auto', 3: 'zh', 4: 'en', 7: 'yue', 11: 'ja', 12: 'ko', 13: 'nospeech' };
// 14 = apply inverse text normalization (ITN), 15 = raw transcript.
var TEXT_NORM_IDS = { 14: 'with_itn', 15: 'without_itn' };
|
|
5428
|
+
|
|
5429
|
+
// Map a language code to its SenseVoice language prompt id.
// Unknown codes fall back to 0 ('auto'). Inverse of LANGUAGE_IDS.
function resolveLanguageIdW(language) {
  var ids = { auto: 0, zh: 3, en: 4, yue: 7, ja: 11, ko: 12 };
  var id = ids[language];
  return id === undefined ? 0 : id;
}
|
|
5433
|
+
|
|
5434
|
+
// Text-normalization prompt id: 15 disables inverse text normalization,
// anything else (including undefined) gets the default 14. See TEXT_NORM_IDS.
function resolveTextNormIdW(textNorm) {
  if (textNorm === 'without_itn') {
    return 15;
  }
  return 14;
}
|
|
5437
|
+
|
|
5438
|
+
/**
 * Parse a tokens.txt vocabulary file into an id -> token Map.
 * Each line is "<token> <id>"; the token may itself contain spaces, so the
 * id is taken from after the LAST space. Blank lines, lines without a
 * space, and lines with a non-numeric id are skipped.
 *
 * NOTE(review): this source is embedded in a template literal in the
 * bundle, so the '\\n' separator below renders as a real newline inside
 * the worker script.
 */
function parseTokensFile(content) {
  var result = new Map();
  var rows = content.split('\\n');
  for (var r = 0; r < rows.length; r++) {
    var line = rows[r].trim();
    if (!line) continue;
    var splitAt = line.lastIndexOf(' ');
    if (splitAt === -1) continue;
    var parsedId = parseInt(line.substring(splitAt + 1), 10);
    if (isNaN(parsedId)) continue;
    result.set(parsedId, line.substring(0, splitAt));
  }
  return result;
}
|
|
5452
|
+
|
|
5453
|
+
// Classify a special "<|...|>" vocabulary token emitted by SenseVoice into a
// language / emotion / audio-event / text-normalization marker; returns null
// for ordinary text tokens.
//
// NOTE(review): this code lives inside a template literal in the bundle, so
// the doubled backslashes in the regex below render as single escapes (\|)
// in the actual worker script, where the pattern matches literal <|...|>
// markers. Do not "simplify" the escaping without checking the outer string.
function parseStructuredToken(token) {
  var match = token.match(/^<\\|(.+)\\|>$/);
  if (!match) return null;
  var value = match[1];
  // Language markers — see LANGUAGE_IDS.
  if (value === 'zh' || value === 'en' || value === 'ja' || value === 'ko' || value === 'yue' || value === 'nospeech') {
    return { type: 'language', value: value };
  }
  var emotions = ['HAPPY', 'SAD', 'ANGRY', 'NEUTRAL', 'FEARFUL', 'DISGUSTED', 'SURPRISED', 'EMO_UNKNOWN'];
  if (emotions.indexOf(value) !== -1) return { type: 'emotion', value: value };
  var events = ['Speech', 'BGM', 'Applause', 'Laughter', 'Crying', 'Coughing', 'Sneezing', 'EVENT_UNKNOWN'];
  if (events.indexOf(value) !== -1) return { type: 'event', value: value };
  // Both short and long spellings of the text-normalization markers appear
  // in released SenseVoice vocabularies.
  if (value === 'withitn' || value === 'woitn' || value === 'with_itn' || value === 'without_itn') {
    return { type: 'textnorm', value: value };
  }
  // Structured-looking token of an unknown category.
  return null;
}
|
|
5469
|
+
|
|
5470
|
+
/**
 * Greedy CTC decode of SenseVoice logits.
 *
 * Steps: per-frame argmax over the vocabulary, collapse of consecutive
 * repeats (standard CTC), removal of special ids 0-2 (blank/BOS/EOS),
 * then separation of structured <|...|> markers from text tokens.
 *
 * @param logits Flat [seqLen * vocabSz] score matrix.
 * @param seqLen Number of time steps.
 * @param vocabSz Vocabulary size (row width).
 * @param tokenMapLocal Map of token id -> token string.
 * @returns {{text: string, language, emotion, event}} Decoded transcript
 *   plus any structured metadata seen in the token stream.
 */
function ctcGreedyDecode(logits, seqLen, vocabSz, tokenMapLocal) {
  // Per-frame argmax.
  var best = [];
  for (var t = 0; t < seqLen; t++) {
    var rowStart = t * vocabSz;
    var argmax = 0;
    var top = logits[rowStart];
    for (var v = 1; v < vocabSz; v++) {
      var score = logits[rowStart + v];
      if (score > top) {
        top = score;
        argmax = v;
      }
    }
    best.push(argmax);
  }
  // CTC collapse of consecutive duplicates.
  var collapsed = [];
  var last = -1;
  for (var i = 0; i < best.length; i++) {
    if (best[i] !== last) {
      collapsed.push(best[i]);
      last = best[i];
    }
  }
  // Drop blank and sentence-boundary ids.
  var kept = collapsed.filter(function(id) { return id !== 0 && id !== 1 && id !== 2; });
  var language = undefined, emotion = undefined, event = undefined;
  var pieces = [];
  for (var j = 0; j < kept.length; j++) {
    var tok = tokenMapLocal.get(kept[j]);
    if (!tok) continue;
    var marker = parseStructuredToken(tok);
    if (!marker) {
      pieces.push(tok);
    } else if (marker.type === 'language') {
      language = marker.value;
    } else if (marker.type === 'emotion') {
      emotion = marker.value;
    } else if (marker.type === 'event') {
      event = marker.value;
    }
    // textnorm markers are recognized but intentionally not surfaced.
  }
  var text = pieces.join('');
  // SentencePiece word-boundary marker (U+2581) becomes a space.
  text = text.replace(/\\u2581/g, ' ').trim();
  return { text: text, language: language, emotion: emotion, event: event };
}
|
|
5505
|
+
|
|
5506
|
+
// \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
|
|
5507
|
+
// blendshapeUtils.ts \u2014 inlined
|
|
5508
|
+
// \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
|
|
5509
|
+
|
|
5510
|
+
// Left/right blendshape index pairs that symmetrizeBlendshapes averages to
// remove model jitter between facial halves. Indices are positions in the
// Wav2Arkit output vector; the exact ARKit labels they correspond to are
// defined by the model export, not visible here — verify against its spec.
var SYMMETRIC_INDEX_PAIRS = [
  [23, 25], [32, 38], [43, 44], [29, 30], [27, 28], [45, 46],
  [35, 36], [47, 48], [33, 34], [49, 50], [6, 7], [0, 1],
  [3, 4], [8, 9], [16, 17], [10, 11], [12, 13], [14, 15],
  [18, 19], [20, 21],
];
|
|
5516
|
+
|
|
5517
|
+
/**
 * Return a copy of one blendshape frame in which every left/right pair from
 * SYMMETRIC_INDEX_PAIRS is replaced by the pair's mean, forcing the two
 * facial halves to agree. The input frame is left untouched.
 */
function symmetrizeBlendshapes(frame) {
  var out = new Float32Array(frame);
  SYMMETRIC_INDEX_PAIRS.forEach(function(pair) {
    var mean = (frame[pair[0]] + frame[pair[1]]) / 2;
    out[pair[0]] = mean;
    out[pair[1]] = mean;
  });
  return out;
}
|
|
5527
|
+
|
|
5528
|
+
// \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
|
|
5529
|
+
// Shared ORT loader
|
|
5530
|
+
// \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
|
|
5531
|
+
|
|
5532
|
+
// Lazily load onnxruntime-web inside the worker. The script is fetched and
// re-imported through a Blob URL so importScripts works even when
// `wasmPaths` points at a cross-origin CDN. Mutates the module-level `ort`
// global; subsequent calls are no-ops.
async function loadOrt(wasmPaths, isIOSDevice) {
  if (ort) return;
  var ortUrl = wasmPaths + 'ort.wasm.min.js';
  var response = await fetch(ortUrl);
  var scriptText = await response.text();
  // Same-origin Blob URL sidesteps importScripts cross-origin restrictions.
  var blob = new Blob([scriptText], { type: 'application/javascript' });
  var blobUrl = URL.createObjectURL(blob);
  importScripts(blobUrl);
  URL.revokeObjectURL(blobUrl);
  ort = self.ort;
  ort.env.wasm.wasmPaths = wasmPaths;
  // Single thread on iOS — presumably because of Safari worker/thread
  // limitations; confirm before changing.
  ort.env.wasm.numThreads = isIOSDevice ? 1 : 4;
  ort.env.wasm.simd = true;
  ort.env.wasm.proxy = false;
}
|
|
5547
|
+
|
|
5548
|
+
// \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
|
|
5549
|
+
// SenseVoice handlers
|
|
5550
|
+
// \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
|
|
5551
|
+
|
|
5552
|
+
// Load the SenseVoice ASR model: fetch and parse the vocabulary, create the
// ORT session, pull CMVN stats from model metadata when present, and record
// language/text-norm prompt ids. Populates the module-level sv* globals.
// Returns { vocabSize, inputNames, outputNames } for the host.
async function svLoad(msg) {
  var tokensResponse = await fetch(msg.tokensUrl);
  if (!tokensResponse.ok) throw new Error('Failed to fetch tokens.txt: ' + tokensResponse.status);
  var tokensText = await tokensResponse.text();
  svTokenMap = parseTokensFile(tokensText);
  svLanguageId = msg.language;
  svTextNormId = msg.textNorm;

  var sessionOptions = { executionProviders: ['wasm'], graphOptimizationLevel: 'all' };
  if (msg.isIOS) {
    // On iOS, let ORT fetch/stream the model itself — avoids holding the
    // whole buffer in JS memory at once.
    svSession = await ort.InferenceSession.create(msg.modelUrl, sessionOptions);
  } else {
    var modelResponse = await fetch(msg.modelUrl);
    if (!modelResponse.ok) throw new Error('Failed to fetch model: ' + modelResponse.status);
    var modelBuffer = await modelResponse.arrayBuffer();
    svSession = await ort.InferenceSession.create(new Uint8Array(modelBuffer), sessionOptions);
  }

  // CMVN vectors live in model metadata; `handler.metadata` is an internal
  // ORT-web surface, hence the defensive try/catch (best-effort on purpose).
  try {
    var metadata = svSession.handler && svSession.handler.metadata;
    if (metadata && metadata.neg_mean && metadata.inv_stddev) {
      var cmvn = parseCMVNFromMetadata(metadata.neg_mean, metadata.inv_stddev);
      svNegMean = cmvn.negMean;
      svInvStddev = cmvn.invStddev;
    }
  } catch (e) { /* CMVN not available */ }

  // Vocab size = highest token id + 1 (ids may be sparse).
  svVocabSize = 0;
  svTokenMap.forEach(function(val, key) { if (key >= svVocabSize) svVocabSize = key + 1; });

  return {
    vocabSize: svVocabSize,
    inputNames: svSession.inputNames.slice(),
    outputNames: svSession.outputNames.slice(),
  };
}
|
|
5588
|
+
|
|
5589
|
+
// Transcribe a 16 kHz mono Float32 waveform with the loaded SenseVoice
// session: fbank (80 mel bins) -> LFR stacking (m=7, n=6 -> 560-dim frames)
// -> optional CMVN -> ORT inference -> greedy CTC decode.
// Returns { text, language, emotion, event, inferenceTimeMs, preprocessTimeMs }.
async function svTranscribe(audio) {
  var preprocessStart = performance.now();
  var fbank = computeKaldiFbank(audio, 16000, 80);
  var numFrames = fbank.length / 80;
  if (numFrames === 0) {
    // Audio shorter than one analysis frame: empty transcript.
    return { text: '', inferenceTimeMs: performance.now() - preprocessStart, preprocessTimeMs: performance.now() - preprocessStart };
  }
  // 7 stacked frames x 80 bins = 560-dim LFR features.
  var lfrFeatures = applyLFR(fbank, 80, 7, 6);
  var numLfrFrames = lfrFeatures.length / 560;
  if (svNegMean && svInvStddev) applyCMVN(lfrFeatures, 560, svNegMean, svInvStddev);
  var preprocessTimeMs = performance.now() - preprocessStart;

  // Input names/dtypes match the sherpa-onnx SenseVoice export.
  var feeds = {
    x: new ort.Tensor('float32', lfrFeatures, [1, numLfrFrames, 560]),
    x_length: new ort.Tensor('int32', new Int32Array([numLfrFrames]), [1]),
    language: new ort.Tensor('int32', new Int32Array([svLanguageId]), [1]),
    text_norm: new ort.Tensor('int32', new Int32Array([svTextNormId]), [1]),
  };
  var results = await svSession.run(feeds);
  var logitsOutput = results['logits'];
  if (!logitsOutput) throw new Error('Model output missing "logits" tensor');

  // dims = [batch, time, vocab].
  var decoded = ctcGreedyDecode(logitsOutput.data, logitsOutput.dims[1], logitsOutput.dims[2], svTokenMap);
  // NOTE: inferenceTimeMs reported here includes preprocessing time.
  var totalTimeMs = performance.now() - preprocessStart;

  return {
    text: decoded.text, language: decoded.language, emotion: decoded.emotion, event: decoded.event,
    inferenceTimeMs: totalTimeMs, preprocessTimeMs: preprocessTimeMs,
  };
}
|
|
5619
|
+
|
|
5620
|
+
// Release the SenseVoice ORT session and drop the vocabulary/CMVN tables so
// they can be garbage-collected. Safe to call when nothing is loaded.
async function svDispose() {
  if (svSession) { await svSession.release(); svSession = null; }
  svTokenMap = null; svNegMean = null; svInvStddev = null;
}
|
|
5624
|
+
|
|
5625
|
+
// \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
|
|
5626
|
+
// Wav2ArkitCpu handlers
|
|
5627
|
+
// \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
|
|
5628
|
+
|
|
5629
|
+
// Load the Wav2Arkit (audio -> blendshapes) model, optionally with an
// external-data file for large weights, then run a one-second silent warmup
// so the first real inference is not hit by lazy initialization.
// Returns { inputNames, outputNames }.
async function cpuLoad(msg) {
  var sessionOptions = { executionProviders: ['wasm'], graphOptimizationLevel: 'all' };
  // External-data path must match the filename recorded inside the graph.
  var dataFilename = msg.externalDataUrl ? msg.externalDataUrl.split('/').pop() : null;

  if (msg.isIOS) {
    // On iOS, hand URLs straight to ORT so it streams instead of buffering.
    if (msg.externalDataUrl && dataFilename) {
      sessionOptions.externalData = [{ path: dataFilename, data: msg.externalDataUrl }];
    }
    cpuSession = await ort.InferenceSession.create(msg.modelUrl, sessionOptions);
  } else {
    var graphResponse = await fetch(msg.modelUrl);
    if (!graphResponse.ok) throw new Error('Failed to fetch model graph: ' + graphResponse.status);
    var graphBuffer = await graphResponse.arrayBuffer();
    if (msg.externalDataUrl && dataFilename) {
      var dataResponse = await fetch(msg.externalDataUrl);
      if (!dataResponse.ok) throw new Error('Failed to fetch external data: ' + dataResponse.status);
      var dataBuffer = await dataResponse.arrayBuffer();
      sessionOptions.externalData = [{ path: dataFilename, data: new Uint8Array(dataBuffer) }];
    }
    cpuSession = await ort.InferenceSession.create(new Uint8Array(graphBuffer), sessionOptions);
  }

  // Warmup: one second of silence at 16 kHz.
  var warmupAudio = new Float32Array(16000);
  var warmupTensor = new ort.Tensor('float32', warmupAudio, [1, warmupAudio.length]);
  await cpuSession.run({ audio_waveform: warmupTensor });

  return {
    inputNames: cpuSession.inputNames.slice(),
    outputNames: cpuSession.outputNames.slice(),
  };
}
|
|
5661
|
+
|
|
5662
|
+
// Run blendshape inference on a mono Float32 waveform and return the
// per-frame blendshape matrix with left/right symmetrization applied.
// Returns { flatBuffer: Float32Array [numFrames * numBlendshapes],
//           numFrames, numBlendshapes }.
async function cpuInfer(audio) {
  var tensor = new ort.Tensor('float32', audio, [1, audio.length]);
  var results = await cpuSession.run({ audio_waveform: tensor });
  var blendshapeOutput = results['blendshapes'];
  if (!blendshapeOutput) throw new Error('Missing blendshapes output from model');

  // dims = [batch, frames, blendshapes].
  var blendshapeData = blendshapeOutput.data;
  var numFrames = blendshapeOutput.dims[1];
  var numBlendshapes = blendshapeOutput.dims[2];

  var flatBuffer = new Float32Array(numFrames * numBlendshapes);
  for (var f = 0; f < numFrames; f++) {
    var offset = f * numBlendshapes;
    // slice() copies the frame so symmetrization never mutates ORT's buffer.
    var rawFrame = blendshapeData.slice(offset, offset + numBlendshapes);
    var symmetrized = symmetrizeBlendshapes(rawFrame);
    flatBuffer.set(symmetrized, offset);
  }
  return { flatBuffer: flatBuffer, numFrames: numFrames, numBlendshapes: numBlendshapes };
}
|
|
5681
|
+
|
|
5682
|
+
// Release the Wav2Arkit ORT session. Safe to call when nothing is loaded.
async function cpuDispose() {
  if (cpuSession) { await cpuSession.release(); cpuSession = null; }
}
|
|
5685
|
+
|
|
5686
|
+
// \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
|
|
5687
|
+
// Silero VAD handlers
|
|
5688
|
+
// \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
|
|
5689
|
+
|
|
5690
|
+
// Load the Silero VAD model and derive the chunk/context sizes required by
// its fixed input layout (512+64 samples at 16 kHz, 256+32 otherwise).
// Populates the module-level vad* globals; returns { inputNames, outputNames }.
async function vadLoad(msg) {
  vadSampleRate = msg.sampleRate;
  vadChunkSize = vadSampleRate === 16000 ? 512 : 256;
  vadContextSize = vadSampleRate === 16000 ? 64 : 32;

  var response = await fetch(msg.modelUrl);
  if (!response.ok) throw new Error('Failed to fetch VAD model: ' + response.status);
  var modelBuffer = await response.arrayBuffer();
  vadSession = await ort.InferenceSession.create(new Uint8Array(modelBuffer), {
    executionProviders: ['wasm'],
    graphOptimizationLevel: 'all',
  });

  return {
    inputNames: vadSession.inputNames.slice(),
    outputNames: vadSession.outputNames.slice(),
  };
}
|
|
5708
|
+
|
|
5709
|
+
// Run one Silero VAD step: prepend the audio context window to the chunk,
// feed it with the recurrent state and sample rate, and return the speech
// probability plus the updated state.
//
// audio   - Float32Array of vadChunkSize samples.
// state   - flat Float32Array recurrent state, shape [2, 1, 128].
// context - Float32Array of vadContextSize trailing samples from the
//           previous chunk.
// Returns { probability: number, newState: Float32Array }.
async function vadProcess(audio, state, context) {
  var inputSize = vadContextSize + vadChunkSize;
  var inputBuffer = new Float32Array(inputSize);
  inputBuffer.set(context, 0);
  inputBuffer.set(audio, vadContextSize);

  var inputTensor = new ort.Tensor('float32', new Float32Array(inputBuffer), [1, inputSize]);
  var stateTensor = new ort.Tensor('float32', new Float32Array(state), [2, 1, 128]);
  var srTensor;
  try {
    srTensor = new ort.Tensor('int64', new BigInt64Array([BigInt(vadSampleRate)]), []);
  } catch (e) {
    // Some ORT builds reject BigInt64Array data; fall back to a plain array.
    srTensor = new ort.Tensor('int64', [BigInt(vadSampleRate)], []);
  }

  var feeds = { 'input': inputTensor, 'state': stateTensor, 'sr': srTensor };
  var results = await vadSession.run(feeds);
  var outputTensor = results['output'];
  var newStateTensor = results['stateN'] || results['state'];
  if (!outputTensor) throw new Error('Missing output tensor from VAD model');
  // Fix: previously a model exposing neither 'stateN' nor 'state' crashed
  // with an opaque TypeError on `.data`; fail with an explicit error.
  if (!newStateTensor) throw new Error('Missing state output tensor from VAD model');

  return { probability: outputTensor.data[0], newState: new Float32Array(newStateTensor.data) };
}
|
|
5732
|
+
|
|
5733
|
+
// Fresh all-zero Silero recurrent state: shape [2, 1, 128], flattened.
function vadCreateInitialState() {
  var STATE_ELEMENTS = 2 * 1 * 128;
  return new Float32Array(STATE_ELEMENTS);
}
|
|
5736
|
+
|
|
5737
|
+
// Release the Silero VAD ORT session. Safe to call when nothing is loaded.
async function vadDispose() {
  if (vadSession) { await vadSession.release(); vadSession = null; }
}
|
|
5740
|
+
|
|
5741
|
+
// \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
|
|
5742
|
+
// Message handler
|
|
5743
|
+
// \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
|
|
5744
|
+
|
|
5745
|
+
// Worker message dispatcher. Every request carries a requestId that is
// echoed back on the matching response so the host can correlate replies.
// Any thrown error is converted into a single { type: 'error' } message.
self.onmessage = async function(e) {
  var msg = e.data;
  var requestId = msg.requestId;

  try {
    switch (msg.type) {
      // Load the ORT runtime (must happen before any model load).
      case 'init': {
        var startTime = performance.now();
        await loadOrt(msg.wasmPaths, msg.isIOS);
        self.postMessage({ type: 'init:done', requestId: requestId, loadTimeMs: performance.now() - startTime });
        break;
      }

      // ── SenseVoice ASR ──
      case 'sv:load': {
        var startTime = performance.now();
        var info = await svLoad(msg);
        self.postMessage({
          type: 'sv:loaded', requestId: requestId, vocabSize: info.vocabSize,
          inputNames: info.inputNames, outputNames: info.outputNames,
          loadTimeMs: performance.now() - startTime,
        });
        break;
      }

      case 'sv:transcribe': {
        var result = await svTranscribe(msg.audio);
        self.postMessage({
          type: 'sv:result', requestId: requestId,
          text: result.text, language: result.language, emotion: result.emotion, event: result.event,
          inferenceTimeMs: result.inferenceTimeMs, preprocessTimeMs: result.preprocessTimeMs,
        });
        break;
      }

      case 'sv:dispose': {
        await svDispose();
        self.postMessage({ type: 'sv:disposed', requestId: requestId });
        break;
      }

      // ── Wav2Arkit blendshapes ──
      case 'cpu:load': {
        var startTime = performance.now();
        var info = await cpuLoad(msg);
        self.postMessage({
          type: 'cpu:loaded', requestId: requestId,
          inputNames: info.inputNames, outputNames: info.outputNames,
          loadTimeMs: performance.now() - startTime,
        });
        break;
      }

      case 'cpu:infer': {
        var startTime = performance.now();
        var result = await cpuInfer(msg.audio);
        var inferenceTimeMs = performance.now() - startTime;
        // Transfer (not copy) the blendshape buffer back to the host.
        self.postMessage({
          type: 'cpu:result', requestId: requestId,
          blendshapes: result.flatBuffer, numFrames: result.numFrames,
          numBlendshapes: result.numBlendshapes, inferenceTimeMs: inferenceTimeMs,
        }, [result.flatBuffer.buffer]);
        break;
      }

      case 'cpu:dispose': {
        await cpuDispose();
        self.postMessage({ type: 'cpu:disposed', requestId: requestId });
        break;
      }

      // ── Silero VAD ──
      case 'vad:load': {
        var startTime = performance.now();
        var info = await vadLoad(msg);
        self.postMessage({
          type: 'vad:loaded', requestId: requestId,
          inputNames: info.inputNames, outputNames: info.outputNames,
          loadTimeMs: performance.now() - startTime,
        });
        break;
      }

      case 'vad:process': {
        var startTime = performance.now();
        var result = await vadProcess(msg.audio, msg.state, msg.context);
        self.postMessage({
          type: 'vad:result', requestId: requestId,
          probability: result.probability, state: result.newState,
          inferenceTimeMs: performance.now() - startTime,
        });
        break;
      }

      case 'vad:reset': {
        var state = vadCreateInitialState();
        self.postMessage({ type: 'vad:reset', requestId: requestId, state: state });
        break;
      }

      case 'vad:dispose': {
        await vadDispose();
        self.postMessage({ type: 'vad:disposed', requestId: requestId });
        break;
      }

      // Tear everything down, including the ORT runtime itself.
      case 'dispose-all': {
        await svDispose();
        await cpuDispose();
        await vadDispose();
        ort = null;
        self.postMessage({ type: 'dispose-all:done', requestId: requestId });
        break;
      }

      default:
        self.postMessage({ type: 'error', requestId: requestId, error: 'Unknown message type: ' + msg.type });
    }
  } catch (err) {
    var errorMsg = err.message || String(err);
    // Emscripten-built WASM can throw a raw pointer (a number) instead of an
    // Error object — typically on out-of-memory.
    if (typeof err === 'number') {
      errorMsg = 'Raw C++ exception pointer (0x' + err.toString(16) + '). Likely OOM in WASM.';
    }
    self.postMessage({ type: 'error', requestId: requestId, error: errorMsg });
  }
};
|
|
5868
|
+
|
|
5869
|
+
// Last-resort handler: surface uncaught worker errors to the host. requestId
// is null because no specific pending request can be attributed.
self.onerror = function(err) {
  self.postMessage({ type: 'error', requestId: null, error: 'Worker error: ' + (err.message || String(err)) });
};
|
|
5872
|
+
`;
|
|
5873
|
+
var UnifiedInferenceWorker = class {
|
|
5874
|
+
constructor() {
  // Underlying Worker instance; created lazily in init().
  this.worker = null;
  // requestId -> pending promise bookkeeping for in-flight messages —
  // presumably resolved/rejected in sendMessage; confirm elsewhere in class.
  this.pendingRequests = /* @__PURE__ */ new Map();
  this.initialized = false;
  // Marks the worker unrecoverable after a fatal error — presumably checked
  // by assertReady(); confirm elsewhere in the class.
  this.poisoned = false;
}
|
|
5880
|
+
/**
|
|
5881
|
+
* Initialize the worker (load ORT WASM from CDN)
|
|
5882
|
+
*/
|
|
5883
|
+
async init() {
  // Idempotent: repeated calls after a successful init are no-ops.
  if (this.initialized) return;
  const startTime = performance.now();
  const telemetry = getTelemetry();
  const span = telemetry?.startSpan("UnifiedInferenceWorker.init");
  try {
    logger6.info("Creating unified inference worker...");
    this.worker = this.createWorker();
    // Ask the worker to load the ORT runtime; resolves on 'init:done'.
    await this.sendMessage(
      { type: "init", wasmPaths: WASM_CDN_PATH3, isIOS: isIOS() },
      "init:done",
      INIT_TIMEOUT_MS
    );
    this.initialized = true;
    const loadTimeMs = performance.now() - startTime;
    logger6.info("Unified worker initialized", { loadTimeMs: Math.round(loadTimeMs) });
    span?.setAttributes({ "worker.init_time_ms": loadTimeMs });
    span?.end();
  } catch (error) {
    span?.endWithError(error instanceof Error ? error : new Error(String(error)));
    // Tear down the half-created worker so a later init() can retry cleanly.
    this.cleanup();
    throw error;
  }
}
|
|
5907
|
+
// ── SenseVoice ────────────────────────────────────────────────────────
|
|
5908
|
+
async loadSenseVoice(config) {
|
|
5909
|
+
this.assertReady();
|
|
5910
|
+
const startTime = performance.now();
|
|
5911
|
+
const result = await this.sendMessage(
|
|
5912
|
+
{
|
|
5913
|
+
type: "sv:load",
|
|
5914
|
+
modelUrl: resolveUrl2(config.modelUrl),
|
|
5915
|
+
tokensUrl: resolveUrl2(config.tokensUrl),
|
|
5916
|
+
isIOS: isIOS(),
|
|
5917
|
+
language: config.language,
|
|
5918
|
+
textNorm: config.textNorm
|
|
5919
|
+
},
|
|
5920
|
+
"sv:loaded",
|
|
5921
|
+
SV_LOAD_TIMEOUT_MS
|
|
5922
|
+
);
|
|
5923
|
+
const loadTimeMs = performance.now() - startTime;
|
|
5924
|
+
return {
|
|
5925
|
+
backend: "wasm",
|
|
5926
|
+
loadTimeMs,
|
|
5927
|
+
inputNames: result.inputNames,
|
|
5928
|
+
outputNames: result.outputNames,
|
|
5929
|
+
vocabSize: result.vocabSize
|
|
5930
|
+
};
|
|
5931
|
+
}
|
|
5932
|
+
async transcribe(audio) {
|
|
5933
|
+
this.assertReady();
|
|
5934
|
+
const result = await this.sendMessage(
|
|
5935
|
+
{ type: "sv:transcribe", audio },
|
|
5936
|
+
"sv:result",
|
|
5937
|
+
SV_INFER_TIMEOUT_MS
|
|
5938
|
+
);
|
|
5939
|
+
return {
|
|
5940
|
+
text: result.text,
|
|
5941
|
+
language: result.language,
|
|
5942
|
+
emotion: result.emotion,
|
|
5943
|
+
event: result.event,
|
|
5944
|
+
inferenceTimeMs: result.inferenceTimeMs,
|
|
5945
|
+
preprocessTimeMs: result.preprocessTimeMs
|
|
5946
|
+
};
|
|
5947
|
+
}
|
|
5948
|
+
async disposeSenseVoice() {
|
|
5949
|
+
if (!this.worker) return;
|
|
5950
|
+
await this.sendMessage({ type: "sv:dispose" }, "sv:disposed", DISPOSE_TIMEOUT_MS);
|
|
5951
|
+
}
|
|
5952
|
+
// ── Wav2ArkitCpu (Lip Sync) ──────────────────────────────────────────
|
|
5953
|
+
async loadLipSync(config) {
|
|
5954
|
+
this.assertReady();
|
|
5955
|
+
const startTime = performance.now();
|
|
5956
|
+
const result = await this.sendMessage(
|
|
5957
|
+
{
|
|
5958
|
+
type: "cpu:load",
|
|
5959
|
+
modelUrl: resolveUrl2(config.modelUrl),
|
|
5960
|
+
externalDataUrl: config.externalDataUrl ? resolveUrl2(config.externalDataUrl) : null,
|
|
5961
|
+
isIOS: isIOS()
|
|
5962
|
+
},
|
|
5963
|
+
"cpu:loaded",
|
|
5964
|
+
CPU_LOAD_TIMEOUT_MS
|
|
5965
|
+
);
|
|
5966
|
+
const loadTimeMs = performance.now() - startTime;
|
|
5967
|
+
return {
|
|
5968
|
+
backend: "wasm",
|
|
5969
|
+
loadTimeMs,
|
|
5970
|
+
inputNames: result.inputNames,
|
|
5971
|
+
outputNames: result.outputNames
|
|
5972
|
+
};
|
|
5973
|
+
}
|
|
5974
|
+
async inferLipSync(audio) {
|
|
5975
|
+
this.assertReady();
|
|
5976
|
+
return this.sendMessage(
|
|
5977
|
+
{ type: "cpu:infer", audio },
|
|
5978
|
+
"cpu:result",
|
|
5979
|
+
CPU_INFER_TIMEOUT_MS
|
|
5980
|
+
);
|
|
5981
|
+
}
|
|
5982
|
+
async disposeLipSync() {
|
|
5983
|
+
if (!this.worker) return;
|
|
5984
|
+
await this.sendMessage({ type: "cpu:dispose" }, "cpu:disposed", DISPOSE_TIMEOUT_MS);
|
|
5985
|
+
}
|
|
5986
|
+
// ── Silero VAD ────────────────────────────────────────────────────────
|
|
5987
|
+
async loadVAD(config) {
|
|
5988
|
+
this.assertReady();
|
|
5989
|
+
const startTime = performance.now();
|
|
5990
|
+
const chunkSize = config.sampleRate === 16e3 ? 512 : 256;
|
|
5991
|
+
const result = await this.sendMessage(
|
|
5992
|
+
{
|
|
5993
|
+
type: "vad:load",
|
|
5994
|
+
modelUrl: resolveUrl2(config.modelUrl),
|
|
5995
|
+
sampleRate: config.sampleRate
|
|
5996
|
+
},
|
|
5997
|
+
"vad:loaded",
|
|
5998
|
+
VAD_LOAD_TIMEOUT_MS
|
|
5999
|
+
);
|
|
6000
|
+
const loadTimeMs = performance.now() - startTime;
|
|
6001
|
+
return {
|
|
6002
|
+
backend: "wasm",
|
|
6003
|
+
loadTimeMs,
|
|
6004
|
+
inputNames: result.inputNames,
|
|
6005
|
+
outputNames: result.outputNames,
|
|
6006
|
+
sampleRate: config.sampleRate,
|
|
6007
|
+
chunkSize
|
|
6008
|
+
};
|
|
6009
|
+
}
|
|
6010
|
+
async processVAD(audio, state, context) {
|
|
6011
|
+
this.assertReady();
|
|
6012
|
+
return this.sendMessage(
|
|
6013
|
+
{ type: "vad:process", audio, state, context },
|
|
6014
|
+
"vad:result",
|
|
6015
|
+
VAD_INFER_TIMEOUT_MS
|
|
6016
|
+
);
|
|
6017
|
+
}
|
|
6018
|
+
async resetVAD() {
|
|
6019
|
+
this.assertReady();
|
|
6020
|
+
const result = await this.sendMessage(
|
|
6021
|
+
{ type: "vad:reset" },
|
|
6022
|
+
"vad:reset",
|
|
6023
|
+
VAD_INFER_TIMEOUT_MS
|
|
6024
|
+
);
|
|
6025
|
+
return result.state;
|
|
6026
|
+
}
|
|
6027
|
+
async disposeVAD() {
|
|
6028
|
+
if (!this.worker) return;
|
|
6029
|
+
await this.sendMessage({ type: "vad:dispose" }, "vad:disposed", DISPOSE_TIMEOUT_MS);
|
|
6030
|
+
}
|
|
6031
|
+
// ── Lifecycle ─────────────────────────────────────────────────────────
|
|
6032
|
+
async dispose() {
|
|
6033
|
+
if (this.worker) {
|
|
6034
|
+
try {
|
|
6035
|
+
await this.sendMessage({ type: "dispose-all" }, "dispose-all:done", DISPOSE_TIMEOUT_MS);
|
|
6036
|
+
} catch {
|
|
6037
|
+
}
|
|
6038
|
+
this.worker.terminate();
|
|
6039
|
+
this.worker = null;
|
|
6040
|
+
}
|
|
6041
|
+
this.initialized = false;
|
|
6042
|
+
this.poisoned = false;
|
|
6043
|
+
this.rejectAllPending("Worker disposed");
|
|
6044
|
+
this.pendingRequests.clear();
|
|
6045
|
+
}
|
|
6046
|
+
/** Check if the worker is initialized and not poisoned */
|
|
6047
|
+
get isReady() {
|
|
6048
|
+
return this.initialized && !this.poisoned && this.worker !== null;
|
|
6049
|
+
}
|
|
6050
|
+
/** Check if Web Workers are supported */
|
|
6051
|
+
static isSupported() {
|
|
6052
|
+
return typeof Worker !== "undefined";
|
|
6053
|
+
}
|
|
6054
|
+
// ── Private ───────────────────────────────────────────────────────────
|
|
6055
|
+
assertReady() {
|
|
6056
|
+
if (!this.initialized || !this.worker) {
|
|
6057
|
+
throw new Error("UnifiedInferenceWorker not initialized. Call init() first.");
|
|
6058
|
+
}
|
|
6059
|
+
if (this.poisoned) {
|
|
6060
|
+
throw new Error("UnifiedInferenceWorker timed out \u2014 unavailable until page reload");
|
|
6061
|
+
}
|
|
6062
|
+
}
|
|
6063
|
+
createWorker() {
|
|
6064
|
+
const blob = new Blob([WORKER_SCRIPT2], { type: "application/javascript" });
|
|
6065
|
+
const blobUrl = URL.createObjectURL(blob);
|
|
6066
|
+
const worker = new Worker(blobUrl);
|
|
6067
|
+
URL.revokeObjectURL(blobUrl);
|
|
6068
|
+
worker.onmessage = (event) => {
|
|
6069
|
+
this.handleWorkerMessage(event.data);
|
|
6070
|
+
};
|
|
6071
|
+
worker.onerror = (error) => {
|
|
6072
|
+
logger6.error("Unified worker error", { error: error.message });
|
|
6073
|
+
this.rejectAllPending(`Worker error: ${error.message}`);
|
|
6074
|
+
};
|
|
6075
|
+
return worker;
|
|
6076
|
+
}
|
|
6077
|
+
handleWorkerMessage(data) {
|
|
6078
|
+
const requestId = data.requestId;
|
|
6079
|
+
if (data.type === "error") {
|
|
6080
|
+
if (requestId && this.pendingRequests.has(requestId)) {
|
|
6081
|
+
const pending = this.pendingRequests.get(requestId);
|
|
6082
|
+
clearTimeout(pending.timeout);
|
|
6083
|
+
this.pendingRequests.delete(requestId);
|
|
6084
|
+
pending.reject(new Error(data.error));
|
|
6085
|
+
} else {
|
|
6086
|
+
logger6.error("Worker broadcast error", { error: data.error });
|
|
6087
|
+
this.rejectAllPending(data.error);
|
|
6088
|
+
}
|
|
6089
|
+
return;
|
|
6090
|
+
}
|
|
6091
|
+
if (requestId && this.pendingRequests.has(requestId)) {
|
|
6092
|
+
const pending = this.pendingRequests.get(requestId);
|
|
6093
|
+
clearTimeout(pending.timeout);
|
|
6094
|
+
this.pendingRequests.delete(requestId);
|
|
6095
|
+
pending.resolve(data);
|
|
6096
|
+
}
|
|
6097
|
+
}
|
|
6098
|
+
sendMessage(message, expectedType, timeoutMs) {
|
|
6099
|
+
return new Promise((resolve, reject) => {
|
|
6100
|
+
if (!this.worker) {
|
|
6101
|
+
reject(new Error("Worker not initialized"));
|
|
6102
|
+
return;
|
|
6103
|
+
}
|
|
6104
|
+
const requestId = nextRequestId();
|
|
6105
|
+
const timeout = setTimeout(() => {
|
|
6106
|
+
this.pendingRequests.delete(requestId);
|
|
6107
|
+
this.poisoned = true;
|
|
6108
|
+
logger6.error("CRITICAL: Worker operation timed out \u2014 worker is dead", {
|
|
6109
|
+
type: message.type,
|
|
6110
|
+
timeoutMs
|
|
6111
|
+
});
|
|
6112
|
+
reject(new Error(`Worker operation '${message.type}' timed out after ${timeoutMs}ms`));
|
|
6113
|
+
}, timeoutMs);
|
|
6114
|
+
this.pendingRequests.set(requestId, {
|
|
6115
|
+
resolve,
|
|
6116
|
+
reject,
|
|
6117
|
+
timeout
|
|
6118
|
+
});
|
|
6119
|
+
this.worker.postMessage({ ...message, requestId });
|
|
6120
|
+
});
|
|
6121
|
+
}
|
|
6122
|
+
rejectAllPending(reason) {
|
|
6123
|
+
for (const [, pending] of this.pendingRequests) {
|
|
6124
|
+
clearTimeout(pending.timeout);
|
|
6125
|
+
pending.reject(new Error(reason));
|
|
6126
|
+
}
|
|
6127
|
+
this.pendingRequests.clear();
|
|
6128
|
+
}
|
|
6129
|
+
cleanup() {
|
|
6130
|
+
if (this.worker) {
|
|
6131
|
+
this.worker.terminate();
|
|
6132
|
+
this.worker = null;
|
|
6133
|
+
}
|
|
6134
|
+
this.initialized = false;
|
|
6135
|
+
this.rejectAllPending("Worker cleanup");
|
|
6136
|
+
this.pendingRequests.clear();
|
|
6137
|
+
}
|
|
6138
|
+
};
|
|
6139
|
+
var SenseVoiceUnifiedAdapter = class {
  /**
   * SenseVoice ASR adapter that delegates all work to a shared
   * UnifiedInferenceWorker. Transcriptions are serialized through an
   * internal promise queue so the worker never sees concurrent requests.
   *
   * @param worker - a UnifiedInferenceWorker (already init()-ed by caller).
   * @param config - { modelUrl, tokensUrl?, language?, textNorm? }.
   */
  constructor(worker, config) {
    this._isLoaded = false;
    // Serialization chain: each transcribe() appends itself to this promise.
    this.inferenceQueue = Promise.resolve();
    this.worker = worker;
    const { modelUrl } = config;
    // Default tokensUrl sits next to the model file.
    const modelDir = modelUrl.substring(0, modelUrl.lastIndexOf("/"));
    this.config = {
      modelUrl,
      tokensUrl: config.tokensUrl ?? `${modelDir}/tokens.txt`,
      language: config.language ?? "auto",
      textNorm: config.textNorm ?? "with_itn"
    };
    // Numeric ids the worker protocol expects.
    this.languageId = resolveLanguageId(this.config.language);
    this.textNormId = resolveTextNormId(this.config.textNorm);
  }
  /** True once load() has succeeded. */
  get isLoaded() {
    return this._isLoaded;
  }
  /** Backend identifier ("wasm") when loaded, otherwise null. */
  get backend() {
    if (!this._isLoaded) return null;
    return "wasm";
  }
  /**
   * Load the model in the shared worker.
   * @param onProgress - optional (done, total) callback; invoked once as (1, 1).
   * @returns the worker's load result (backend, loadTimeMs, vocabSize, ...).
   */
  async load(onProgress) {
    const tel = getTelemetry();
    const loadSpan = tel?.startSpan("SenseVoiceUnifiedAdapter.load", {
      "model.url": this.config.modelUrl
    });
    try {
      const loadResult = await this.worker.loadSenseVoice({
        modelUrl: this.config.modelUrl,
        tokensUrl: this.config.tokensUrl,
        language: this.languageId,
        textNorm: this.textNormId
      });
      this._isLoaded = true;
      onProgress?.(1, 1);
      logger6.info("SenseVoice loaded via unified worker", {
        backend: "wasm",
        loadTimeMs: Math.round(loadResult.loadTimeMs),
        vocabSize: loadResult.vocabSize
      });
      loadSpan?.setAttributes({ "model.backend": "wasm", "model.load_time_ms": loadResult.loadTimeMs });
      loadSpan?.end();
      tel?.recordHistogram("omote.model.load_time", loadResult.loadTimeMs, {
        model: "sensevoice-unified",
        backend: "wasm"
      });
      return loadResult;
    } catch (error) {
      loadSpan?.endWithError(error instanceof Error ? error : new Error(String(error)));
      throw error;
    }
  }
  /**
   * Transcribe audio samples. Calls are queued so only one transcription
   * runs in the worker at a time; the input is copied up-front so the
   * caller may reuse its buffer immediately.
   * @param audioSamples - array-like of float audio samples.
   */
  async transcribe(audioSamples) {
    if (!this._isLoaded) throw new Error("Model not loaded. Call load() first.");
    const audio = new Float32Array(audioSamples);
    // Append to the serialization chain; expose the task's own outcome to
    // the caller while keeping the chain itself always-resolved.
    const task = this.inferenceQueue.then(() => this.worker.transcribe(audio));
    this.inferenceQueue = task.then(() => undefined, () => undefined);
    return task;
  }
  /** Release the worker-side model if loaded. Safe to call repeatedly. */
  async dispose() {
    if (!this._isLoaded) return;
    await this.worker.disposeSenseVoice();
    this._isLoaded = false;
  }
};
|
|
6212
|
+
var Wav2ArkitCpuUnifiedAdapter = class {
  /**
   * Lip-sync (audio -> blendshape frames) adapter backed by the shared
   * UnifiedInferenceWorker. Inference calls are serialized via an internal
   * promise queue.
   *
   * @param worker - a UnifiedInferenceWorker (already init()-ed by caller).
   * @param config - { modelUrl, externalDataUrl? } (externalDataUrl: false
   *   disables the external-data sidecar entirely).
   */
  constructor(worker, config) {
    this.modelId = "wav2arkit_cpu";
    this._isLoaded = false;
    // Serialization chain: each infer() appends itself to this promise.
    this.inferenceQueue = Promise.resolve();
    this.worker = worker;
    this.config = config;
  }
  /** True once load() has succeeded. */
  get isLoaded() {
    return this._isLoaded;
  }
  /** Backend identifier ("wasm") when loaded, otherwise null. */
  get backend() {
    if (!this._isLoaded) return null;
    return "wasm";
  }
  /**
   * Load the model in the shared worker.
   * The external-data URL defaults to `<modelUrl>.data` unless the config
   * explicitly sets externalDataUrl to false.
   * @returns the worker's load result (backend, loadTimeMs, ...).
   */
  async load() {
    const tel = getTelemetry();
    const loadSpan = tel?.startSpan("Wav2ArkitCpuUnifiedAdapter.load", {
      "model.url": this.config.modelUrl
    });
    try {
      let dataUrl = null;
      if (this.config.externalDataUrl !== false) {
        dataUrl = this.config.externalDataUrl || `${this.config.modelUrl}.data`;
      }
      const loadResult = await this.worker.loadLipSync({
        modelUrl: this.config.modelUrl,
        externalDataUrl: dataUrl || null
      });
      this._isLoaded = true;
      logger6.info("Wav2ArkitCpu loaded via unified worker", {
        backend: "wasm",
        loadTimeMs: Math.round(loadResult.loadTimeMs)
      });
      loadSpan?.setAttributes({ "model.backend": "wasm", "model.load_time_ms": loadResult.loadTimeMs });
      loadSpan?.end();
      tel?.recordHistogram("omote.model.load_time", loadResult.loadTimeMs, {
        model: "wav2arkit_cpu-unified",
        backend: "wasm"
      });
      return loadResult;
    } catch (error) {
      loadSpan?.endWithError(error instanceof Error ? error : new Error(String(error)));
      throw error;
    }
  }
  /**
   * Run lip-sync inference on raw audio samples. Calls are queued so only
   * one inference runs in the worker at a time; the input is copied so the
   * caller may reuse its buffer immediately.
   * @param audioSamples - array-like of float audio samples.
   * @param _identityIndex - accepted for interface parity; unused here.
   * @returns { blendshapes: Float32Array[] (one per frame), numFrames, inferenceTimeMs }
   */
  async infer(audioSamples, _identityIndex) {
    if (!this._isLoaded) throw new Error("Model not loaded. Call load() first.");
    const samples = new Float32Array(audioSamples);
    const task = this.inferenceQueue.then(async () => {
      const tel = getTelemetry();
      const inferSpan = tel?.startSpan("Wav2ArkitCpuUnifiedAdapter.infer", {
        "inference.input_samples": samples.length
      });
      try {
        const t0 = performance.now();
        const raw = await this.worker.inferLipSync(samples);
        const inferenceTimeMs = performance.now() - t0;
        // The worker returns one flat buffer; split it into per-frame views.
        const { numFrames, numBlendshapes } = raw;
        const flat = raw.blendshapes;
        const blendshapes = Array.from(
          { length: numFrames },
          (_, f) => flat.slice(f * numBlendshapes, (f + 1) * numBlendshapes)
        );
        inferSpan?.setAttributes({
          "inference.duration_ms": inferenceTimeMs,
          "inference.frames": numFrames
        });
        inferSpan?.end();
        return { blendshapes, numFrames, inferenceTimeMs };
      } catch (err) {
        inferSpan?.endWithError(err instanceof Error ? err : new Error(String(err)));
        throw err;
      }
    });
    // Keep the chain always-resolved while exposing the task's own outcome.
    this.inferenceQueue = task.then(() => undefined, () => undefined);
    return task;
  }
  /** Release the worker-side model if loaded. Safe to call repeatedly. */
  async dispose() {
    if (!this._isLoaded) return;
    await this.worker.disposeLipSync();
    this._isLoaded = false;
  }
};
|
|
6293
|
+
var SileroVADUnifiedAdapter = class {
  /**
   * Silero voice-activity-detection adapter backed by the shared
   * UnifiedInferenceWorker. The recurrent model state and audio context
   * window are kept HERE on the main thread and shipped to the worker on
   * every process() call; process() also maintains a rolling pre-speech
   * buffer handed to the caller at the silence->speech transition.
   *
   * @param worker - a UnifiedInferenceWorker (already init()-ed by caller).
   * @param config - { modelUrl, backend?, sampleRate?, threshold?,
   *                   preSpeechBufferChunks? }.
   */
  constructor(worker, config) {
    this._isLoaded = false;
    // Inference queue — serializes process() calls into the worker.
    this.inferenceQueue = Promise.resolve();
    // Pre-speech buffer — rolling window of recent silent chunks, released
    // to the caller when speech starts so its onset isn't clipped.
    this.preSpeechBuffer = [];
    this.wasSpeaking = false;
    this.worker = worker;
    const sr = config.sampleRate ?? 16e3;
    this.config = {
      modelUrl: config.modelUrl,
      backend: config.backend ?? "wasm",
      sampleRate: sr,
      threshold: config.threshold ?? 0.5,
      preSpeechBufferChunks: config.preSpeechBufferChunks ?? 10
    };
    // Silero chunk/context sizes: 512/64 samples at 16 kHz, 256/32 otherwise.
    this.chunkSize = sr === 16e3 ? 512 : 256;
    this.contextSize = sr === 16e3 ? 64 : 32;
    // Recurrent model state, shape flattened from (2, 1, 128).
    this.state = new Float32Array(2 * 1 * 128);
    // Tail of the previous chunk, prepended by the model as context.
    this.context = new Float32Array(this.contextSize);
  }
  /** True once load() has succeeded. */
  get isLoaded() {
    return this._isLoaded;
  }
  /** Backend identifier ("wasm") when loaded, otherwise null. */
  get backend() {
    return this._isLoaded ? "wasm" : null;
  }
  /** Configured sample rate in Hz. */
  get sampleRate() {
    return this.config.sampleRate;
  }
  /** Speech probability threshold (exclusive: probability must exceed it). */
  get threshold() {
    return this.config.threshold;
  }
  /** Required samples per process() call. */
  getChunkSize() {
    return this.chunkSize;
  }
  /** Duration of one chunk in milliseconds. */
  getChunkDurationMs() {
    return this.chunkSize / this.config.sampleRate * 1e3;
  }
  /**
   * Load the VAD model in the shared worker.
   * @returns the worker's load result (backend, loadTimeMs, chunkSize, ...).
   */
  async load() {
    const telemetry = getTelemetry();
    const span = telemetry?.startSpan("SileroVADUnifiedAdapter.load", {
      "model.url": this.config.modelUrl
    });
    try {
      const result = await this.worker.loadVAD({
        modelUrl: this.config.modelUrl,
        sampleRate: this.config.sampleRate
      });
      this._isLoaded = true;
      logger6.info("SileroVAD loaded via unified worker", {
        backend: "wasm",
        loadTimeMs: Math.round(result.loadTimeMs),
        sampleRate: this.config.sampleRate,
        chunkSize: this.chunkSize
      });
      span?.setAttributes({ "model.backend": "wasm", "model.load_time_ms": result.loadTimeMs });
      span?.end();
      telemetry?.recordHistogram("omote.model.load_time", result.loadTimeMs, {
        model: "silero-vad-unified",
        backend: "wasm"
      });
      return result;
    } catch (error) {
      span?.endWithError(error instanceof Error ? error : new Error(String(error)));
      throw error;
    }
  }
  /**
   * Process exactly one chunk of audio (see getChunkSize()).
   * Calls are queued so state updates stay strictly ordered.
   * @param audioChunk - Float32Array-like of exactly chunkSize samples.
   * @returns { probability, isSpeech, inferenceTimeMs, preSpeechChunks } —
   *   preSpeechChunks is only defined on the silence->speech transition.
   * @throws if not loaded or the chunk has the wrong length.
   */
  async process(audioChunk) {
    if (!this._isLoaded) throw new Error("Model not loaded. Call load() first.");
    if (audioChunk.length !== this.chunkSize) {
      throw new Error(
        `Audio chunk must be exactly ${this.chunkSize} samples (got ${audioChunk.length}). Use getChunkSize() to get required size.`
      );
    }
    // Copy so the caller may reuse its buffer while this call is queued.
    const audioChunkCopy = new Float32Array(audioChunk);
    return new Promise((resolve, reject) => {
      this.inferenceQueue = this.inferenceQueue.then(async () => {
        try {
          const startTime = performance.now();
          const result = await this.worker.processVAD(audioChunkCopy, this.state, this.context);
          // Carry the recurrent state forward; next call's context is the
          // tail of THIS chunk.
          this.state = result.state;
          this.context = audioChunkCopy.slice(-this.contextSize);
          const inferenceTimeMs = performance.now() - startTime;
          const isSpeech = result.probability > this.config.threshold;
          let preSpeechChunks;
          if (isSpeech && !this.wasSpeaking) {
            // Silence -> speech: hand the buffered lead-in to the caller.
            preSpeechChunks = [...this.preSpeechBuffer];
            this.preSpeechBuffer = [];
          } else if (!isSpeech && !this.wasSpeaking) {
            // Ongoing silence: keep a rolling window of recent chunks.
            this.preSpeechBuffer.push(new Float32Array(audioChunkCopy));
            if (this.preSpeechBuffer.length > this.config.preSpeechBufferChunks) {
              this.preSpeechBuffer.shift();
            }
          } else if (!isSpeech && this.wasSpeaking) {
            // Speech -> silence: drop stale buffer.
            this.preSpeechBuffer = [];
          }
          this.wasSpeaking = isSpeech;
          resolve({
            probability: result.probability,
            isSpeech,
            inferenceTimeMs,
            preSpeechChunks
          });
        } catch (err) {
          reject(err);
        }
      });
    });
  }
  /**
   * Reset recurrent state (worker-side and local), the context window,
   * and the pre-speech buffer.
   * @throws if not loaded.
   */
  async reset() {
    if (!this._isLoaded) throw new Error("Model not loaded. Call load() first.");
    const newState = await this.worker.resetVAD();
    this.state = newState;
    this.context = new Float32Array(this.contextSize);
    this.preSpeechBuffer = [];
    this.wasSpeaking = false;
  }
  /**
   * Release the worker-side model if loaded and clear all local state.
   * Safe to call repeatedly.
   */
  async dispose() {
    if (this._isLoaded) {
      await this.worker.disposeVAD();
      this._isLoaded = false;
    }
    this.state = new Float32Array(2 * 1 * 128);
    this.context = new Float32Array(this.contextSize);
    this.preSpeechBuffer = [];
    this.wasSpeaking = false;
  }
};
|
|
6423
|
+
|
|
6424
|
+
// src/inference/createSenseVoice.ts
var logger7 = createLogger("createSenseVoice");
/**
 * Factory for a SenseVoice ASR implementation.
 *
 * Selection order:
 *  1. config.unifiedWorker set  -> SenseVoiceUnifiedAdapter (shared worker).
 *  2. config.useWorker === true -> SenseVoiceWorker (throws if Workers unsupported).
 *  3. config.useWorker === false -> SenseVoiceInference (main thread).
 *  4. "auto" (default): SenseVoiceWorker when Workers exist and not on iOS,
 *     otherwise SenseVoiceInference.
 *
 * @param config - { modelUrl, tokensUrl?, language?, textNorm?,
 *                   unifiedWorker?, useWorker? }
 * @returns a SenseVoice implementation (not yet loaded).
 * @throws Error when useWorker === true but Web Workers are unavailable.
 */
function createSenseVoice(config) {
  // Every backend takes the same four model options; build the object once
  // instead of repeating it in each branch (previously duplicated 5x).
  const modelConfig = {
    modelUrl: config.modelUrl,
    tokensUrl: config.tokensUrl,
    language: config.language,
    textNorm: config.textNorm
  };
  if (config.unifiedWorker) {
    logger7.info("Creating SenseVoiceUnifiedAdapter (shared unified worker)");
    return new SenseVoiceUnifiedAdapter(config.unifiedWorker, modelConfig);
  }
  const useWorker = config.useWorker ?? "auto";
  if (useWorker === true) {
    if (!SenseVoiceWorker.isSupported()) {
      throw new Error("Web Workers are not supported in this environment");
    }
    logger7.info("Creating SenseVoiceWorker (off-main-thread)");
    return new SenseVoiceWorker(modelConfig);
  }
  if (useWorker === false) {
    logger7.info("Creating SenseVoiceInference (main thread)");
    return new SenseVoiceInference(modelConfig);
  }
  // "auto": prefer a dedicated worker except on iOS, where the main-thread
  // path is used (log reason below cites a shared ORT instance).
  if (SenseVoiceWorker.isSupported() && !isIOS()) {
    logger7.info("Auto-detected: creating SenseVoiceWorker (off-main-thread)");
    return new SenseVoiceWorker(modelConfig);
  }
  logger7.info("Auto-detected: creating SenseVoiceInference (main thread)", {
    reason: isIOS() ? "iOS (shared ORT instance)" : "Worker unsupported"
  });
  return new SenseVoiceInference(modelConfig);
}
|
|
6477
|
+
|
|
6478
|
+
// src/inference/Wav2ArkitCpuInference.ts
|
|
6479
|
+
var logger8 = createLogger("Wav2ArkitCpu");
|
|
6480
|
+
var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
|
|
6481
|
+
constructor(config) {
|
|
6482
|
+
this.modelId = "wav2arkit_cpu";
|
|
6483
|
+
this.session = null;
|
|
6484
|
+
this.ort = null;
|
|
6485
|
+
this._backend = "wasm";
|
|
6486
|
+
this.isLoading = false;
|
|
6487
|
+
// Inference queue for handling concurrent calls
|
|
6488
|
+
this.inferenceQueue = Promise.resolve();
|
|
6489
|
+
// Session health: set to true if session.run() times out.
|
|
6490
|
+
// A timed-out session may have a zombie WASM dispatch still running,
|
|
6491
|
+
// so all future infer() calls reject immediately to prevent concurrent access.
|
|
6492
|
+
this.poisoned = false;
|
|
6493
|
+
this.config = config;
|
|
6494
|
+
}
|
|
6495
|
+
get backend() {
|
|
6496
|
+
return this.session ? this._backend : null;
|
|
6497
|
+
}
|
|
6498
|
+
get isLoaded() {
|
|
6499
|
+
return this.session !== null;
|
|
6500
|
+
}
|
|
6501
|
+
/**
|
|
6502
|
+
* Load the ONNX model
|
|
6503
|
+
*/
|
|
6504
|
+
async load() {
|
|
6505
|
+
if (this.isLoading) {
|
|
6506
|
+
throw new Error("Model is already loading");
|
|
6507
|
+
}
|
|
6508
|
+
if (this.session) {
|
|
6509
|
+
throw new Error("Model already loaded. Call dispose() first.");
|
|
6510
|
+
}
|
|
6511
|
+
this.isLoading = true;
|
|
6512
|
+
const startTime = performance.now();
|
|
6513
|
+
const telemetry = getTelemetry();
|
|
6514
|
+
const span = telemetry?.startSpan("Wav2ArkitCpu.load", {
|
|
6515
|
+
"model.url": this.config.modelUrl,
|
|
6516
|
+
"model.backend_requested": this.config.backend || "wasm"
|
|
6517
|
+
});
|
|
6518
|
+
try {
|
|
6519
|
+
const preference = this.config.backend || "wasm";
|
|
6520
|
+
logger8.info("Loading ONNX Runtime...", { preference });
|
|
6521
|
+
const { ort, backend } = await getOnnxRuntimeForPreference(preference);
|
|
6522
|
+
this.ort = ort;
|
|
6523
|
+
this._backend = backend;
|
|
6524
|
+
logger8.info("ONNX Runtime loaded", { backend: this._backend });
|
|
6525
|
+
const modelUrl = this.config.modelUrl;
|
|
6526
|
+
const dataUrl = this.config.externalDataUrl !== false ? this.config.externalDataUrl || `${modelUrl}.data` : null;
|
|
6527
|
+
const sessionOptions = getSessionOptions(this._backend);
|
|
6528
|
+
if (isIOS()) {
|
|
6529
|
+
logger8.info("iOS: passing model URLs directly to ORT (low-memory path)", {
|
|
6530
|
+
modelUrl,
|
|
6531
|
+
dataUrl
|
|
6532
|
+
});
|
|
6533
|
+
if (dataUrl) {
|
|
6534
|
+
const dataFilename = dataUrl.split("/").pop();
|
|
6535
|
+
sessionOptions.externalData = [{
|
|
6536
|
+
path: dataFilename,
|
|
6537
|
+
data: dataUrl
|
|
6538
|
+
// URL string — ORT fetches directly into WASM
|
|
6539
|
+
}];
|
|
6540
|
+
}
|
|
6541
|
+
this.session = await this.ort.InferenceSession.create(modelUrl, sessionOptions);
|
|
6542
|
+
} else {
|
|
6543
|
+
const cache = getModelCache();
|
|
6544
|
+
const isCached = await cache.has(modelUrl);
|
|
6545
|
+
let modelBuffer;
|
|
6546
|
+
if (isCached) {
|
|
6547
|
+
logger8.debug("Loading model from cache", { modelUrl });
|
|
6548
|
+
modelBuffer = await cache.get(modelUrl);
|
|
6549
|
+
if (!modelBuffer) {
|
|
6550
|
+
logger8.warn("Cache corruption detected, clearing and retrying", { modelUrl });
|
|
6551
|
+
await cache.delete(modelUrl);
|
|
6552
|
+
modelBuffer = await fetchWithCache(modelUrl);
|
|
6553
|
+
}
|
|
6554
|
+
} else {
|
|
6555
|
+
logger8.debug("Fetching and caching model graph", { modelUrl });
|
|
6556
|
+
modelBuffer = await fetchWithCache(modelUrl);
|
|
6557
|
+
}
|
|
6558
|
+
if (!modelBuffer) {
|
|
6559
|
+
throw new Error(`Failed to load model: ${modelUrl}`);
|
|
6560
|
+
}
|
|
6561
|
+
let externalDataBuffer = null;
|
|
6562
|
+
if (dataUrl) {
|
|
6563
|
+
try {
|
|
6564
|
+
const isDataCached = await cache.has(dataUrl);
|
|
6565
|
+
if (isDataCached) {
|
|
6566
|
+
logger8.debug("Loading external data from cache", { dataUrl });
|
|
6567
|
+
externalDataBuffer = await cache.get(dataUrl);
|
|
6568
|
+
if (!externalDataBuffer) {
|
|
6569
|
+
logger8.warn("Cache corruption for external data, retrying", { dataUrl });
|
|
6570
|
+
await cache.delete(dataUrl);
|
|
6571
|
+
externalDataBuffer = await fetchWithCache(dataUrl);
|
|
6572
|
+
}
|
|
6573
|
+
} else {
|
|
6574
|
+
logger8.info("Fetching external model data", {
|
|
6575
|
+
dataUrl,
|
|
6576
|
+
note: "This may be a large download (400MB+)"
|
|
6577
|
+
});
|
|
6578
|
+
externalDataBuffer = await fetchWithCache(dataUrl);
|
|
6579
|
+
}
|
|
6580
|
+
logger8.info("External data loaded", {
|
|
6581
|
+
size: formatBytes(externalDataBuffer.byteLength)
|
|
6582
|
+
});
|
|
6583
|
+
} catch (err) {
|
|
6584
|
+
logger8.debug("No external data file found (single-file model)", {
|
|
6585
|
+
dataUrl,
|
|
6586
|
+
error: err.message
|
|
6587
|
+
});
|
|
6588
|
+
}
|
|
6589
|
+
}
|
|
6590
|
+
logger8.debug("Creating ONNX session", {
|
|
6591
|
+
graphSize: formatBytes(modelBuffer.byteLength),
|
|
6592
|
+
externalDataSize: externalDataBuffer ? formatBytes(externalDataBuffer.byteLength) : "none",
|
|
6593
|
+
backend: this._backend
|
|
6594
|
+
});
|
|
6595
|
+
if (externalDataBuffer) {
|
|
6596
|
+
const dataFilename = dataUrl.split("/").pop();
|
|
6597
|
+
sessionOptions.externalData = [{
|
|
6598
|
+
path: dataFilename,
|
|
6599
|
+
data: new Uint8Array(externalDataBuffer)
|
|
6600
|
+
}];
|
|
6601
|
+
}
|
|
6602
|
+
const modelData = new Uint8Array(modelBuffer);
|
|
6603
|
+
this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
|
|
6604
|
+
}
|
|
6605
|
+
const loadTimeMs = performance.now() - startTime;
|
|
6606
|
+
logger8.info("Model loaded successfully", {
|
|
6607
|
+
backend: this._backend,
|
|
6608
|
+
loadTimeMs: Math.round(loadTimeMs),
|
|
6609
|
+
inputs: this.session.inputNames,
|
|
6610
|
+
outputs: this.session.outputNames
|
|
6611
|
+
});
|
|
6612
|
+
span?.setAttributes({
|
|
6613
|
+
"model.backend": this._backend,
|
|
6614
|
+
"model.load_time_ms": loadTimeMs,
|
|
6615
|
+
"model.cached": !isIOS()
|
|
6616
|
+
});
|
|
6617
|
+
span?.end();
|
|
6618
|
+
telemetry?.recordHistogram("omote.model.load_time", loadTimeMs, {
|
|
6619
|
+
model: "wav2arkit_cpu",
|
|
6620
|
+
backend: this._backend
|
|
6621
|
+
});
|
|
6622
|
+
logger8.debug("Running warmup inference");
|
|
6623
|
+
const warmupStart = performance.now();
|
|
6624
|
+
const silentAudio = new Float32Array(16e3);
|
|
6625
|
+
await this.infer(silentAudio);
|
|
6626
|
+
const warmupTimeMs = performance.now() - warmupStart;
|
|
6627
|
+
logger8.info("Warmup inference complete", {
|
|
6628
|
+
warmupTimeMs: Math.round(warmupTimeMs),
|
|
6629
|
+
backend: this._backend
|
|
6630
|
+
});
|
|
6631
|
+
telemetry?.recordHistogram("omote.model.warmup_time", warmupTimeMs, {
|
|
6632
|
+
model: "wav2arkit_cpu",
|
|
6633
|
+
backend: this._backend
|
|
6634
|
+
});
|
|
6635
|
+
return {
|
|
6636
|
+
backend: this._backend,
|
|
6637
|
+
loadTimeMs,
|
|
6638
|
+
inputNames: [...this.session.inputNames],
|
|
6639
|
+
outputNames: [...this.session.outputNames]
|
|
6640
|
+
};
|
|
6641
|
+
} catch (error) {
|
|
6642
|
+
span?.endWithError(error instanceof Error ? error : new Error(String(error)));
|
|
6643
|
+
telemetry?.incrementCounter("omote.errors.total", 1, {
|
|
6644
|
+
model: "wav2arkit_cpu",
|
|
6645
|
+
error_type: "load_failed"
|
|
6646
|
+
});
|
|
6647
|
+
throw error;
|
|
6648
|
+
} finally {
|
|
6649
|
+
this.isLoading = false;
|
|
6650
|
+
}
|
|
6651
|
+
}
|
|
6652
|
+
/**
|
|
6653
|
+
* Run inference on raw audio
|
|
6654
|
+
*
|
|
6655
|
+
* Accepts variable-length audio (not fixed to 16000 samples).
|
|
6656
|
+
* Output frames = ceil(30 * numSamples / 16000).
|
|
6657
|
+
*
|
|
6658
|
+
* @param audioSamples - Float32Array of raw audio at 16kHz
|
|
6659
|
+
* @param _identityIndex - Ignored (identity 11 is baked into the model)
|
|
6660
|
+
*/
|
|
6661
|
+
async infer(audioSamples, _identityIndex) {
|
|
6662
|
+
if (!this.session) {
|
|
6663
|
+
throw new Error("Model not loaded. Call load() first.");
|
|
6664
|
+
}
|
|
6665
|
+
if (this.poisoned) {
|
|
6666
|
+
throw new Error("Wav2ArkitCpu session timed out \u2014 inference unavailable until page reload");
|
|
6667
|
+
}
|
|
6668
|
+
const audioCopy = new Float32Array(audioSamples);
|
|
6669
|
+
const feeds = {
|
|
6670
|
+
"audio_waveform": new this.ort.Tensor("float32", audioCopy, [1, audioCopy.length])
|
|
6671
|
+
};
|
|
6672
|
+
return this.queueInference(feeds, audioCopy.length);
|
|
6673
|
+
}
|
|
6674
|
+
/**
|
|
6675
|
+
* Queue inference to serialize ONNX session calls
|
|
6676
|
+
*/
|
|
6677
|
+
queueInference(feeds, inputSamples) {
|
|
6678
|
+
return new Promise((resolve, reject) => {
|
|
6679
|
+
this.inferenceQueue = this.inferenceQueue.then(async () => {
|
|
6680
|
+
const telemetry = getTelemetry();
|
|
6681
|
+
const span = telemetry?.startSpan("Wav2ArkitCpu.infer", {
|
|
6682
|
+
"inference.backend": this._backend,
|
|
6683
|
+
"inference.input_samples": inputSamples
|
|
6684
|
+
});
|
|
6685
|
+
try {
|
|
6686
|
+
const startTime = performance.now();
|
|
6687
|
+
let timeoutId;
|
|
6688
|
+
const results = await Promise.race([
|
|
6689
|
+
this.session.run(feeds).then((r) => {
|
|
6690
|
+
clearTimeout(timeoutId);
|
|
6691
|
+
return r;
|
|
6692
|
+
}),
|
|
6693
|
+
new Promise((_, rej) => {
|
|
6694
|
+
timeoutId = setTimeout(
|
|
6695
|
+
() => rej(new Error(`Wav2ArkitCpu inference timed out after ${_Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS}ms`)),
|
|
6696
|
+
_Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS
|
|
6697
|
+
);
|
|
6698
|
+
})
|
|
6699
|
+
]);
|
|
6700
|
+
const inferenceTimeMs = performance.now() - startTime;
|
|
6701
|
+
const blendshapeOutput = results["blendshapes"];
|
|
6702
|
+
if (!blendshapeOutput) {
|
|
6703
|
+
throw new Error("Missing blendshapes output from model");
|
|
6704
|
+
}
|
|
6705
|
+
const blendshapeData = blendshapeOutput.data;
|
|
6706
|
+
const numFrames = blendshapeOutput.dims[1];
|
|
6707
|
+
const numBlendshapes = blendshapeOutput.dims[2];
|
|
6708
|
+
const blendshapes = [];
|
|
6709
|
+
for (let f = 0; f < numFrames; f++) {
|
|
6710
|
+
const rawFrame = blendshapeData.slice(f * numBlendshapes, (f + 1) * numBlendshapes);
|
|
6711
|
+
const symmetrized = symmetrizeBlendshapes(rawFrame);
|
|
6712
|
+
blendshapes.push(symmetrized);
|
|
6713
|
+
}
|
|
6714
|
+
logger8.trace("Inference completed", {
|
|
6715
|
+
inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
|
|
6716
|
+
numFrames,
|
|
6717
|
+
inputSamples
|
|
6718
|
+
});
|
|
6719
|
+
span?.setAttributes({
|
|
6720
|
+
"inference.duration_ms": inferenceTimeMs,
|
|
6721
|
+
"inference.frames": numFrames
|
|
6722
|
+
});
|
|
6723
|
+
span?.end();
|
|
6724
|
+
telemetry?.recordHistogram("omote.inference.latency", inferenceTimeMs, {
|
|
6725
|
+
model: "wav2arkit_cpu",
|
|
6726
|
+
backend: this._backend
|
|
6727
|
+
});
|
|
6728
|
+
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
6729
|
+
model: "wav2arkit_cpu",
|
|
6730
|
+
backend: this._backend,
|
|
6731
|
+
status: "success"
|
|
6732
|
+
});
|
|
6733
|
+
resolve({
|
|
6734
|
+
blendshapes,
|
|
6735
|
+
numFrames,
|
|
6736
|
+
inferenceTimeMs
|
|
6737
|
+
});
|
|
6738
|
+
} catch (err) {
|
|
6739
|
+
const errMsg = err instanceof Error ? err.message : String(err);
|
|
6740
|
+
if (errMsg.includes("timed out")) {
|
|
6741
|
+
this.poisoned = true;
|
|
6742
|
+
logger8.error("CRITICAL: Inference session timed out \u2014 Wav2ArkitCpu is dead. Page reload required.", {
|
|
6743
|
+
backend: this._backend,
|
|
6744
|
+
timeoutMs: _Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS
|
|
6745
|
+
});
|
|
6746
|
+
} else if (typeof err === "number") {
|
|
6747
|
+
const oomError = new Error(
|
|
6748
|
+
`Wav2ArkitCpu inference failed with raw C++ exception pointer (0x${err.toString(16)}). This is likely an OOM crash in WASM. Try reloading the page.`
|
|
6749
|
+
);
|
|
6750
|
+
logger8.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
|
|
6751
|
+
pointer: `0x${err.toString(16)}`,
|
|
6752
|
+
backend: this._backend
|
|
6753
|
+
});
|
|
6754
|
+
span?.endWithError(oomError);
|
|
6755
|
+
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
6756
|
+
model: "wav2arkit_cpu",
|
|
6757
|
+
backend: this._backend,
|
|
6758
|
+
status: "error"
|
|
6759
|
+
});
|
|
6760
|
+
reject(oomError);
|
|
6761
|
+
return;
|
|
6762
|
+
} else {
|
|
6763
|
+
logger8.error("Inference failed", { error: errMsg, backend: this._backend });
|
|
6764
|
+
}
|
|
6765
|
+
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
6766
|
+
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
6767
|
+
model: "wav2arkit_cpu",
|
|
6768
|
+
backend: this._backend,
|
|
6769
|
+
status: "error"
|
|
6770
|
+
});
|
|
6771
|
+
reject(err);
|
|
6772
|
+
}
|
|
6773
|
+
});
|
|
6774
|
+
});
|
|
6775
|
+
}
|
|
6776
|
+
/**
|
|
6777
|
+
* Dispose of the model and free resources
|
|
6778
|
+
*/
|
|
6779
|
+
async dispose() {
|
|
6780
|
+
if (this.session) {
|
|
6781
|
+
await this.session.release();
|
|
6782
|
+
this.session = null;
|
|
6783
|
+
}
|
|
6784
|
+
}
|
|
6785
|
+
};
|
|
6786
|
+
_Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS = 5e3;
|
|
6787
|
+
var Wav2ArkitCpuInference = _Wav2ArkitCpuInference;
|
|
6788
|
+
|
|
6789
|
+
// src/inference/Wav2ArkitCpuWorker.ts
|
|
6790
|
+
var logger9 = createLogger("Wav2ArkitCpuWorker");
|
|
6791
|
+
var WASM_CDN_PATH4 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
|
|
6792
|
+
var LOAD_TIMEOUT_MS2 = 6e4;
|
|
6793
|
+
var INFERENCE_TIMEOUT_MS2 = 5e3;
|
|
6794
|
+
function resolveUrl3(url) {
|
|
6795
|
+
if (/^https?:\/\//i.test(url) || /^blob:/i.test(url)) return url;
|
|
6796
|
+
try {
|
|
6797
|
+
return new URL(url, globalThis.location?.origin ?? "https://localhost").href;
|
|
6798
|
+
} catch {
|
|
6799
|
+
return url;
|
|
6800
|
+
}
|
|
6801
|
+
}
|
|
6802
|
+
var WORKER_SCRIPT3 = `
|
|
6803
|
+
// Wav2ArkitCpu Worker Script
|
|
6804
|
+
// Loaded via Blob URL - no separate file needed
|
|
6805
|
+
|
|
6806
|
+
var ort = null;
|
|
6807
|
+
var session = null;
|
|
6808
|
+
|
|
6809
|
+
// Precomputed symmetric index pairs from LAM_BLENDSHAPES alphabetical ordering
|
|
6810
|
+
// Used to average left/right blendshape pairs for symmetrized output
|
|
6811
|
+
const SYMMETRIC_INDEX_PAIRS = [
|
|
6812
|
+
[23, 25], // jawLeft, jawRight
|
|
6813
|
+
[32, 38], // mouthLeft, mouthRight
|
|
6814
|
+
[43, 44], // mouthSmileLeft, mouthSmileRight
|
|
6815
|
+
[29, 30], // mouthFrownLeft, mouthFrownRight
|
|
6816
|
+
[27, 28], // mouthDimpleLeft, mouthDimpleRight
|
|
6817
|
+
[45, 46], // mouthStretchLeft, mouthStretchRight
|
|
6818
|
+
[35, 36], // mouthPressLeft, mouthPressRight
|
|
6819
|
+
[47, 48], // mouthUpperUpLeft, mouthUpperUpRight
|
|
6820
|
+
[33, 34], // mouthLowerDownLeft, mouthLowerDownRight
|
|
6821
|
+
[49, 50], // noseSneerLeft, noseSneerRight
|
|
6822
|
+
[6, 7], // cheekSquintLeft, cheekSquintRight
|
|
6823
|
+
[0, 1], // browDownLeft, browDownRight
|
|
6824
|
+
[3, 4], // browOuterUpLeft, browOuterUpRight
|
|
6825
|
+
[8, 9], // eyeBlinkLeft, eyeBlinkRight
|
|
6826
|
+
[16, 17], // eyeLookUpLeft, eyeLookUpRight
|
|
6827
|
+
[10, 11], // eyeLookDownLeft, eyeLookDownRight
|
|
6828
|
+
[12, 13], // eyeLookInLeft, eyeLookInRight
|
|
6829
|
+
[14, 15], // eyeLookOutLeft, eyeLookOutRight
|
|
6830
|
+
[18, 19], // eyeSquintLeft, eyeSquintRight
|
|
6831
|
+
[20, 21], // eyeWideLeft, eyeWideRight
|
|
6832
|
+
];
|
|
6833
|
+
|
|
6834
|
+
/**
|
|
6835
|
+
* Symmetrize blendshapes by averaging left/right pairs
|
|
6836
|
+
* Inlined from blendshapeUtils.ts for worker context
|
|
6837
|
+
*/
|
|
6838
|
+
function symmetrizeBlendshapes(frame) {
|
|
6839
|
+
const result = new Float32Array(frame);
|
|
6840
|
+
for (const [lIdx, rIdx] of SYMMETRIC_INDEX_PAIRS) {
|
|
6841
|
+
const avg = (frame[lIdx] + frame[rIdx]) / 2;
|
|
6842
|
+
result[lIdx] = avg;
|
|
6843
|
+
result[rIdx] = avg;
|
|
6844
|
+
}
|
|
6845
|
+
return result;
|
|
6846
|
+
}
|
|
6847
|
+
|
|
6848
|
+
/**
|
|
6849
|
+
* Load ONNX Runtime from CDN
|
|
6850
|
+
*/
|
|
6851
|
+
async function loadOrt(wasmPaths) {
|
|
6852
|
+
if (ort) return;
|
|
6853
|
+
|
|
6854
|
+
// Import ONNX Runtime from CDN
|
|
6855
|
+
const ortUrl = wasmPaths + 'ort.wasm.min.js';
|
|
6856
|
+
|
|
6857
|
+
// Load the script by fetching and executing it
|
|
6858
|
+
const response = await fetch(ortUrl);
|
|
6859
|
+
const scriptText = await response.text();
|
|
6860
|
+
|
|
6861
|
+
// Create a blob URL for the script
|
|
6862
|
+
const blob = new Blob([scriptText], { type: 'application/javascript' });
|
|
6863
|
+
const blobUrl = URL.createObjectURL(blob);
|
|
6864
|
+
|
|
6865
|
+
// Import the module
|
|
6866
|
+
importScripts(blobUrl);
|
|
6867
|
+
URL.revokeObjectURL(blobUrl);
|
|
6868
|
+
|
|
6869
|
+
// ort is now available as global
|
|
6870
|
+
ort = self.ort;
|
|
6871
|
+
|
|
6872
|
+
// Configure WASM settings
|
|
6873
|
+
ort.env.wasm.wasmPaths = wasmPaths;
|
|
6874
|
+
ort.env.wasm.numThreads = 1; // Single thread in worker
|
|
6875
|
+
ort.env.wasm.simd = true;
|
|
6876
|
+
ort.env.wasm.proxy = false; // No proxy in worker
|
|
6877
|
+
}
|
|
6878
|
+
|
|
6879
|
+
/**
|
|
6880
|
+
* Load the wav2arkit_cpu model
|
|
6881
|
+
*/
|
|
6882
|
+
async function loadModel(modelUrl, externalDataUrl, isIOS) {
|
|
6883
|
+
const sessionOptions = {
|
|
6884
|
+
executionProviders: ['wasm'],
|
|
6885
|
+
graphOptimizationLevel: 'all',
|
|
6886
|
+
};
|
|
6887
|
+
|
|
6888
|
+
const dataFilename = externalDataUrl ? externalDataUrl.split('/').pop() : null;
|
|
6889
|
+
|
|
6890
|
+
if (isIOS) {
|
|
6891
|
+
// iOS: Pass URLs directly to ORT to avoid loading 402MB into JS heap.
|
|
6892
|
+
// ORT fetches externally into WASM memory, cutting peak JS memory from
|
|
6893
|
+
// ~800MB to ~2MB (just the graph).
|
|
6894
|
+
if (externalDataUrl && dataFilename) {
|
|
6895
|
+
sessionOptions.externalData = [{ path: dataFilename, data: externalDataUrl }];
|
|
6896
|
+
}
|
|
6897
|
+
session = await ort.InferenceSession.create(modelUrl, sessionOptions);
|
|
6898
|
+
} else {
|
|
6899
|
+
// Desktop: fetch model graph as ArrayBuffer
|
|
6900
|
+
const graphResponse = await fetch(modelUrl);
|
|
6901
|
+
if (!graphResponse.ok) {
|
|
6902
|
+
throw new Error('Failed to fetch model graph: ' + graphResponse.status + ' ' + graphResponse.statusText);
|
|
6903
|
+
}
|
|
6904
|
+
const graphBuffer = await graphResponse.arrayBuffer();
|
|
6905
|
+
|
|
6906
|
+
// Fetch external data file if present
|
|
6907
|
+
if (externalDataUrl && dataFilename) {
|
|
6908
|
+
const dataResponse = await fetch(externalDataUrl);
|
|
6909
|
+
if (!dataResponse.ok) {
|
|
6910
|
+
throw new Error('Failed to fetch external data: ' + dataResponse.status + ' ' + dataResponse.statusText);
|
|
6911
|
+
}
|
|
6912
|
+
const dataBuffer = await dataResponse.arrayBuffer();
|
|
6913
|
+
sessionOptions.externalData = [{ path: dataFilename, data: new Uint8Array(dataBuffer) }];
|
|
6914
|
+
}
|
|
6915
|
+
|
|
6916
|
+
session = await ort.InferenceSession.create(new Uint8Array(graphBuffer), sessionOptions);
|
|
6917
|
+
}
|
|
6918
|
+
|
|
6919
|
+
// Warmup inference with 16000 silent samples
|
|
6920
|
+
const warmupAudio = new Float32Array(16000);
|
|
6921
|
+
const warmupTensor = new ort.Tensor('float32', warmupAudio, [1, warmupAudio.length]);
|
|
6922
|
+
await session.run({ audio_waveform: warmupTensor });
|
|
6923
|
+
|
|
6924
|
+
return {
|
|
6925
|
+
inputNames: session.inputNames.slice(),
|
|
6926
|
+
outputNames: session.outputNames.slice(),
|
|
6927
|
+
};
|
|
6928
|
+
}
|
|
6929
|
+
|
|
6930
|
+
/**
|
|
6931
|
+
* Run lip sync inference
|
|
6932
|
+
*/
|
|
6933
|
+
async function runInference(audio) {
|
|
6934
|
+
const tensor = new ort.Tensor('float32', audio, [1, audio.length]);
|
|
6935
|
+
const results = await session.run({ audio_waveform: tensor });
|
|
6936
|
+
|
|
6937
|
+
const blendshapeOutput = results['blendshapes'];
|
|
6938
|
+
if (!blendshapeOutput) {
|
|
6939
|
+
throw new Error('Missing blendshapes output from model');
|
|
6940
|
+
}
|
|
6941
|
+
|
|
6942
|
+
const blendshapeData = blendshapeOutput.data;
|
|
6943
|
+
const numFrames = blendshapeOutput.dims[1];
|
|
6944
|
+
const numBlendshapes = blendshapeOutput.dims[2];
|
|
6945
|
+
|
|
6946
|
+
// Symmetrize each frame and flatten into a single Float32Array for transfer
|
|
6947
|
+
const flatBuffer = new Float32Array(numFrames * numBlendshapes);
|
|
6948
|
+
for (let f = 0; f < numFrames; f++) {
|
|
6949
|
+
const offset = f * numBlendshapes;
|
|
6950
|
+
const rawFrame = blendshapeData.slice(offset, offset + numBlendshapes);
|
|
6951
|
+
const symmetrized = symmetrizeBlendshapes(rawFrame);
|
|
6952
|
+
flatBuffer.set(symmetrized, offset);
|
|
6953
|
+
}
|
|
6954
|
+
|
|
6955
|
+
return { flatBuffer, numFrames, numBlendshapes };
|
|
6956
|
+
}
|
|
6957
|
+
|
|
6958
|
+
// Message handler
|
|
6959
|
+
self.onmessage = async function(e) {
|
|
6960
|
+
const msg = e.data;
|
|
6961
|
+
|
|
6962
|
+
try {
|
|
6963
|
+
switch (msg.type) {
|
|
6964
|
+
case 'load': {
|
|
6965
|
+
const startTime = performance.now();
|
|
6966
|
+
await loadOrt(msg.wasmPaths);
|
|
6967
|
+
const { inputNames, outputNames } = await loadModel(msg.modelUrl, msg.externalDataUrl, msg.isIOS);
|
|
6968
|
+
const loadTimeMs = performance.now() - startTime;
|
|
6969
|
+
|
|
6970
|
+
self.postMessage({
|
|
6971
|
+
type: 'loaded',
|
|
6972
|
+
inputNames,
|
|
6973
|
+
outputNames,
|
|
6974
|
+
loadTimeMs,
|
|
6975
|
+
});
|
|
6976
|
+
break;
|
|
6977
|
+
}
|
|
6978
|
+
|
|
6979
|
+
case 'infer': {
|
|
6980
|
+
const startTime = performance.now();
|
|
6981
|
+
const { flatBuffer, numFrames, numBlendshapes } = await runInference(msg.audio);
|
|
6982
|
+
const inferenceTimeMs = performance.now() - startTime;
|
|
6983
|
+
|
|
6984
|
+
self.postMessage({
|
|
6985
|
+
type: 'result',
|
|
6986
|
+
blendshapes: flatBuffer,
|
|
6987
|
+
numFrames,
|
|
6988
|
+
numBlendshapes,
|
|
6989
|
+
inferenceTimeMs,
|
|
6990
|
+
}, [flatBuffer.buffer]);
|
|
6991
|
+
break;
|
|
6992
|
+
}
|
|
6993
|
+
|
|
6994
|
+
case 'dispose': {
|
|
6995
|
+
if (session) {
|
|
6996
|
+
await session.release();
|
|
6997
|
+
session = null;
|
|
6998
|
+
}
|
|
6999
|
+
ort = null;
|
|
7000
|
+
self.postMessage({ type: 'disposed' });
|
|
7001
|
+
break;
|
|
7002
|
+
}
|
|
7003
|
+
|
|
7004
|
+
default:
|
|
7005
|
+
self.postMessage({
|
|
7006
|
+
type: 'error',
|
|
7007
|
+
error: 'Unknown message type: ' + msg.type,
|
|
7008
|
+
});
|
|
7009
|
+
}
|
|
7010
|
+
} catch (err) {
|
|
7011
|
+
let errorMessage;
|
|
7012
|
+
if (typeof err === 'number') {
|
|
7013
|
+
// ORT WASM throws raw C++ exception pointers as bare numbers
|
|
7014
|
+
errorMessage = 'ORT WASM C++ exception pointer (0x' + err.toString(16) + ') \u2014 likely OOM';
|
|
7015
|
+
} else {
|
|
7016
|
+
errorMessage = err.message || String(err);
|
|
7017
|
+
}
|
|
7018
|
+
self.postMessage({
|
|
7019
|
+
type: 'error',
|
|
7020
|
+
error: errorMessage,
|
|
7021
|
+
});
|
|
7022
|
+
}
|
|
7023
|
+
};
|
|
7024
|
+
|
|
7025
|
+
// Error handler
|
|
7026
|
+
self.onerror = function(err) {
|
|
7027
|
+
self.postMessage({
|
|
7028
|
+
type: 'error',
|
|
7029
|
+
error: 'Worker error: ' + (err.message || String(err)),
|
|
7030
|
+
});
|
|
7031
|
+
};
|
|
7032
|
+
`;
|
|
7033
|
+
var Wav2ArkitCpuWorker = class {
|
|
7034
|
+
constructor(config) {
|
|
7035
|
+
this.modelId = "wav2arkit_cpu";
|
|
7036
|
+
this.worker = null;
|
|
7037
|
+
this.isLoading = false;
|
|
7038
|
+
this._isLoaded = false;
|
|
7039
|
+
// Inference queue for serialization
|
|
7040
|
+
this.inferenceQueue = Promise.resolve();
|
|
7041
|
+
// Session health: set to true if worker inference times out.
|
|
7042
|
+
// A timed-out worker may have a zombie WASM dispatch still running,
|
|
7043
|
+
// so all future infer() calls reject immediately to prevent concurrent access.
|
|
7044
|
+
this.poisoned = false;
|
|
7045
|
+
// Pending message handlers
|
|
7046
|
+
this.pendingResolvers = /* @__PURE__ */ new Map();
|
|
7047
|
+
this.config = config;
|
|
7048
|
+
}
|
|
7049
|
+
get isLoaded() {
|
|
7050
|
+
return this._isLoaded;
|
|
7051
|
+
}
|
|
7052
|
+
/**
|
|
7053
|
+
* Backend type (always 'wasm' for Worker, WebGPU not supported in Workers)
|
|
7054
|
+
*/
|
|
7055
|
+
get backend() {
|
|
7056
|
+
return this._isLoaded ? "wasm" : null;
|
|
4321
7057
|
}
|
|
4322
7058
|
/**
|
|
4323
|
-
*
|
|
7059
|
+
* Create the worker from inline script
|
|
4324
7060
|
*/
|
|
4325
|
-
|
|
7061
|
+
createWorker() {
|
|
7062
|
+
const blob = new Blob([WORKER_SCRIPT3], { type: "application/javascript" });
|
|
7063
|
+
const blobUrl = URL.createObjectURL(blob);
|
|
7064
|
+
const worker = new Worker(blobUrl);
|
|
7065
|
+
URL.revokeObjectURL(blobUrl);
|
|
7066
|
+
worker.onmessage = (event) => {
|
|
7067
|
+
this.handleWorkerMessage(event.data);
|
|
7068
|
+
};
|
|
7069
|
+
worker.onerror = (error) => {
|
|
7070
|
+
logger9.error("Worker error", { error: error.message });
|
|
7071
|
+
for (const [, resolver] of this.pendingResolvers) {
|
|
7072
|
+
resolver.reject(new Error(`Worker error: ${error.message}`));
|
|
7073
|
+
}
|
|
7074
|
+
this.pendingResolvers.clear();
|
|
7075
|
+
};
|
|
7076
|
+
return worker;
|
|
7077
|
+
}
|
|
7078
|
+
/**
|
|
7079
|
+
* Handle messages from worker
|
|
7080
|
+
*/
|
|
7081
|
+
handleWorkerMessage(result) {
|
|
7082
|
+
const resolver = this.pendingResolvers.get(result.type);
|
|
7083
|
+
if (resolver) {
|
|
7084
|
+
this.pendingResolvers.delete(result.type);
|
|
7085
|
+
if (result.type === "error") {
|
|
7086
|
+
resolver.reject(new Error(result.error));
|
|
7087
|
+
} else {
|
|
7088
|
+
resolver.resolve(result);
|
|
7089
|
+
}
|
|
7090
|
+
}
|
|
7091
|
+
}
|
|
7092
|
+
/**
|
|
7093
|
+
* Send message to worker and wait for response
|
|
7094
|
+
*/
|
|
7095
|
+
sendMessage(message, expectedType, timeoutMs) {
|
|
7096
|
+
return new Promise((resolve, reject) => {
|
|
7097
|
+
if (!this.worker) {
|
|
7098
|
+
reject(new Error("Worker not initialized"));
|
|
7099
|
+
return;
|
|
7100
|
+
}
|
|
7101
|
+
const timeoutId = setTimeout(() => {
|
|
7102
|
+
this.pendingResolvers.delete(expectedType);
|
|
7103
|
+
reject(new Error(`Worker operation timed out after ${timeoutMs}ms`));
|
|
7104
|
+
}, timeoutMs);
|
|
7105
|
+
this.pendingResolvers.set(expectedType, {
|
|
7106
|
+
resolve: (value) => {
|
|
7107
|
+
clearTimeout(timeoutId);
|
|
7108
|
+
resolve(value);
|
|
7109
|
+
},
|
|
7110
|
+
reject: (error) => {
|
|
7111
|
+
clearTimeout(timeoutId);
|
|
7112
|
+
reject(error);
|
|
7113
|
+
}
|
|
7114
|
+
});
|
|
7115
|
+
this.pendingResolvers.set("error", {
|
|
7116
|
+
resolve: () => {
|
|
7117
|
+
},
|
|
7118
|
+
// Never called for errors
|
|
7119
|
+
reject: (error) => {
|
|
7120
|
+
clearTimeout(timeoutId);
|
|
7121
|
+
this.pendingResolvers.delete(expectedType);
|
|
7122
|
+
reject(error);
|
|
7123
|
+
}
|
|
7124
|
+
});
|
|
7125
|
+
this.worker.postMessage(message);
|
|
7126
|
+
});
|
|
7127
|
+
}
|
|
7128
|
+
/**
|
|
7129
|
+
* Load the ONNX model in the worker
|
|
7130
|
+
*/
|
|
7131
|
+
async load() {
|
|
7132
|
+
if (this.isLoading) {
|
|
7133
|
+
throw new Error("Model is already loading");
|
|
7134
|
+
}
|
|
7135
|
+
if (this._isLoaded) {
|
|
7136
|
+
throw new Error("Model already loaded. Call dispose() first.");
|
|
7137
|
+
}
|
|
7138
|
+
this.isLoading = true;
|
|
7139
|
+
const startTime = performance.now();
|
|
7140
|
+
const telemetry = getTelemetry();
|
|
7141
|
+
const span = telemetry?.startSpan("Wav2ArkitCpuWorker.load", {
|
|
7142
|
+
"model.url": this.config.modelUrl,
|
|
7143
|
+
"model.backend_requested": "wasm"
|
|
7144
|
+
});
|
|
7145
|
+
try {
|
|
7146
|
+
logger9.info("Creating wav2arkit_cpu worker...");
|
|
7147
|
+
this.worker = this.createWorker();
|
|
7148
|
+
const externalDataUrl = this.config.externalDataUrl !== false ? this.config.externalDataUrl || `${this.config.modelUrl}.data` : null;
|
|
7149
|
+
logger9.info("Loading model in worker...", {
|
|
7150
|
+
modelUrl: this.config.modelUrl,
|
|
7151
|
+
externalDataUrl,
|
|
7152
|
+
isIOS: isIOS()
|
|
7153
|
+
});
|
|
7154
|
+
const result = await this.sendMessage(
|
|
7155
|
+
{
|
|
7156
|
+
type: "load",
|
|
7157
|
+
modelUrl: resolveUrl3(this.config.modelUrl),
|
|
7158
|
+
externalDataUrl: externalDataUrl ? resolveUrl3(externalDataUrl) : null,
|
|
7159
|
+
wasmPaths: WASM_CDN_PATH4,
|
|
7160
|
+
isIOS: isIOS()
|
|
7161
|
+
},
|
|
7162
|
+
"loaded",
|
|
7163
|
+
LOAD_TIMEOUT_MS2
|
|
7164
|
+
);
|
|
7165
|
+
this._isLoaded = true;
|
|
7166
|
+
const loadTimeMs = performance.now() - startTime;
|
|
7167
|
+
logger9.info("Wav2ArkitCpu worker loaded successfully", {
|
|
7168
|
+
backend: "wasm",
|
|
7169
|
+
loadTimeMs: Math.round(loadTimeMs),
|
|
7170
|
+
workerLoadTimeMs: Math.round(result.loadTimeMs),
|
|
7171
|
+
inputs: result.inputNames,
|
|
7172
|
+
outputs: result.outputNames
|
|
7173
|
+
});
|
|
7174
|
+
span?.setAttributes({
|
|
7175
|
+
"model.backend": "wasm",
|
|
7176
|
+
"model.load_time_ms": loadTimeMs,
|
|
7177
|
+
"model.worker_load_time_ms": result.loadTimeMs
|
|
7178
|
+
});
|
|
7179
|
+
span?.end();
|
|
7180
|
+
telemetry?.recordHistogram("omote.model.load_time", loadTimeMs, {
|
|
7181
|
+
model: "wav2arkit_cpu-worker",
|
|
7182
|
+
backend: "wasm"
|
|
7183
|
+
});
|
|
7184
|
+
return {
|
|
7185
|
+
backend: "wasm",
|
|
7186
|
+
loadTimeMs,
|
|
7187
|
+
inputNames: result.inputNames,
|
|
7188
|
+
outputNames: result.outputNames
|
|
7189
|
+
};
|
|
7190
|
+
} catch (error) {
|
|
7191
|
+
span?.endWithError(error instanceof Error ? error : new Error(String(error)));
|
|
7192
|
+
telemetry?.incrementCounter("omote.errors.total", 1, {
|
|
7193
|
+
model: "wav2arkit_cpu-worker",
|
|
7194
|
+
error_type: "load_failed"
|
|
7195
|
+
});
|
|
7196
|
+
if (this.worker) {
|
|
7197
|
+
this.worker.terminate();
|
|
7198
|
+
this.worker = null;
|
|
7199
|
+
}
|
|
7200
|
+
throw error;
|
|
7201
|
+
} finally {
|
|
7202
|
+
this.isLoading = false;
|
|
7203
|
+
}
|
|
7204
|
+
}
|
|
7205
|
+
/**
|
|
7206
|
+
* Run inference on raw audio
|
|
7207
|
+
*
|
|
7208
|
+
* Accepts variable-length audio (not fixed to 16000 samples).
|
|
7209
|
+
* Output frames = ceil(30 * numSamples / 16000).
|
|
7210
|
+
*
|
|
7211
|
+
* @param audioSamples - Float32Array of raw audio at 16kHz
|
|
7212
|
+
* @param _identityIndex - Ignored (identity 11 is baked into the model)
|
|
7213
|
+
*/
|
|
7214
|
+
async infer(audioSamples, _identityIndex) {
|
|
7215
|
+
if (!this._isLoaded || !this.worker) {
|
|
7216
|
+
throw new Error("Model not loaded. Call load() first.");
|
|
7217
|
+
}
|
|
7218
|
+
if (this.poisoned) {
|
|
7219
|
+
throw new Error("Wav2ArkitCpu worker session timed out \u2014 inference unavailable until page reload");
|
|
7220
|
+
}
|
|
7221
|
+
const audioCopy = new Float32Array(audioSamples);
|
|
7222
|
+
return this.queueInference(audioCopy);
|
|
7223
|
+
}
|
|
7224
|
+
/**
|
|
7225
|
+
* Queue inference to serialize worker calls
|
|
7226
|
+
*/
|
|
7227
|
+
queueInference(audioSamples) {
|
|
4326
7228
|
return new Promise((resolve, reject) => {
|
|
4327
7229
|
this.inferenceQueue = this.inferenceQueue.then(async () => {
|
|
4328
7230
|
const telemetry = getTelemetry();
|
|
4329
|
-
const span = telemetry?.startSpan("
|
|
4330
|
-
"inference.backend":
|
|
4331
|
-
"inference.input_samples":
|
|
7231
|
+
const span = telemetry?.startSpan("Wav2ArkitCpuWorker.infer", {
|
|
7232
|
+
"inference.backend": "wasm",
|
|
7233
|
+
"inference.input_samples": audioSamples.length
|
|
4332
7234
|
});
|
|
4333
7235
|
try {
|
|
4334
7236
|
const startTime = performance.now();
|
|
4335
|
-
const
|
|
7237
|
+
const result = await this.sendMessage(
|
|
7238
|
+
{
|
|
7239
|
+
type: "infer",
|
|
7240
|
+
audio: audioSamples
|
|
7241
|
+
},
|
|
7242
|
+
"result",
|
|
7243
|
+
INFERENCE_TIMEOUT_MS2
|
|
7244
|
+
);
|
|
4336
7245
|
const inferenceTimeMs = performance.now() - startTime;
|
|
4337
|
-
const
|
|
4338
|
-
|
|
4339
|
-
throw new Error("Missing blendshapes output from model");
|
|
4340
|
-
}
|
|
4341
|
-
const blendshapeData = blendshapeOutput.data;
|
|
4342
|
-
const numFrames = blendshapeOutput.dims[1];
|
|
4343
|
-
const numBlendshapes = blendshapeOutput.dims[2];
|
|
7246
|
+
const flatBuffer = result.blendshapes;
|
|
7247
|
+
const { numFrames, numBlendshapes } = result;
|
|
4344
7248
|
const blendshapes = [];
|
|
4345
7249
|
for (let f = 0; f < numFrames; f++) {
|
|
4346
|
-
|
|
4347
|
-
const symmetrized = symmetrizeBlendshapes(rawFrame);
|
|
4348
|
-
blendshapes.push(symmetrized);
|
|
7250
|
+
blendshapes.push(flatBuffer.slice(f * numBlendshapes, (f + 1) * numBlendshapes));
|
|
4349
7251
|
}
|
|
4350
|
-
|
|
7252
|
+
logger9.trace("Worker inference completed", {
|
|
4351
7253
|
inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
|
|
7254
|
+
workerTimeMs: Math.round(result.inferenceTimeMs * 100) / 100,
|
|
4352
7255
|
numFrames,
|
|
4353
|
-
inputSamples
|
|
7256
|
+
inputSamples: audioSamples.length
|
|
4354
7257
|
});
|
|
4355
7258
|
span?.setAttributes({
|
|
4356
7259
|
"inference.duration_ms": inferenceTimeMs,
|
|
7260
|
+
"inference.worker_duration_ms": result.inferenceTimeMs,
|
|
4357
7261
|
"inference.frames": numFrames
|
|
4358
7262
|
});
|
|
4359
7263
|
span?.end();
|
|
4360
7264
|
telemetry?.recordHistogram("omote.inference.latency", inferenceTimeMs, {
|
|
4361
|
-
model: "wav2arkit_cpu",
|
|
4362
|
-
backend:
|
|
7265
|
+
model: "wav2arkit_cpu-worker",
|
|
7266
|
+
backend: "wasm"
|
|
4363
7267
|
});
|
|
4364
7268
|
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
4365
|
-
model: "wav2arkit_cpu",
|
|
4366
|
-
backend:
|
|
7269
|
+
model: "wav2arkit_cpu-worker",
|
|
7270
|
+
backend: "wasm",
|
|
4367
7271
|
status: "success"
|
|
4368
7272
|
});
|
|
4369
7273
|
resolve({
|
|
@@ -4372,10 +7276,20 @@ var Wav2ArkitCpuInference = class {
|
|
|
4372
7276
|
inferenceTimeMs
|
|
4373
7277
|
});
|
|
4374
7278
|
} catch (err) {
|
|
7279
|
+
const errMsg = err instanceof Error ? err.message : String(err);
|
|
7280
|
+
if (errMsg.includes("timed out")) {
|
|
7281
|
+
this.poisoned = true;
|
|
7282
|
+
logger9.error("CRITICAL: Worker inference timed out \u2014 Wav2ArkitCpu worker is dead. Page reload required.", {
|
|
7283
|
+
backend: "wasm",
|
|
7284
|
+
timeoutMs: INFERENCE_TIMEOUT_MS2
|
|
7285
|
+
});
|
|
7286
|
+
} else {
|
|
7287
|
+
logger9.error("Worker inference failed", { error: errMsg, backend: "wasm" });
|
|
7288
|
+
}
|
|
4375
7289
|
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
4376
7290
|
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
4377
|
-
model: "wav2arkit_cpu",
|
|
4378
|
-
backend:
|
|
7291
|
+
model: "wav2arkit_cpu-worker",
|
|
7292
|
+
backend: "wasm",
|
|
4379
7293
|
status: "error"
|
|
4380
7294
|
});
|
|
4381
7295
|
reject(err);
|
|
@@ -4384,37 +7298,62 @@ var Wav2ArkitCpuInference = class {
|
|
|
4384
7298
|
});
|
|
4385
7299
|
}
|
|
4386
7300
|
/**
|
|
4387
|
-
* Dispose of the
|
|
7301
|
+
* Dispose of the worker and free resources
|
|
4388
7302
|
*/
|
|
4389
7303
|
async dispose() {
|
|
4390
|
-
if (this.
|
|
4391
|
-
|
|
4392
|
-
|
|
7304
|
+
if (this.worker) {
|
|
7305
|
+
try {
|
|
7306
|
+
await this.sendMessage({ type: "dispose" }, "disposed", INFERENCE_TIMEOUT_MS2);
|
|
7307
|
+
} catch {
|
|
7308
|
+
}
|
|
7309
|
+
this.worker.terminate();
|
|
7310
|
+
this.worker = null;
|
|
4393
7311
|
}
|
|
7312
|
+
this._isLoaded = false;
|
|
7313
|
+
this.poisoned = false;
|
|
7314
|
+
this.pendingResolvers.clear();
|
|
7315
|
+
}
|
|
7316
|
+
/**
|
|
7317
|
+
* Check if Web Workers are supported
|
|
7318
|
+
*/
|
|
7319
|
+
static isSupported() {
|
|
7320
|
+
return typeof Worker !== "undefined";
|
|
4394
7321
|
}
|
|
4395
7322
|
};
|
|
4396
7323
|
|
|
4397
7324
|
// src/inference/createLipSync.ts
|
|
4398
|
-
var
|
|
7325
|
+
var logger10 = createLogger("createLipSync");
|
|
4399
7326
|
function createLipSync(config) {
|
|
4400
7327
|
const mode = config.mode ?? "auto";
|
|
4401
7328
|
const fallbackOnError = config.fallbackOnError ?? true;
|
|
4402
7329
|
let useCpu;
|
|
4403
7330
|
if (mode === "cpu") {
|
|
4404
7331
|
useCpu = true;
|
|
4405
|
-
|
|
7332
|
+
logger10.info("Forcing CPU lip sync model (wav2arkit_cpu)");
|
|
4406
7333
|
} else if (mode === "gpu") {
|
|
4407
7334
|
useCpu = false;
|
|
4408
|
-
|
|
7335
|
+
logger10.info("Forcing GPU lip sync model (Wav2Vec2)");
|
|
4409
7336
|
} else {
|
|
4410
7337
|
useCpu = shouldUseCpuLipSync();
|
|
4411
|
-
|
|
7338
|
+
logger10.info("Auto-detected lip sync model", {
|
|
4412
7339
|
useCpu,
|
|
4413
7340
|
isSafari: isSafari()
|
|
4414
7341
|
});
|
|
4415
7342
|
}
|
|
4416
7343
|
if (useCpu) {
|
|
4417
|
-
|
|
7344
|
+
if (config.unifiedWorker) {
|
|
7345
|
+
logger10.info("Creating Wav2ArkitCpuUnifiedAdapter (404MB, WASM, shared unified worker)");
|
|
7346
|
+
return new Wav2ArkitCpuUnifiedAdapter(config.unifiedWorker, {
|
|
7347
|
+
modelUrl: config.cpuModelUrl
|
|
7348
|
+
});
|
|
7349
|
+
}
|
|
7350
|
+
if (config.useWorker && Wav2ArkitCpuWorker.isSupported() && !isIOS()) {
|
|
7351
|
+
logger10.info("Creating Wav2ArkitCpuWorker (404MB, WASM, off-main-thread)");
|
|
7352
|
+
return new Wav2ArkitCpuWorker({
|
|
7353
|
+
modelUrl: config.cpuModelUrl
|
|
7354
|
+
});
|
|
7355
|
+
}
|
|
7356
|
+
logger10.info("Creating Wav2ArkitCpuInference (404MB, WASM)");
|
|
4418
7357
|
return new Wav2ArkitCpuInference({
|
|
4419
7358
|
modelUrl: config.cpuModelUrl
|
|
4420
7359
|
});
|
|
@@ -4426,10 +7365,10 @@ function createLipSync(config) {
|
|
|
4426
7365
|
numIdentityClasses: config.numIdentityClasses
|
|
4427
7366
|
});
|
|
4428
7367
|
if (fallbackOnError) {
|
|
4429
|
-
|
|
7368
|
+
logger10.info("Creating Wav2Vec2Inference with CPU fallback");
|
|
4430
7369
|
return new LipSyncWithFallback(gpuInstance, config);
|
|
4431
7370
|
}
|
|
4432
|
-
|
|
7371
|
+
logger10.info("Creating Wav2Vec2Inference (no fallback)");
|
|
4433
7372
|
return gpuInstance;
|
|
4434
7373
|
}
|
|
4435
7374
|
var LipSyncWithFallback = class {
|
|
@@ -4455,16 +7394,28 @@ var LipSyncWithFallback = class {
|
|
|
4455
7394
|
}
|
|
4456
7395
|
}
|
|
4457
7396
|
async fallbackToCpu(reason) {
|
|
4458
|
-
|
|
7397
|
+
logger10.warn("GPU model load failed, falling back to CPU model", { reason });
|
|
4459
7398
|
try {
|
|
4460
7399
|
await this.implementation.dispose();
|
|
4461
7400
|
} catch {
|
|
4462
7401
|
}
|
|
4463
|
-
this.
|
|
4464
|
-
|
|
4465
|
-
|
|
7402
|
+
if (this.config.unifiedWorker) {
|
|
7403
|
+
this.implementation = new Wav2ArkitCpuUnifiedAdapter(this.config.unifiedWorker, {
|
|
7404
|
+
modelUrl: this.config.cpuModelUrl
|
|
7405
|
+
});
|
|
7406
|
+
logger10.info("Fallback to Wav2ArkitCpuUnifiedAdapter successful");
|
|
7407
|
+
} else if (this.config.useWorker && Wav2ArkitCpuWorker.isSupported() && !isIOS()) {
|
|
7408
|
+
this.implementation = new Wav2ArkitCpuWorker({
|
|
7409
|
+
modelUrl: this.config.cpuModelUrl
|
|
7410
|
+
});
|
|
7411
|
+
logger10.info("Fallback to Wav2ArkitCpuWorker successful");
|
|
7412
|
+
} else {
|
|
7413
|
+
this.implementation = new Wav2ArkitCpuInference({
|
|
7414
|
+
modelUrl: this.config.cpuModelUrl
|
|
7415
|
+
});
|
|
7416
|
+
logger10.info("Fallback to Wav2ArkitCpuInference successful");
|
|
7417
|
+
}
|
|
4466
7418
|
this.hasFallenBack = true;
|
|
4467
|
-
logger6.info("Fallback to Wav2ArkitCpuInference successful");
|
|
4468
7419
|
return await this.implementation.load();
|
|
4469
7420
|
}
|
|
4470
7421
|
async infer(audioSamples, identityIndex) {
|
|
@@ -4476,7 +7427,7 @@ var LipSyncWithFallback = class {
|
|
|
4476
7427
|
};
|
|
4477
7428
|
|
|
4478
7429
|
// src/inference/SileroVADInference.ts
|
|
4479
|
-
var
|
|
7430
|
+
var logger11 = createLogger("SileroVAD");
|
|
4480
7431
|
var SileroVADInference = class {
|
|
4481
7432
|
constructor(config) {
|
|
4482
7433
|
this.session = null;
|
|
@@ -4550,23 +7501,23 @@ var SileroVADInference = class {
|
|
|
4550
7501
|
"model.sample_rate": this.config.sampleRate
|
|
4551
7502
|
});
|
|
4552
7503
|
try {
|
|
4553
|
-
|
|
7504
|
+
logger11.info("Loading ONNX Runtime...", { preference: this.config.backend });
|
|
4554
7505
|
const { ort, backend } = await getOnnxRuntimeForPreference(this.config.backend);
|
|
4555
7506
|
this.ort = ort;
|
|
4556
7507
|
this._backend = backend;
|
|
4557
|
-
|
|
7508
|
+
logger11.info("ONNX Runtime loaded", { backend: this._backend });
|
|
4558
7509
|
const cache = getModelCache();
|
|
4559
7510
|
const modelUrl = this.config.modelUrl;
|
|
4560
7511
|
const isCached = await cache.has(modelUrl);
|
|
4561
7512
|
let modelBuffer;
|
|
4562
7513
|
if (isCached) {
|
|
4563
|
-
|
|
7514
|
+
logger11.debug("Loading model from cache", { modelUrl });
|
|
4564
7515
|
modelBuffer = await cache.get(modelUrl);
|
|
4565
7516
|
} else {
|
|
4566
|
-
|
|
7517
|
+
logger11.debug("Fetching and caching model", { modelUrl });
|
|
4567
7518
|
modelBuffer = await fetchWithCache(modelUrl);
|
|
4568
7519
|
}
|
|
4569
|
-
|
|
7520
|
+
logger11.debug("Creating ONNX session", {
|
|
4570
7521
|
size: formatBytes(modelBuffer.byteLength),
|
|
4571
7522
|
backend: this._backend
|
|
4572
7523
|
});
|
|
@@ -4575,7 +7526,7 @@ var SileroVADInference = class {
|
|
|
4575
7526
|
this.session = await ort.InferenceSession.create(modelData, sessionOptions);
|
|
4576
7527
|
this.reset();
|
|
4577
7528
|
const loadTimeMs = performance.now() - startTime;
|
|
4578
|
-
|
|
7529
|
+
logger11.info("Model loaded successfully", {
|
|
4579
7530
|
backend: this._backend,
|
|
4580
7531
|
loadTimeMs: Math.round(loadTimeMs),
|
|
4581
7532
|
sampleRate: this.config.sampleRate,
|
|
@@ -4630,7 +7581,7 @@ var SileroVADInference = class {
|
|
|
4630
7581
|
[]
|
|
4631
7582
|
);
|
|
4632
7583
|
} catch (e) {
|
|
4633
|
-
|
|
7584
|
+
logger11.warn("BigInt64Array not available, using bigint array fallback", {
|
|
4634
7585
|
error: e instanceof Error ? e.message : String(e)
|
|
4635
7586
|
});
|
|
4636
7587
|
this.srTensor = new this.ort.Tensor(
|
|
@@ -4722,23 +7673,13 @@ var SileroVADInference = class {
|
|
|
4722
7673
|
}
|
|
4723
7674
|
return segments;
|
|
4724
7675
|
}
|
|
4725
|
-
/**
|
|
4726
|
-
* Calculate RMS energy of audio chunk
|
|
4727
|
-
*/
|
|
4728
|
-
calculateRMS(samples) {
|
|
4729
|
-
let sum = 0;
|
|
4730
|
-
for (let i = 0; i < samples.length; i++) {
|
|
4731
|
-
sum += samples[i] * samples[i];
|
|
4732
|
-
}
|
|
4733
|
-
return Math.sqrt(sum / samples.length);
|
|
4734
|
-
}
|
|
4735
7676
|
/**
|
|
4736
7677
|
* Queue inference to serialize ONNX session calls
|
|
4737
7678
|
*/
|
|
4738
7679
|
queueInference(audioChunk) {
|
|
4739
7680
|
const audioChunkCopy = new Float32Array(audioChunk);
|
|
4740
7681
|
const MIN_ENERGY_THRESHOLD = 1e-3;
|
|
4741
|
-
const rms =
|
|
7682
|
+
const rms = calculateRMS(audioChunkCopy);
|
|
4742
7683
|
if (rms < MIN_ENERGY_THRESHOLD) {
|
|
4743
7684
|
if (!this.wasSpeaking) {
|
|
4744
7685
|
this.preSpeechBuffer.push(new Float32Array(audioChunkCopy));
|
|
@@ -4746,7 +7687,7 @@ var SileroVADInference = class {
|
|
|
4746
7687
|
this.preSpeechBuffer.shift();
|
|
4747
7688
|
}
|
|
4748
7689
|
}
|
|
4749
|
-
|
|
7690
|
+
logger11.trace("Skipping VAD inference - audio too quiet", {
|
|
4750
7691
|
rms: Math.round(rms * 1e4) / 1e4,
|
|
4751
7692
|
threshold: MIN_ENERGY_THRESHOLD
|
|
4752
7693
|
});
|
|
@@ -4793,19 +7734,19 @@ var SileroVADInference = class {
|
|
|
4793
7734
|
[2, 1, 128]
|
|
4794
7735
|
);
|
|
4795
7736
|
}
|
|
4796
|
-
this.context =
|
|
7737
|
+
this.context = audioChunkCopy.slice(-this.contextSize);
|
|
4797
7738
|
const inferenceTimeMs = performance.now() - startTime;
|
|
4798
7739
|
const isSpeech = probability > this.config.threshold;
|
|
4799
7740
|
let preSpeechChunks;
|
|
4800
7741
|
if (isSpeech && !this.wasSpeaking) {
|
|
4801
7742
|
preSpeechChunks = [...this.preSpeechBuffer];
|
|
4802
7743
|
this.preSpeechBuffer = [];
|
|
4803
|
-
|
|
7744
|
+
logger11.debug("Speech started with pre-speech buffer", {
|
|
4804
7745
|
preSpeechChunks: preSpeechChunks.length,
|
|
4805
7746
|
durationMs: Math.round(preSpeechChunks.length * this.getChunkDurationMs())
|
|
4806
7747
|
});
|
|
4807
7748
|
} else if (!isSpeech && !this.wasSpeaking) {
|
|
4808
|
-
this.preSpeechBuffer.push(new Float32Array(
|
|
7749
|
+
this.preSpeechBuffer.push(new Float32Array(audioChunkCopy));
|
|
4809
7750
|
if (this.preSpeechBuffer.length > this.config.preSpeechBufferChunks) {
|
|
4810
7751
|
this.preSpeechBuffer.shift();
|
|
4811
7752
|
}
|
|
@@ -4813,7 +7754,7 @@ var SileroVADInference = class {
|
|
|
4813
7754
|
this.preSpeechBuffer = [];
|
|
4814
7755
|
}
|
|
4815
7756
|
this.wasSpeaking = isSpeech;
|
|
4816
|
-
|
|
7757
|
+
logger11.trace("VAD inference completed", {
|
|
4817
7758
|
probability: Math.round(probability * 1e3) / 1e3,
|
|
4818
7759
|
isSpeech,
|
|
4819
7760
|
inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100
|
|
@@ -4840,13 +7781,30 @@ var SileroVADInference = class {
|
|
|
4840
7781
|
preSpeechChunks
|
|
4841
7782
|
});
|
|
4842
7783
|
} catch (err) {
|
|
4843
|
-
|
|
4844
|
-
|
|
4845
|
-
|
|
4846
|
-
|
|
4847
|
-
|
|
4848
|
-
|
|
4849
|
-
|
|
7784
|
+
if (typeof err === "number") {
|
|
7785
|
+
const oomError = new Error(
|
|
7786
|
+
`SileroVAD inference failed with raw C++ exception pointer (0x${err.toString(16)}). This is likely an OOM crash in WASM. Try reducing concurrent model sessions or reloading the page.`
|
|
7787
|
+
);
|
|
7788
|
+
logger11.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
|
|
7789
|
+
pointer: `0x${err.toString(16)}`,
|
|
7790
|
+
backend: this._backend
|
|
7791
|
+
});
|
|
7792
|
+
span?.endWithError(oomError);
|
|
7793
|
+
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
7794
|
+
model: "silero-vad",
|
|
7795
|
+
backend: this._backend,
|
|
7796
|
+
status: "error"
|
|
7797
|
+
});
|
|
7798
|
+
reject(oomError);
|
|
7799
|
+
} else {
|
|
7800
|
+
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
7801
|
+
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
7802
|
+
model: "silero-vad",
|
|
7803
|
+
backend: this._backend,
|
|
7804
|
+
status: "error"
|
|
7805
|
+
});
|
|
7806
|
+
reject(err);
|
|
7807
|
+
}
|
|
4850
7808
|
}
|
|
4851
7809
|
});
|
|
4852
7810
|
});
|
|
@@ -4870,19 +7828,27 @@ var SileroVADInference = class {
|
|
|
4870
7828
|
SileroVADInference.isWebGPUAvailable = isWebGPUAvailable;
|
|
4871
7829
|
|
|
4872
7830
|
// src/inference/SileroVADWorker.ts
|
|
4873
|
-
var
|
|
4874
|
-
var
|
|
4875
|
-
var
|
|
4876
|
-
var
|
|
4877
|
-
|
|
7831
|
+
var logger12 = createLogger("SileroVADWorker");
|
|
7832
|
+
var WASM_CDN_PATH5 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
|
|
7833
|
+
var LOAD_TIMEOUT_MS3 = 1e4;
|
|
7834
|
+
var INFERENCE_TIMEOUT_MS3 = 1e3;
|
|
7835
|
+
function resolveUrl4(url) {
|
|
7836
|
+
if (/^https?:\/\//i.test(url) || /^blob:/i.test(url)) return url;
|
|
7837
|
+
try {
|
|
7838
|
+
return new URL(url, globalThis.location?.origin ?? "https://localhost").href;
|
|
7839
|
+
} catch {
|
|
7840
|
+
return url;
|
|
7841
|
+
}
|
|
7842
|
+
}
|
|
7843
|
+
var WORKER_SCRIPT4 = `
|
|
4878
7844
|
// Silero VAD Worker Script
|
|
4879
7845
|
// Loaded via Blob URL - no separate file needed
|
|
4880
7846
|
|
|
4881
|
-
|
|
4882
|
-
|
|
4883
|
-
|
|
4884
|
-
|
|
4885
|
-
|
|
7847
|
+
var ort = null;
|
|
7848
|
+
var session = null;
|
|
7849
|
+
var sampleRate = 16000;
|
|
7850
|
+
var chunkSize = 512;
|
|
7851
|
+
var contextSize = 64;
|
|
4886
7852
|
|
|
4887
7853
|
/**
|
|
4888
7854
|
* Load ONNX Runtime from CDN
|
|
@@ -5132,7 +8098,7 @@ var SileroVADWorker = class {
|
|
|
5132
8098
|
* Create the worker from inline script
|
|
5133
8099
|
*/
|
|
5134
8100
|
createWorker() {
|
|
5135
|
-
const blob = new Blob([
|
|
8101
|
+
const blob = new Blob([WORKER_SCRIPT4], { type: "application/javascript" });
|
|
5136
8102
|
const blobUrl = URL.createObjectURL(blob);
|
|
5137
8103
|
const worker = new Worker(blobUrl);
|
|
5138
8104
|
URL.revokeObjectURL(blobUrl);
|
|
@@ -5140,7 +8106,7 @@ var SileroVADWorker = class {
|
|
|
5140
8106
|
this.handleWorkerMessage(event.data);
|
|
5141
8107
|
};
|
|
5142
8108
|
worker.onerror = (error) => {
|
|
5143
|
-
|
|
8109
|
+
logger12.error("Worker error", { error: error.message });
|
|
5144
8110
|
for (const [, resolver] of this.pendingResolvers) {
|
|
5145
8111
|
resolver.reject(new Error(`Worker error: ${error.message}`));
|
|
5146
8112
|
}
|
|
@@ -5216,25 +8182,25 @@ var SileroVADWorker = class {
|
|
|
5216
8182
|
"model.sample_rate": this.config.sampleRate
|
|
5217
8183
|
});
|
|
5218
8184
|
try {
|
|
5219
|
-
|
|
8185
|
+
logger12.info("Creating VAD worker...");
|
|
5220
8186
|
this.worker = this.createWorker();
|
|
5221
|
-
|
|
8187
|
+
logger12.info("Loading model in worker...", {
|
|
5222
8188
|
modelUrl: this.config.modelUrl,
|
|
5223
8189
|
sampleRate: this.config.sampleRate
|
|
5224
8190
|
});
|
|
5225
8191
|
const result = await this.sendMessage(
|
|
5226
8192
|
{
|
|
5227
8193
|
type: "load",
|
|
5228
|
-
modelUrl: this.config.modelUrl,
|
|
8194
|
+
modelUrl: resolveUrl4(this.config.modelUrl),
|
|
5229
8195
|
sampleRate: this.config.sampleRate,
|
|
5230
|
-
wasmPaths:
|
|
8196
|
+
wasmPaths: WASM_CDN_PATH5
|
|
5231
8197
|
},
|
|
5232
8198
|
"loaded",
|
|
5233
|
-
|
|
8199
|
+
LOAD_TIMEOUT_MS3
|
|
5234
8200
|
);
|
|
5235
8201
|
this._isLoaded = true;
|
|
5236
8202
|
const loadTimeMs = performance.now() - startTime;
|
|
5237
|
-
|
|
8203
|
+
logger12.info("VAD worker loaded successfully", {
|
|
5238
8204
|
backend: "wasm",
|
|
5239
8205
|
loadTimeMs: Math.round(loadTimeMs),
|
|
5240
8206
|
workerLoadTimeMs: Math.round(result.loadTimeMs),
|
|
@@ -5285,7 +8251,7 @@ var SileroVADWorker = class {
|
|
|
5285
8251
|
const result = await this.sendMessage(
|
|
5286
8252
|
{ type: "reset" },
|
|
5287
8253
|
"reset",
|
|
5288
|
-
|
|
8254
|
+
INFERENCE_TIMEOUT_MS3
|
|
5289
8255
|
);
|
|
5290
8256
|
this.state = result.state;
|
|
5291
8257
|
this.context = new Float32Array(this.contextSize);
|
|
@@ -5331,7 +8297,7 @@ var SileroVADWorker = class {
|
|
|
5331
8297
|
context: this.context
|
|
5332
8298
|
},
|
|
5333
8299
|
"result",
|
|
5334
|
-
|
|
8300
|
+
INFERENCE_TIMEOUT_MS3
|
|
5335
8301
|
);
|
|
5336
8302
|
this.state = result.state;
|
|
5337
8303
|
this.context = audioChunkCopy.slice(-this.contextSize);
|
|
@@ -5341,7 +8307,7 @@ var SileroVADWorker = class {
|
|
|
5341
8307
|
if (isSpeech && !this.wasSpeaking) {
|
|
5342
8308
|
preSpeechChunks = [...this.preSpeechBuffer];
|
|
5343
8309
|
this.preSpeechBuffer = [];
|
|
5344
|
-
|
|
8310
|
+
logger12.debug("Speech started with pre-speech buffer", {
|
|
5345
8311
|
preSpeechChunks: preSpeechChunks.length,
|
|
5346
8312
|
durationMs: Math.round(preSpeechChunks.length * this.getChunkDurationMs())
|
|
5347
8313
|
});
|
|
@@ -5354,7 +8320,7 @@ var SileroVADWorker = class {
|
|
|
5354
8320
|
this.preSpeechBuffer = [];
|
|
5355
8321
|
}
|
|
5356
8322
|
this.wasSpeaking = isSpeech;
|
|
5357
|
-
|
|
8323
|
+
logger12.trace("VAD worker inference completed", {
|
|
5358
8324
|
probability: Math.round(result.probability * 1e3) / 1e3,
|
|
5359
8325
|
isSpeech,
|
|
5360
8326
|
inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
|
|
@@ -5400,7 +8366,7 @@ var SileroVADWorker = class {
|
|
|
5400
8366
|
async dispose() {
|
|
5401
8367
|
if (this.worker) {
|
|
5402
8368
|
try {
|
|
5403
|
-
await this.sendMessage({ type: "dispose" }, "disposed",
|
|
8369
|
+
await this.sendMessage({ type: "dispose" }, "disposed", INFERENCE_TIMEOUT_MS3);
|
|
5404
8370
|
} catch {
|
|
5405
8371
|
}
|
|
5406
8372
|
this.worker.terminate();
|
|
@@ -5422,40 +8388,44 @@ var SileroVADWorker = class {
|
|
|
5422
8388
|
};
|
|
5423
8389
|
|
|
5424
8390
|
// src/inference/createSileroVAD.ts
|
|
5425
|
-
var
|
|
8391
|
+
var logger13 = createLogger("createSileroVAD");
|
|
5426
8392
|
function supportsVADWorker() {
|
|
5427
8393
|
if (typeof Worker === "undefined") {
|
|
5428
|
-
|
|
8394
|
+
logger13.debug("Worker not supported: Worker constructor undefined");
|
|
5429
8395
|
return false;
|
|
5430
8396
|
}
|
|
5431
8397
|
if (typeof URL === "undefined" || typeof URL.createObjectURL === "undefined") {
|
|
5432
|
-
|
|
8398
|
+
logger13.debug("Worker not supported: URL.createObjectURL unavailable");
|
|
5433
8399
|
return false;
|
|
5434
8400
|
}
|
|
5435
8401
|
if (typeof Blob === "undefined") {
|
|
5436
|
-
|
|
8402
|
+
logger13.debug("Worker not supported: Blob constructor unavailable");
|
|
5437
8403
|
return false;
|
|
5438
8404
|
}
|
|
5439
8405
|
return true;
|
|
5440
8406
|
}
|
|
5441
8407
|
function createSileroVAD(config) {
|
|
8408
|
+
if (config.unifiedWorker) {
|
|
8409
|
+
logger13.info("Creating SileroVADUnifiedAdapter (shared unified worker)");
|
|
8410
|
+
return new SileroVADUnifiedAdapter(config.unifiedWorker, config);
|
|
8411
|
+
}
|
|
5442
8412
|
const fallbackOnError = config.fallbackOnError ?? true;
|
|
5443
8413
|
let useWorker;
|
|
5444
8414
|
if (config.useWorker !== void 0) {
|
|
5445
8415
|
useWorker = config.useWorker;
|
|
5446
|
-
|
|
8416
|
+
logger13.debug("Worker preference explicitly set", { useWorker });
|
|
5447
8417
|
} else {
|
|
5448
8418
|
const workerSupported = supportsVADWorker();
|
|
5449
8419
|
const onMobile = isMobile();
|
|
5450
8420
|
useWorker = workerSupported && !onMobile;
|
|
5451
|
-
|
|
8421
|
+
logger13.debug("Auto-detected Worker preference", {
|
|
5452
8422
|
useWorker,
|
|
5453
8423
|
workerSupported,
|
|
5454
8424
|
onMobile
|
|
5455
8425
|
});
|
|
5456
8426
|
}
|
|
5457
8427
|
if (useWorker) {
|
|
5458
|
-
|
|
8428
|
+
logger13.info("Creating SileroVADWorker (off-main-thread)");
|
|
5459
8429
|
const worker = new SileroVADWorker({
|
|
5460
8430
|
modelUrl: config.modelUrl,
|
|
5461
8431
|
sampleRate: config.sampleRate,
|
|
@@ -5467,7 +8437,7 @@ function createSileroVAD(config) {
|
|
|
5467
8437
|
}
|
|
5468
8438
|
return worker;
|
|
5469
8439
|
}
|
|
5470
|
-
|
|
8440
|
+
logger13.info("Creating SileroVADInference (main thread)");
|
|
5471
8441
|
return new SileroVADInference(config);
|
|
5472
8442
|
}
|
|
5473
8443
|
var VADWorkerWithFallback = class {
|
|
@@ -5493,7 +8463,7 @@ var VADWorkerWithFallback = class {
|
|
|
5493
8463
|
try {
|
|
5494
8464
|
return await this.implementation.load();
|
|
5495
8465
|
} catch (error) {
|
|
5496
|
-
|
|
8466
|
+
logger13.warn("Worker load failed, falling back to main thread", {
|
|
5497
8467
|
error: error instanceof Error ? error.message : String(error)
|
|
5498
8468
|
});
|
|
5499
8469
|
try {
|
|
@@ -5502,7 +8472,7 @@ var VADWorkerWithFallback = class {
|
|
|
5502
8472
|
}
|
|
5503
8473
|
this.implementation = new SileroVADInference(this.config);
|
|
5504
8474
|
this.hasFallenBack = true;
|
|
5505
|
-
|
|
8475
|
+
logger13.info("Fallback to SileroVADInference successful");
|
|
5506
8476
|
return await this.implementation.load();
|
|
5507
8477
|
}
|
|
5508
8478
|
}
|
|
@@ -5524,7 +8494,7 @@ var VADWorkerWithFallback = class {
|
|
|
5524
8494
|
};
|
|
5525
8495
|
|
|
5526
8496
|
// src/inference/SafariSpeechRecognition.ts
|
|
5527
|
-
var
|
|
8497
|
+
var logger14 = createLogger("SafariSpeech");
|
|
5528
8498
|
var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
5529
8499
|
constructor(config = {}) {
|
|
5530
8500
|
this.recognition = null;
|
|
@@ -5543,7 +8513,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
5543
8513
|
interimResults: config.interimResults ?? true,
|
|
5544
8514
|
maxAlternatives: config.maxAlternatives ?? 1
|
|
5545
8515
|
};
|
|
5546
|
-
|
|
8516
|
+
logger14.debug("SafariSpeechRecognition created", {
|
|
5547
8517
|
language: this.config.language,
|
|
5548
8518
|
continuous: this.config.continuous
|
|
5549
8519
|
});
|
|
@@ -5604,7 +8574,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
5604
8574
|
*/
|
|
5605
8575
|
async start() {
|
|
5606
8576
|
if (this.isListening) {
|
|
5607
|
-
|
|
8577
|
+
logger14.warn("Already listening");
|
|
5608
8578
|
return;
|
|
5609
8579
|
}
|
|
5610
8580
|
if (!_SafariSpeechRecognition.isAvailable()) {
|
|
@@ -5634,7 +8604,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
5634
8604
|
this.isListening = true;
|
|
5635
8605
|
this.startTime = performance.now();
|
|
5636
8606
|
this.accumulatedText = "";
|
|
5637
|
-
|
|
8607
|
+
logger14.info("Speech recognition started", {
|
|
5638
8608
|
language: this.config.language
|
|
5639
8609
|
});
|
|
5640
8610
|
span?.end();
|
|
@@ -5649,7 +8619,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
5649
8619
|
*/
|
|
5650
8620
|
async stop() {
|
|
5651
8621
|
if (!this.isListening || !this.recognition) {
|
|
5652
|
-
|
|
8622
|
+
logger14.warn("Not currently listening");
|
|
5653
8623
|
return {
|
|
5654
8624
|
text: this.accumulatedText,
|
|
5655
8625
|
language: this.config.language,
|
|
@@ -5678,7 +8648,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
5678
8648
|
if (this.recognition && this.isListening) {
|
|
5679
8649
|
this.recognition.abort();
|
|
5680
8650
|
this.isListening = false;
|
|
5681
|
-
|
|
8651
|
+
logger14.info("Speech recognition aborted");
|
|
5682
8652
|
}
|
|
5683
8653
|
}
|
|
5684
8654
|
/**
|
|
@@ -5709,7 +8679,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
5709
8679
|
this.isListening = false;
|
|
5710
8680
|
this.resultCallbacks = [];
|
|
5711
8681
|
this.errorCallbacks = [];
|
|
5712
|
-
|
|
8682
|
+
logger14.debug("SafariSpeechRecognition disposed");
|
|
5713
8683
|
}
|
|
5714
8684
|
/**
|
|
5715
8685
|
* Set up event handlers for the recognition instance
|
|
@@ -5737,7 +8707,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
5737
8707
|
confidence: alternative.confidence
|
|
5738
8708
|
};
|
|
5739
8709
|
this.emitResult(speechResult);
|
|
5740
|
-
|
|
8710
|
+
logger14.trace("Speech result", {
|
|
5741
8711
|
text: text.substring(0, 50),
|
|
5742
8712
|
isFinal,
|
|
5743
8713
|
confidence: alternative.confidence
|
|
@@ -5747,12 +8717,12 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
5747
8717
|
span?.end();
|
|
5748
8718
|
} catch (error) {
|
|
5749
8719
|
span?.endWithError(error instanceof Error ? error : new Error(String(error)));
|
|
5750
|
-
|
|
8720
|
+
logger14.error("Error processing speech result", { error });
|
|
5751
8721
|
}
|
|
5752
8722
|
};
|
|
5753
8723
|
this.recognition.onerror = (event) => {
|
|
5754
8724
|
const error = new Error(`Speech recognition error: ${event.error} - ${event.message}`);
|
|
5755
|
-
|
|
8725
|
+
logger14.error("Speech recognition error", { error: event.error, message: event.message });
|
|
5756
8726
|
this.emitError(error);
|
|
5757
8727
|
if (this.stopRejecter) {
|
|
5758
8728
|
this.stopRejecter(error);
|
|
@@ -5762,7 +8732,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
5762
8732
|
};
|
|
5763
8733
|
this.recognition.onend = () => {
|
|
5764
8734
|
this.isListening = false;
|
|
5765
|
-
|
|
8735
|
+
logger14.info("Speech recognition ended", {
|
|
5766
8736
|
totalText: this.accumulatedText.length,
|
|
5767
8737
|
durationMs: performance.now() - this.startTime
|
|
5768
8738
|
});
|
|
@@ -5779,13 +8749,13 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
5779
8749
|
}
|
|
5780
8750
|
};
|
|
5781
8751
|
this.recognition.onstart = () => {
|
|
5782
|
-
|
|
8752
|
+
logger14.debug("Speech recognition started by browser");
|
|
5783
8753
|
};
|
|
5784
8754
|
this.recognition.onspeechstart = () => {
|
|
5785
|
-
|
|
8755
|
+
logger14.debug("Speech detected");
|
|
5786
8756
|
};
|
|
5787
8757
|
this.recognition.onspeechend = () => {
|
|
5788
|
-
|
|
8758
|
+
logger14.debug("Speech ended");
|
|
5789
8759
|
};
|
|
5790
8760
|
}
|
|
5791
8761
|
/**
|
|
@@ -5796,7 +8766,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
5796
8766
|
try {
|
|
5797
8767
|
callback(result);
|
|
5798
8768
|
} catch (error) {
|
|
5799
|
-
|
|
8769
|
+
logger14.error("Error in result callback", { error });
|
|
5800
8770
|
}
|
|
5801
8771
|
}
|
|
5802
8772
|
}
|
|
@@ -5808,7 +8778,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
5808
8778
|
try {
|
|
5809
8779
|
callback(error);
|
|
5810
8780
|
} catch (callbackError) {
|
|
5811
|
-
|
|
8781
|
+
logger14.error("Error in error callback", { error: callbackError });
|
|
5812
8782
|
}
|
|
5813
8783
|
}
|
|
5814
8784
|
}
|
|
@@ -6073,7 +9043,7 @@ var AgentCoreAdapter = class extends EventEmitter {
|
|
|
6073
9043
|
console.error("[AgentCore] VAD error during interruption detection:", error);
|
|
6074
9044
|
});
|
|
6075
9045
|
}
|
|
6076
|
-
const float32 = audio instanceof Float32Array ? audio :
|
|
9046
|
+
const float32 = audio instanceof Float32Array ? audio : int16ToFloat32(audio);
|
|
6077
9047
|
this.audioBuffer.push(float32);
|
|
6078
9048
|
this.scheduleTranscription();
|
|
6079
9049
|
}
|
|
@@ -6405,7 +9375,7 @@ var AgentCoreAdapter = class extends EventEmitter {
|
|
|
6405
9375
|
* Falls back to simple RMS if VAD not available
|
|
6406
9376
|
*/
|
|
6407
9377
|
async detectVoiceActivity(audio) {
|
|
6408
|
-
const float32 = audio instanceof Float32Array ? audio :
|
|
9378
|
+
const float32 = audio instanceof Float32Array ? audio : int16ToFloat32(audio);
|
|
6409
9379
|
if (this.vad) {
|
|
6410
9380
|
const chunkSize = this.vad.getChunkSize();
|
|
6411
9381
|
for (let i = 0; i + chunkSize <= float32.length; i += chunkSize) {
|
|
@@ -6424,13 +9394,6 @@ var AgentCoreAdapter = class extends EventEmitter {
|
|
|
6424
9394
|
const rms = Math.sqrt(sum / float32.length);
|
|
6425
9395
|
return rms > 0.02;
|
|
6426
9396
|
}
|
|
6427
|
-
int16ToFloat32(int16) {
|
|
6428
|
-
const float32 = new Float32Array(int16.length);
|
|
6429
|
-
for (let i = 0; i < int16.length; i++) {
|
|
6430
|
-
float32[i] = int16[i] / 32768;
|
|
6431
|
-
}
|
|
6432
|
-
return float32;
|
|
6433
|
-
}
|
|
6434
9397
|
base64ToArrayBuffer(base64) {
|
|
6435
9398
|
const binaryString = atob(base64);
|
|
6436
9399
|
const bytes = new Uint8Array(binaryString.length);
|
|
@@ -8277,13 +11240,19 @@ export {
|
|
|
8277
11240
|
RingBuffer,
|
|
8278
11241
|
SafariSpeechRecognition,
|
|
8279
11242
|
SenseVoiceInference,
|
|
11243
|
+
SenseVoiceUnifiedAdapter,
|
|
11244
|
+
SenseVoiceWorker,
|
|
8280
11245
|
SileroVADInference,
|
|
11246
|
+
SileroVADUnifiedAdapter,
|
|
8281
11247
|
SileroVADWorker,
|
|
8282
11248
|
SyncedAudioPipeline,
|
|
8283
11249
|
TenantManager,
|
|
8284
11250
|
UPPER_FACE_BLENDSHAPES,
|
|
11251
|
+
UnifiedInferenceWorker,
|
|
8285
11252
|
WAV2ARKIT_BLENDSHAPES,
|
|
8286
11253
|
Wav2ArkitCpuInference,
|
|
11254
|
+
Wav2ArkitCpuUnifiedAdapter,
|
|
11255
|
+
Wav2ArkitCpuWorker,
|
|
8287
11256
|
Wav2Vec2Inference,
|
|
8288
11257
|
applyCMVN,
|
|
8289
11258
|
applyLFR,
|
|
@@ -8297,6 +11266,7 @@ export {
|
|
|
8297
11266
|
createEmotionVector,
|
|
8298
11267
|
createLipSync,
|
|
8299
11268
|
createLogger,
|
|
11269
|
+
createSenseVoice,
|
|
8300
11270
|
createSessionWithFallback,
|
|
8301
11271
|
createSileroVAD,
|
|
8302
11272
|
ctcGreedyDecode,
|