@omote/core 0.4.4 → 0.4.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -312,7 +312,14 @@ var AudioScheduler = class {
312
312
  source.connect(gainNode);
313
313
  const scheduleTime = this.nextPlayTime;
314
314
  source.start(scheduleTime);
315
- this.scheduledSources.push({ source, gainNode });
315
+ const entry = { source, gainNode };
316
+ this.scheduledSources.push(entry);
317
+ source.onended = () => {
318
+ const idx = this.scheduledSources.indexOf(entry);
319
+ if (idx !== -1) {
320
+ this.scheduledSources.splice(idx, 1);
321
+ }
322
+ };
316
323
  const duration = audioData.length / ctx.sampleRate;
317
324
  this.nextPlayTime = scheduleTime + duration;
318
325
  return scheduleTime;
@@ -668,7 +675,7 @@ var LAMPipeline = class {
668
675
  }
669
676
  };
670
677
 
671
- // src/audio/SyncedAudioPipeline.ts
678
+ // src/audio/audioUtils.ts
672
679
  function pcm16ToFloat32(buffer) {
673
680
  const byteLen = buffer.byteLength & ~1;
674
681
  const int16 = byteLen === buffer.byteLength ? new Int16Array(buffer) : new Int16Array(buffer, 0, byteLen / 2);
@@ -678,6 +685,15 @@ function pcm16ToFloat32(buffer) {
678
685
  }
679
686
  return float32;
680
687
  }
688
+ function int16ToFloat32(int16) {
689
+ const float32 = new Float32Array(int16.length);
690
+ for (let i = 0; i < int16.length; i++) {
691
+ float32[i] = int16[i] / 32768;
692
+ }
693
+ return float32;
694
+ }
695
+
696
+ // src/audio/SyncedAudioPipeline.ts
681
697
  var SyncedAudioPipeline = class extends EventEmitter {
682
698
  constructor(options) {
683
699
  super();
@@ -2385,7 +2401,7 @@ function isIOSSafari() {
2385
2401
  function isIOS() {
2386
2402
  if (typeof navigator === "undefined") return false;
2387
2403
  const ua = navigator.userAgent.toLowerCase();
2388
- return /iphone|ipad|ipod/.test(ua);
2404
+ return /iphone|ipad|ipod/.test(ua) || /macintosh/.test(ua) && navigator.maxTouchPoints > 1;
2389
2405
  }
2390
2406
  function isAndroid() {
2391
2407
  if (typeof navigator === "undefined") return false;
@@ -3006,10 +3022,16 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
3006
3022
  });
3007
3023
  logger2.debug("Running warmup inference to initialize GPU context");
3008
3024
  const warmupStart = performance.now();
3009
- const silentAudio = new Float32Array(16e3);
3025
+ const warmupAudio = new Float32Array(16e3);
3026
+ const warmupIdentity = new Float32Array(this.numIdentityClasses);
3027
+ warmupIdentity[0] = 1;
3028
+ const warmupFeeds = {
3029
+ "audio": new this.ort.Tensor("float32", warmupAudio, [1, 16e3]),
3030
+ "identity": new this.ort.Tensor("float32", warmupIdentity, [1, this.numIdentityClasses])
3031
+ };
3010
3032
  const WARMUP_TIMEOUT_MS = 15e3;
3011
3033
  const warmupResult = await Promise.race([
3012
- this.infer(silentAudio, 0).then(() => "ok"),
3034
+ this.session.run(warmupFeeds).then(() => "ok"),
3013
3035
  new Promise((r) => setTimeout(() => r("timeout"), WARMUP_TIMEOUT_MS))
3014
3036
  ]);
3015
3037
  const warmupTimeMs = performance.now() - warmupStart;
@@ -3115,14 +3137,18 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
3115
3137
  });
3116
3138
  try {
3117
3139
  const startTime = performance.now();
3140
+ let timeoutId;
3118
3141
  const results = await Promise.race([
3119
- this.session.run(feeds),
3120
- new Promise(
3121
- (_, rej) => setTimeout(
3142
+ this.session.run(feeds).then((r) => {
3143
+ clearTimeout(timeoutId);
3144
+ return r;
3145
+ }),
3146
+ new Promise((_, rej) => {
3147
+ timeoutId = setTimeout(
3122
3148
  () => rej(new Error(`Wav2Vec2 inference timed out after ${_Wav2Vec2Inference.INFERENCE_TIMEOUT_MS}ms`)),
3123
3149
  _Wav2Vec2Inference.INFERENCE_TIMEOUT_MS
3124
- )
3125
- )
3150
+ );
3151
+ })
3126
3152
  ]);
3127
3153
  const inferenceTimeMs = performance.now() - startTime;
3128
3154
  const asrOutput = results["asr_logits"];
@@ -3228,15 +3254,6 @@ var Wav2Vec2Inference = _Wav2Vec2Inference;
3228
3254
 
3229
3255
  // src/audio/FullFacePipeline.ts
3230
3256
  var logger3 = createLogger("FullFacePipeline");
3231
- function pcm16ToFloat322(buffer) {
3232
- const byteLen = buffer.byteLength & ~1;
3233
- const int16 = byteLen === buffer.byteLength ? new Int16Array(buffer) : new Int16Array(buffer, 0, byteLen / 2);
3234
- const float32 = new Float32Array(int16.length);
3235
- for (let i = 0; i < int16.length; i++) {
3236
- float32[i] = int16[i] / 32768;
3237
- }
3238
- return float32;
3239
- }
3240
3257
  var BLENDSHAPE_INDEX_MAP = /* @__PURE__ */ new Map();
3241
3258
  LAM_BLENDSHAPES.forEach((name, index) => {
3242
3259
  BLENDSHAPE_INDEX_MAP.set(name, index);
@@ -3386,7 +3403,7 @@ var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
3386
3403
  if (!combined) {
3387
3404
  return;
3388
3405
  }
3389
- const float32 = pcm16ToFloat322(combined);
3406
+ const float32 = pcm16ToFloat32(combined);
3390
3407
  const scheduleTime = await this.scheduler.schedule(float32);
3391
3408
  if (!this.playbackStarted) {
3392
3409
  this.playbackStarted = true;
@@ -3869,13 +3886,18 @@ function ctcGreedyDecode(logits, seqLen, vocabSize, tokenMap) {
3869
3886
 
3870
3887
  // src/inference/SenseVoiceInference.ts
3871
3888
  var logger4 = createLogger("SenseVoice");
3872
- var SenseVoiceInference = class {
3889
+ var _SenseVoiceInference = class _SenseVoiceInference {
3873
3890
  constructor(config) {
3874
3891
  this.session = null;
3875
3892
  this.ort = null;
3876
3893
  this._backend = "wasm";
3877
3894
  this.isLoading = false;
3878
3895
  this.inferenceQueue = Promise.resolve();
3896
+ // Session health: set to true if session.run() times out.
3897
+ // A timed-out session may have a zombie WASM dispatch still running,
3898
+ // so all future transcribe() calls reject immediately to prevent concurrent access.
3899
+ this.poisoned = false;
3900
+ // 10s for SenseVoice (heavier preprocessing)
3879
3901
  // Preprocessing state (loaded once)
3880
3902
  this.tokenMap = null;
3881
3903
  this.negMean = null;
@@ -4023,6 +4045,9 @@ var SenseVoiceInference = class {
4023
4045
  if (!this.session || !this.ort || !this.tokenMap) {
4024
4046
  throw new Error("Model not loaded. Call load() first.");
4025
4047
  }
4048
+ if (this.poisoned) {
4049
+ throw new Error("SenseVoice session timed out \u2014 inference unavailable until page reload");
4050
+ }
4026
4051
  const audio = new Float32Array(audioSamples);
4027
4052
  return this.queueInference(audio);
4028
4053
  }
@@ -4060,7 +4085,19 @@ var SenseVoiceInference = class {
4060
4085
  language: new ort.Tensor("int32", new Int32Array([this.languageId]), [1]),
4061
4086
  text_norm: new ort.Tensor("int32", new Int32Array([this.textNormId]), [1])
4062
4087
  };
4063
- const results = await this.session.run(feeds);
4088
+ let timeoutId;
4089
+ const results = await Promise.race([
4090
+ this.session.run(feeds).then((r) => {
4091
+ clearTimeout(timeoutId);
4092
+ return r;
4093
+ }),
4094
+ new Promise((_, rej) => {
4095
+ timeoutId = setTimeout(
4096
+ () => rej(new Error(`SenseVoice inference timed out after ${_SenseVoiceInference.INFERENCE_TIMEOUT_MS}ms`)),
4097
+ _SenseVoiceInference.INFERENCE_TIMEOUT_MS
4098
+ );
4099
+ })
4100
+ ]);
4064
4101
  const logitsOutput = results["logits"];
4065
4102
  if (!logitsOutput) {
4066
4103
  throw new Error('Model output missing "logits" tensor');
@@ -4106,6 +4143,32 @@ var SenseVoiceInference = class {
4106
4143
  preprocessTimeMs
4107
4144
  });
4108
4145
  } catch (err) {
4146
+ const errMsg = err instanceof Error ? err.message : String(err);
4147
+ if (errMsg.includes("timed out")) {
4148
+ this.poisoned = true;
4149
+ logger4.error("CRITICAL: Inference session timed out \u2014 SenseVoice is dead. Page reload required.", {
4150
+ backend: this._backend,
4151
+ timeoutMs: _SenseVoiceInference.INFERENCE_TIMEOUT_MS
4152
+ });
4153
+ } else if (typeof err === "number") {
4154
+ const oomError = new Error(
4155
+ `SenseVoice inference failed with raw C++ exception pointer (0x${err.toString(16)}). This is likely an OOM crash in WASM. Try reloading the page.`
4156
+ );
4157
+ logger4.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
4158
+ pointer: `0x${err.toString(16)}`,
4159
+ backend: this._backend
4160
+ });
4161
+ span?.endWithError(oomError);
4162
+ telemetry?.incrementCounter("omote.inference.total", 1, {
4163
+ model: "sensevoice",
4164
+ backend: this._backend,
4165
+ status: "error"
4166
+ });
4167
+ reject(oomError);
4168
+ return;
4169
+ } else {
4170
+ logger4.error("Inference failed", { error: errMsg, backend: this._backend });
4171
+ }
4109
4172
  span?.endWithError(err instanceof Error ? err : new Error(String(err)));
4110
4173
  telemetry?.incrementCounter("omote.inference.total", 1, {
4111
4174
  model: "sensevoice",
@@ -4129,241 +4192,3082 @@ var SenseVoiceInference = class {
4129
4192
  this.invStddev = null;
4130
4193
  }
4131
4194
  };
4195
+ _SenseVoiceInference.INFERENCE_TIMEOUT_MS = 1e4;
4196
+ var SenseVoiceInference = _SenseVoiceInference;
4132
4197
 
4133
- // src/inference/Wav2ArkitCpuInference.ts
4134
- var logger5 = createLogger("Wav2ArkitCpu");
4135
- var Wav2ArkitCpuInference = class {
4136
- constructor(config) {
4137
- this.modelId = "wav2arkit_cpu";
4138
- this.session = null;
4139
- this.ort = null;
4140
- this._backend = "wasm";
4141
- this.isLoading = false;
4142
- // Inference queue for handling concurrent calls
4143
- this.inferenceQueue = Promise.resolve();
4144
- this.config = config;
4145
- }
4146
- get backend() {
4147
- return this.session ? this._backend : null;
4148
- }
4149
- get isLoaded() {
4150
- return this.session !== null;
4198
+ // src/inference/SenseVoiceWorker.ts
4199
+ var logger5 = createLogger("SenseVoiceWorker");
4200
+ var WASM_CDN_PATH2 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
4201
+ var LOAD_TIMEOUT_MS = 3e4;
4202
+ var INFERENCE_TIMEOUT_MS = 1e4;
4203
+ function resolveUrl(url) {
4204
+ if (/^https?:\/\//i.test(url) || /^blob:/i.test(url)) return url;
4205
+ try {
4206
+ return new URL(url, globalThis.location?.origin ?? "https://localhost").href;
4207
+ } catch {
4208
+ return url;
4151
4209
  }
4152
- /**
4153
- * Load the ONNX model
4154
- */
4155
- async load() {
4156
- if (this.isLoading) {
4157
- throw new Error("Model is already loading");
4210
+ }
4211
+ var WORKER_SCRIPT = `
4212
+ // SenseVoice ASR Worker Script
4213
+ // Loaded via Blob URL - no separate file needed
4214
+
4215
+ var ort = null;
4216
+ var session = null;
4217
+ var tokenMap = null;
4218
+ var negMean = null;
4219
+ var invStddev = null;
4220
+ var languageId = 0;
4221
+ var textNormId = 14;
4222
+ var vocabSize = 0;
4223
+
4224
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
4225
+ // kaldiFbank.ts \u2014 inlined as plain JavaScript
4226
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
4227
+
4228
+ /**
4229
+ * In-place Radix-2 Cooley-Tukey FFT
4230
+ */
4231
+ function fft(re, im) {
4232
+ var n = re.length;
4233
+
4234
+ // Bit-reversal permutation
4235
+ for (var i = 1, j = 0; i < n; i++) {
4236
+ var bit = n >> 1;
4237
+ while (j & bit) {
4238
+ j ^= bit;
4239
+ bit >>= 1;
4158
4240
  }
4159
- if (this.session) {
4160
- throw new Error("Model already loaded. Call dispose() first.");
4241
+ j ^= bit;
4242
+ if (i < j) {
4243
+ var tmp = re[i]; re[i] = re[j]; re[j] = tmp;
4244
+ tmp = im[i]; im[i] = im[j]; im[j] = tmp;
4161
4245
  }
4162
- this.isLoading = true;
4163
- const startTime = performance.now();
4164
- const telemetry = getTelemetry();
4165
- const span = telemetry?.startSpan("Wav2ArkitCpu.load", {
4166
- "model.url": this.config.modelUrl,
4167
- "model.backend_requested": this.config.backend || "wasm"
4168
- });
4169
- try {
4170
- const preference = this.config.backend || "wasm";
4171
- logger5.info("Loading ONNX Runtime...", { preference });
4172
- const { ort, backend } = await getOnnxRuntimeForPreference(preference);
4173
- this.ort = ort;
4174
- this._backend = backend;
4175
- logger5.info("ONNX Runtime loaded", { backend: this._backend });
4176
- const modelUrl = this.config.modelUrl;
4177
- const dataUrl = this.config.externalDataUrl !== false ? this.config.externalDataUrl || `${modelUrl}.data` : null;
4178
- const sessionOptions = getSessionOptions(this._backend);
4179
- if (isIOS()) {
4180
- logger5.info("iOS: passing model URLs directly to ORT (low-memory path)", {
4181
- modelUrl,
4182
- dataUrl
4183
- });
4184
- if (dataUrl) {
4185
- const dataFilename = dataUrl.split("/").pop();
4186
- sessionOptions.externalData = [{
4187
- path: dataFilename,
4188
- data: dataUrl
4189
- // URL string — ORT fetches directly into WASM
4190
- }];
4191
- }
4192
- this.session = await this.ort.InferenceSession.create(modelUrl, sessionOptions);
4193
- } else {
4194
- const cache = getModelCache();
4195
- const isCached = await cache.has(modelUrl);
4196
- let modelBuffer;
4197
- if (isCached) {
4198
- logger5.debug("Loading model from cache", { modelUrl });
4199
- modelBuffer = await cache.get(modelUrl);
4200
- if (!modelBuffer) {
4201
- logger5.warn("Cache corruption detected, clearing and retrying", { modelUrl });
4202
- await cache.delete(modelUrl);
4203
- modelBuffer = await fetchWithCache(modelUrl);
4204
- }
4205
- } else {
4206
- logger5.debug("Fetching and caching model graph", { modelUrl });
4207
- modelBuffer = await fetchWithCache(modelUrl);
4208
- }
4209
- if (!modelBuffer) {
4210
- throw new Error(`Failed to load model: ${modelUrl}`);
4211
- }
4212
- let externalDataBuffer = null;
4213
- if (dataUrl) {
4214
- try {
4215
- const isDataCached = await cache.has(dataUrl);
4216
- if (isDataCached) {
4217
- logger5.debug("Loading external data from cache", { dataUrl });
4218
- externalDataBuffer = await cache.get(dataUrl);
4219
- if (!externalDataBuffer) {
4220
- logger5.warn("Cache corruption for external data, retrying", { dataUrl });
4221
- await cache.delete(dataUrl);
4222
- externalDataBuffer = await fetchWithCache(dataUrl);
4223
- }
4224
- } else {
4225
- logger5.info("Fetching external model data", {
4226
- dataUrl,
4227
- note: "This may be a large download (400MB+)"
4228
- });
4229
- externalDataBuffer = await fetchWithCache(dataUrl);
4230
- }
4231
- logger5.info("External data loaded", {
4232
- size: formatBytes(externalDataBuffer.byteLength)
4233
- });
4234
- } catch (err) {
4235
- logger5.debug("No external data file found (single-file model)", {
4236
- dataUrl,
4237
- error: err.message
4238
- });
4239
- }
4240
- }
4241
- logger5.debug("Creating ONNX session", {
4242
- graphSize: formatBytes(modelBuffer.byteLength),
4243
- externalDataSize: externalDataBuffer ? formatBytes(externalDataBuffer.byteLength) : "none",
4244
- backend: this._backend
4245
- });
4246
- if (externalDataBuffer) {
4247
- const dataFilename = dataUrl.split("/").pop();
4248
- sessionOptions.externalData = [{
4249
- path: dataFilename,
4250
- data: new Uint8Array(externalDataBuffer)
4251
- }];
4252
- }
4253
- const modelData = new Uint8Array(modelBuffer);
4254
- this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
4246
+ }
4247
+
4248
+ // Butterfly passes
4249
+ for (var len = 2; len <= n; len *= 2) {
4250
+ var halfLen = len / 2;
4251
+ var angle = -2 * Math.PI / len;
4252
+ var wRe = Math.cos(angle);
4253
+ var wIm = Math.sin(angle);
4254
+
4255
+ for (var i = 0; i < n; i += len) {
4256
+ var curRe = 1;
4257
+ var curIm = 0;
4258
+ for (var j = 0; j < halfLen; j++) {
4259
+ var a = i + j;
4260
+ var b = a + halfLen;
4261
+ var tRe = curRe * re[b] - curIm * im[b];
4262
+ var tIm = curRe * im[b] + curIm * re[b];
4263
+ re[b] = re[a] - tRe;
4264
+ im[b] = im[a] - tIm;
4265
+ re[a] += tRe;
4266
+ im[a] += tIm;
4267
+ var nextRe = curRe * wRe - curIm * wIm;
4268
+ curIm = curRe * wIm + curIm * wRe;
4269
+ curRe = nextRe;
4255
4270
  }
4256
- const loadTimeMs = performance.now() - startTime;
4257
- logger5.info("Model loaded successfully", {
4258
- backend: this._backend,
4259
- loadTimeMs: Math.round(loadTimeMs),
4260
- inputs: this.session.inputNames,
4261
- outputs: this.session.outputNames
4262
- });
4263
- span?.setAttributes({
4264
- "model.backend": this._backend,
4265
- "model.load_time_ms": loadTimeMs,
4266
- "model.cached": !isIOS()
4267
- });
4268
- span?.end();
4269
- telemetry?.recordHistogram("omote.model.load_time", loadTimeMs, {
4270
- model: "wav2arkit_cpu",
4271
- backend: this._backend
4272
- });
4273
- logger5.debug("Running warmup inference");
4274
- const warmupStart = performance.now();
4275
- const silentAudio = new Float32Array(16e3);
4276
- await this.infer(silentAudio);
4277
- const warmupTimeMs = performance.now() - warmupStart;
4278
- logger5.info("Warmup inference complete", {
4279
- warmupTimeMs: Math.round(warmupTimeMs),
4280
- backend: this._backend
4281
- });
4282
- telemetry?.recordHistogram("omote.model.warmup_time", warmupTimeMs, {
4283
- model: "wav2arkit_cpu",
4284
- backend: this._backend
4285
- });
4286
- return {
4287
- backend: this._backend,
4288
- loadTimeMs,
4289
- inputNames: [...this.session.inputNames],
4290
- outputNames: [...this.session.outputNames]
4291
- };
4292
- } catch (error) {
4293
- span?.endWithError(error instanceof Error ? error : new Error(String(error)));
4294
- telemetry?.incrementCounter("omote.errors.total", 1, {
4295
- model: "wav2arkit_cpu",
4296
- error_type: "load_failed"
4297
- });
4298
- throw error;
4299
- } finally {
4300
- this.isLoading = false;
4301
4271
  }
4302
4272
  }
4303
- /**
4304
- * Run inference on raw audio
4305
- *
4306
- * Accepts variable-length audio (not fixed to 16000 samples).
4307
- * Output frames = ceil(30 * numSamples / 16000).
4308
- *
4309
- * @param audioSamples - Float32Array of raw audio at 16kHz
4310
- * @param _identityIndex - Ignored (identity 11 is baked into the model)
4311
- */
4312
- async infer(audioSamples, _identityIndex) {
4313
- if (!this.session) {
4314
- throw new Error("Model not loaded. Call load() first.");
4273
+ }
4274
+
4275
+ /** HTK mel scale */
4276
+ function htkMel(freq) {
4277
+ return 1127.0 * Math.log(1.0 + freq / 700.0);
4278
+ }
4279
+
4280
+ function htkMelInverse(mel) {
4281
+ return 700.0 * (Math.exp(mel / 1127.0) - 1.0);
4282
+ }
4283
+
4284
+ /**
4285
+ * Build triangular mel filterbank matrix
4286
+ */
4287
+ function buildMelFilterbank(numBins, fftSize, sampleRate, lowFreq, highFreq) {
4288
+ var numFftBins = fftSize / 2 + 1;
4289
+ var lowMel = htkMel(lowFreq);
4290
+ var highMel = htkMel(highFreq);
4291
+
4292
+ // numBins + 2 equally spaced points in mel space
4293
+ var melPoints = new Float64Array(numBins + 2);
4294
+ for (var i = 0; i < numBins + 2; i++) {
4295
+ melPoints[i] = lowMel + (highMel - lowMel) * i / (numBins + 1);
4296
+ }
4297
+
4298
+ // Convert mel points to FFT bin indices (float, not rounded)
4299
+ var binFreqs = new Float64Array(numBins + 2);
4300
+ for (var i = 0; i < numBins + 2; i++) {
4301
+ binFreqs[i] = htkMelInverse(melPoints[i]) * fftSize / sampleRate;
4302
+ }
4303
+
4304
+ var filters = [];
4305
+
4306
+ for (var m = 0; m < numBins; m++) {
4307
+ var left = binFreqs[m];
4308
+ var center = binFreqs[m + 1];
4309
+ var right = binFreqs[m + 2];
4310
+
4311
+ var startBin = Math.max(0, Math.ceil(left));
4312
+ var endBin = Math.min(numFftBins - 1, Math.floor(right));
4313
+
4314
+ var weights = new Float32Array(endBin - startBin + 1);
4315
+ for (var k = startBin; k <= endBin; k++) {
4316
+ if (k <= center) {
4317
+ weights[k - startBin] = (center - left) > 0 ? (k - left) / (center - left) : 0;
4318
+ } else {
4319
+ weights[k - startBin] = (right - center) > 0 ? (right - k) / (right - center) : 0;
4320
+ }
4315
4321
  }
4316
- const audioCopy = new Float32Array(audioSamples);
4317
- const feeds = {
4318
- "audio_waveform": new this.ort.Tensor("float32", audioCopy, [1, audioCopy.length])
4319
- };
4320
- return this.queueInference(feeds, audioCopy.length);
4322
+
4323
+ filters.push({ startBin: startBin, weights: weights });
4324
+ }
4325
+
4326
+ return filters;
4327
+ }
4328
+
4329
+ /** Create Hamming window */
4330
+ function createHammingWindow(length) {
4331
+ var w = new Float32Array(length);
4332
+ for (var i = 0; i < length; i++) {
4333
+ w[i] = 0.54 - 0.46 * Math.cos(2 * Math.PI * i / (length - 1));
4334
+ }
4335
+ return w;
4336
+ }
4337
+
4338
+ /**
4339
+ * Compute Kaldi-compatible log mel filterbank features
4340
+ */
4341
+ function computeKaldiFbank(audio, sampleRate, numMelBins, opts) {
4342
+ var frameLengthMs = (opts && opts.frameLengthMs !== undefined) ? opts.frameLengthMs : 25;
4343
+ var frameShiftMs = (opts && opts.frameShiftMs !== undefined) ? opts.frameShiftMs : 10;
4344
+ var lowFreq = (opts && opts.lowFreq !== undefined) ? opts.lowFreq : 20;
4345
+ var highFreq = (opts && opts.highFreq !== undefined) ? opts.highFreq : (sampleRate / 2);
4346
+ var dither = (opts && opts.dither !== undefined) ? opts.dither : 0;
4347
+ var preemphasis = (opts && opts.preemphasis !== undefined) ? opts.preemphasis : 0.97;
4348
+
4349
+ var frameLengthSamples = Math.round(sampleRate * frameLengthMs / 1000);
4350
+ var frameShiftSamples = Math.round(sampleRate * frameShiftMs / 1000);
4351
+
4352
+ // Kaldi signal scaling: float [-1,1] -> int16 range
4353
+ var scaled = new Float32Array(audio.length);
4354
+ for (var i = 0; i < audio.length; i++) {
4355
+ scaled[i] = audio[i] * 32768;
4356
+ }
4357
+
4358
+ // Optional dithering
4359
+ if (dither > 0) {
4360
+ for (var i = 0; i < scaled.length; i++) {
4361
+ var u1 = Math.random();
4362
+ var u2 = Math.random();
4363
+ scaled[i] += dither * Math.sqrt(-2 * Math.log(u1 + 1e-10)) * Math.cos(2 * Math.PI * u2);
4364
+ }
4365
+ }
4366
+
4367
+ // Number of frames (snip_edges=true: only complete frames)
4368
+ var numFrames = Math.max(0, Math.floor((scaled.length - frameLengthSamples) / frameShiftSamples) + 1);
4369
+ if (numFrames === 0) {
4370
+ return new Float32Array(0);
4371
+ }
4372
+
4373
+ // FFT size: next power of 2
4374
+ var fftSize = 1;
4375
+ while (fftSize < frameLengthSamples) fftSize *= 2;
4376
+
4377
+ var numFftBins = fftSize / 2 + 1;
4378
+
4379
+ // Pre-compute window and filterbank
4380
+ var window = createHammingWindow(frameLengthSamples);
4381
+ var filters = buildMelFilterbank(numMelBins, fftSize, sampleRate, lowFreq, highFreq);
4382
+
4383
+ // Allocate output
4384
+ var output = new Float32Array(numFrames * numMelBins);
4385
+
4386
+ // FFT buffers (reused per frame)
4387
+ var fftRe = new Float64Array(fftSize);
4388
+ var fftIm = new Float64Array(fftSize);
4389
+
4390
+ for (var f = 0; f < numFrames; f++) {
4391
+ var offset = f * frameShiftSamples;
4392
+
4393
+ // Clear FFT buffers
4394
+ fftRe.fill(0);
4395
+ fftIm.fill(0);
4396
+
4397
+ // Extract frame with preemphasis and windowing
4398
+ for (var i = 0; i < frameLengthSamples; i++) {
4399
+ var sample = scaled[offset + i];
4400
+ // Preemphasis: y[n] = x[n] - coeff * x[n-1]
4401
+ if (preemphasis > 0 && i > 0) {
4402
+ sample -= preemphasis * scaled[offset + i - 1];
4403
+ } else if (preemphasis > 0 && i === 0 && offset > 0) {
4404
+ sample -= preemphasis * scaled[offset - 1];
4405
+ }
4406
+ // Apply window
4407
+ fftRe[i] = sample * window[i];
4408
+ }
4409
+
4410
+ // FFT
4411
+ fft(fftRe, fftIm);
4412
+
4413
+ // Power spectrum -> mel filterbank -> log
4414
+ var outOffset = f * numMelBins;
4415
+ for (var m = 0; m < numMelBins; m++) {
4416
+ var filter = filters[m];
4417
+ var energy = 0;
4418
+ for (var k = 0; k < filter.weights.length; k++) {
4419
+ var bin = filter.startBin + k;
4420
+ if (bin < numFftBins) {
4421
+ var powerSpec = fftRe[bin] * fftRe[bin] + fftIm[bin] * fftIm[bin];
4422
+ energy += filter.weights[k] * powerSpec;
4423
+ }
4424
+ }
4425
+ output[outOffset + m] = Math.log(Math.max(energy, 1e-10));
4426
+ }
4427
+ }
4428
+
4429
+ return output;
4430
+ }
4431
+
4432
+ /**
4433
+ * Apply Low Frame Rate stacking for SenseVoice
4434
+ */
4435
+ function applyLFR(features, featureDim, lfrM, lfrN) {
4436
+ var numFrames = features.length / featureDim;
4437
+ if (numFrames === 0) return new Float32Array(0);
4438
+
4439
+ var leftPad = Math.floor((lfrM - 1) / 2); // 3 for lfrM=7
4440
+ var paddedLen = numFrames + leftPad;
4441
+ var numOutputFrames = Math.ceil(paddedLen / lfrN);
4442
+ var outputDim = featureDim * lfrM;
4443
+
4444
+ var output = new Float32Array(numOutputFrames * outputDim);
4445
+
4446
+ for (var i = 0; i < numOutputFrames; i++) {
4447
+ var startFrame = i * lfrN - leftPad;
4448
+
4449
+ for (var j = 0; j < lfrM; j++) {
4450
+ var srcFrame = startFrame + j;
4451
+ // Clamp to valid range
4452
+ if (srcFrame < 0) srcFrame = 0;
4453
+ if (srcFrame >= numFrames) srcFrame = numFrames - 1;
4454
+
4455
+ var srcOffset = srcFrame * featureDim;
4456
+ var dstOffset = i * outputDim + j * featureDim;
4457
+ for (var k = 0; k < featureDim; k++) {
4458
+ output[dstOffset + k] = features[srcOffset + k];
4459
+ }
4460
+ }
4461
+ }
4462
+
4463
+ return output;
4464
+ }
4465
+
4466
+ /**
4467
+ * Apply CMVN normalization in-place
4468
+ */
4469
+ function applyCMVN(features, dim, negMeanVec, invStddevVec) {
4470
+ for (var i = 0; i < features.length; i++) {
4471
+ var d = i % dim;
4472
+ features[i] = (features[i] + negMeanVec[d]) * invStddevVec[d];
4473
+ }
4474
+ return features;
4475
+ }
4476
+
4477
+ /**
4478
+ * Parse CMVN vectors from comma-separated strings (stored in ONNX metadata)
4479
+ */
4480
+ function parseCMVNFromMetadata(negMeanStr, invStddevStr) {
4481
+ var negMeanArr = new Float32Array(
4482
+ negMeanStr.split(',').map(function(s) { return parseFloat(s.trim()); })
4483
+ );
4484
+ var invStddevArr = new Float32Array(
4485
+ invStddevStr.split(',').map(function(s) { return parseFloat(s.trim()); })
4486
+ );
4487
+ return { negMean: negMeanArr, invStddev: invStddevArr };
4488
+ }
4489
+
4490
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
4491
+ // ctcDecoder.ts \u2014 inlined as plain JavaScript
4492
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
4493
+
4494
+ /** SenseVoice language ID -> string mapping */
4495
+ var LANGUAGE_IDS = {
4496
+ 0: 'auto',
4497
+ 3: 'zh',
4498
+ 4: 'en',
4499
+ 7: 'yue',
4500
+ 11: 'ja',
4501
+ 12: 'ko',
4502
+ 13: 'nospeech'
4503
+ };
4504
+
4505
+ /** SenseVoice text normalization ID -> string mapping */
4506
+ var TEXT_NORM_IDS = {
4507
+ 14: 'with_itn',
4508
+ 15: 'without_itn'
4509
+ };
4510
+
4511
+ /** Resolve language string to SenseVoice language ID */
4512
+ function resolveLanguageId(language) {
4513
+ var map = {
4514
+ auto: 0,
4515
+ zh: 3,
4516
+ en: 4,
4517
+ yue: 7,
4518
+ ja: 11,
4519
+ ko: 12
4520
+ };
4521
+ return map[language] !== undefined ? map[language] : 0;
4522
+ }
4523
+
4524
+ /** Resolve text norm string to SenseVoice text norm ID */
4525
+ function resolveTextNormId(textNorm) {
4526
+ return textNorm === 'without_itn' ? 15 : 14;
4527
+ }
4528
+
4529
+ /**
4530
+ * Parse tokens.txt into a token ID -> string map
4531
+ */
4532
+ function parseTokensFile(content) {
4533
+ var map = new Map();
4534
+ var lines = content.split('\\n');
4535
+ for (var idx = 0; idx < lines.length; idx++) {
4536
+ var trimmed = lines[idx].trim();
4537
+ if (!trimmed) continue;
4538
+ // Find the last space - token string may contain spaces
4539
+ var lastSpace = trimmed.lastIndexOf(' ');
4540
+ if (lastSpace === -1) continue;
4541
+ var token = trimmed.substring(0, lastSpace);
4542
+ var id = parseInt(trimmed.substring(lastSpace + 1), 10);
4543
+ if (!isNaN(id)) {
4544
+ map.set(id, token);
4545
+ }
4546
+ }
4547
+ return map;
4548
+ }
4549
+
4550
+ /**
4551
+ * SenseVoice structured token pattern matching
4552
+ */
4553
+ function parseStructuredToken(token) {
4554
+ var match = token.match(/^<\\|(.+)\\|>$/);
4555
+ if (!match) return null;
4556
+
4557
+ var value = match[1];
4558
+
4559
+ // Language tokens
4560
+ if (value === 'zh' || value === 'en' || value === 'ja' || value === 'ko' || value === 'yue' || value === 'nospeech') {
4561
+ return { type: 'language', value: value };
4562
+ }
4563
+
4564
+ // Emotion tokens
4565
+ var emotions = ['HAPPY', 'SAD', 'ANGRY', 'NEUTRAL', 'FEARFUL', 'DISGUSTED', 'SURPRISED', 'EMO_UNKNOWN'];
4566
+ if (emotions.indexOf(value) !== -1) {
4567
+ return { type: 'emotion', value: value };
4568
+ }
4569
+
4570
+ // Audio event tokens
4571
+ var events = ['Speech', 'BGM', 'Applause', 'Laughter', 'Crying', 'Coughing', 'Sneezing', 'EVENT_UNKNOWN'];
4572
+ if (events.indexOf(value) !== -1) {
4573
+ return { type: 'event', value: value };
4574
+ }
4575
+
4576
+ // ITN tokens
4577
+ if (value === 'withitn' || value === 'woitn' || value === 'with_itn' || value === 'without_itn') {
4578
+ return { type: 'textnorm', value: value };
4579
+ }
4580
+
4581
+ return null;
4582
+ }
4583
+
4584
+ /**
4585
+ * CTC greedy decode
4586
+ */
4587
+ function ctcGreedyDecode(logits, seqLen, vocabSz, tokenMapLocal) {
4588
+ // Step 1: Argmax per time step
4589
+ var tokenIds = [];
4590
+ for (var t = 0; t < seqLen; t++) {
4591
+ var offset = t * vocabSz;
4592
+ var maxIdx = 0;
4593
+ var maxVal = logits[offset];
4594
+ for (var v = 1; v < vocabSz; v++) {
4595
+ if (logits[offset + v] > maxVal) {
4596
+ maxVal = logits[offset + v];
4597
+ maxIdx = v;
4598
+ }
4599
+ }
4600
+ tokenIds.push(maxIdx);
4601
+ }
4602
+
4603
+ // Step 2: Collapse consecutive duplicates
4604
+ var collapsed = [];
4605
+ var prev = -1;
4606
+ for (var idx = 0; idx < tokenIds.length; idx++) {
4607
+ var id = tokenIds[idx];
4608
+ if (id !== prev) {
4609
+ collapsed.push(id);
4610
+ prev = id;
4611
+ }
4612
+ }
4613
+
4614
+ // Step 3: Remove blank tokens (ID 0) and special tokens (<s>=1, </s>=2)
4615
+ var filtered = collapsed.filter(function(id) { return id !== 0 && id !== 1 && id !== 2; });
4616
+
4617
+ // Step 4: Convert to token strings and parse structured tokens
4618
+ var language = undefined;
4619
+ var emotion = undefined;
4620
+ var event = undefined;
4621
+ var textTokens = [];
4622
+
4623
+ for (var idx = 0; idx < filtered.length; idx++) {
4624
+ var id = filtered[idx];
4625
+ var token = tokenMapLocal.get(id);
4626
+ if (!token) continue;
4627
+
4628
+ var structured = parseStructuredToken(token);
4629
+ if (structured) {
4630
+ if (structured.type === 'language') language = structured.value;
4631
+ else if (structured.type === 'emotion') emotion = structured.value;
4632
+ else if (structured.type === 'event') event = structured.value;
4633
+ // Skip textnorm tokens
4634
+ } else {
4635
+ textTokens.push(token);
4636
+ }
4637
+ }
4638
+
4639
+ // Step 5: Join tokens, handle SentencePiece boundary marker
4640
+ var text = textTokens.join('');
4641
+ // Replace SentencePiece word boundary (U+2581) with space
4642
+ text = text.replace(/\\u2581/g, ' ').trim();
4643
+
4644
+ return { text: text, language: language, emotion: emotion, event: event };
4645
+ }
4646
+
4647
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
4648
+ // Worker globals and message handler
4649
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
4650
+
4651
+ /**
4652
+ * Load ONNX Runtime from CDN
4653
+ */
4654
+ async function loadOrt(wasmPaths) {
4655
+ if (ort) return;
4656
+
4657
+ // Import ONNX Runtime from CDN
4658
+ var ortUrl = wasmPaths + 'ort.wasm.min.js';
4659
+
4660
+ // Load the script by fetching and executing it
4661
+ var response = await fetch(ortUrl);
4662
+ var scriptText = await response.text();
4663
+
4664
+ // Create a blob URL for the script
4665
+ var blob = new Blob([scriptText], { type: 'application/javascript' });
4666
+ var blobUrl = URL.createObjectURL(blob);
4667
+
4668
+ // Import the module
4669
+ importScripts(blobUrl);
4670
+ URL.revokeObjectURL(blobUrl);
4671
+
4672
+ // ort is now available as global
4673
+ ort = self.ort;
4674
+
4675
+ // Configure WASM settings
4676
+ ort.env.wasm.wasmPaths = wasmPaths;
4677
+ ort.env.wasm.numThreads = 1; // Single thread in worker
4678
+ ort.env.wasm.simd = true;
4679
+ ort.env.wasm.proxy = false; // No proxy in worker
4680
+ }
4681
+
4682
+ /**
4683
+ * Load the SenseVoice model and tokens
4684
+ */
4685
+ async function loadModel(modelUrl, tokensUrl, isIOSDevice, lang, textNorm) {
4686
+ // 1. Fetch and parse tokens.txt
4687
+ var tokensResponse = await fetch(tokensUrl);
4688
+ if (!tokensResponse.ok) {
4689
+ throw new Error('Failed to fetch tokens.txt: ' + tokensResponse.status + ' ' + tokensResponse.statusText);
4690
+ }
4691
+ var tokensText = await tokensResponse.text();
4692
+ tokenMap = parseTokensFile(tokensText);
4693
+
4694
+ // 2. Store language/textNorm IDs
4695
+ languageId = lang;
4696
+ textNormId = textNorm;
4697
+
4698
+ // 3. Create inference session
4699
+ var sessionOptions = {
4700
+ executionProviders: ['wasm'],
4701
+ graphOptimizationLevel: 'all',
4702
+ };
4703
+
4704
+ if (isIOSDevice) {
4705
+ // iOS: pass URL string directly to ORT to avoid 239MB JS heap allocation
4706
+ // ORT fetches into WASM memory, keeping JS heap at ~2MB
4707
+ session = await ort.InferenceSession.create(modelUrl, sessionOptions);
4708
+ } else {
4709
+ // Desktop: fetch ArrayBuffer for potential caching
4710
+ var modelResponse = await fetch(modelUrl);
4711
+ if (!modelResponse.ok) {
4712
+ throw new Error('Failed to fetch model: ' + modelResponse.status + ' ' + modelResponse.statusText);
4713
+ }
4714
+ var modelBuffer = await modelResponse.arrayBuffer();
4715
+ var modelData = new Uint8Array(modelBuffer);
4716
+ session = await ort.InferenceSession.create(modelData, sessionOptions);
4717
+ }
4718
+
4719
+ // 4. Try to read CMVN from model metadata
4720
+ try {
4721
+ var metadata = session.handler && session.handler.metadata;
4722
+ if (metadata && metadata.neg_mean && metadata.inv_stddev) {
4723
+ var cmvn = parseCMVNFromMetadata(metadata.neg_mean, metadata.inv_stddev);
4724
+ negMean = cmvn.negMean;
4725
+ invStddev = cmvn.invStddev;
4726
+ }
4727
+ } catch (cmvnErr) {
4728
+ // CMVN not available \u2014 features will not be normalized
4729
+ }
4730
+
4731
+ // 5. Determine vocab size from tokenMap
4732
+ vocabSize = 0;
4733
+ tokenMap.forEach(function(val, key) {
4734
+ if (key >= vocabSize) vocabSize = key + 1;
4735
+ });
4736
+
4737
+ return {
4738
+ vocabSize: vocabSize,
4739
+ inputNames: session.inputNames.slice(),
4740
+ outputNames: session.outputNames.slice(),
4741
+ };
4742
+ }
4743
+
4744
+ /**
4745
+ * Run transcription on audio samples
4746
+ */
4747
+ async function runTranscription(audio) {
4748
+ var preprocessStart = performance.now();
4749
+
4750
+ // 1. Compute Kaldi fbank features [T, 80]
4751
+ var fbank = computeKaldiFbank(audio, 16000, 80);
4752
+ var numFrames = fbank.length / 80;
4753
+
4754
+ if (numFrames === 0) {
4755
+ return {
4756
+ text: '',
4757
+ language: undefined,
4758
+ emotion: undefined,
4759
+ event: undefined,
4760
+ inferenceTimeMs: performance.now() - preprocessStart,
4761
+ preprocessTimeMs: performance.now() - preprocessStart,
4762
+ };
4763
+ }
4764
+
4765
+ // 2. Apply LFR stacking [T_reduced, 560]
4766
+ var lfrFeatures = applyLFR(fbank, 80, 7, 6);
4767
+ var numLfrFrames = lfrFeatures.length / 560;
4768
+
4769
+ // 3. Apply CMVN normalization (in-place)
4770
+ if (negMean && invStddev) {
4771
+ applyCMVN(lfrFeatures, 560, negMean, invStddev);
4772
+ }
4773
+
4774
+ var preprocessTimeMs = performance.now() - preprocessStart;
4775
+
4776
+ // 4. Build ORT tensors
4777
+ var feeds = {
4778
+ x: new ort.Tensor('float32', lfrFeatures, [1, numLfrFrames, 560]),
4779
+ x_length: new ort.Tensor('int32', new Int32Array([numLfrFrames]), [1]),
4780
+ language: new ort.Tensor('int32', new Int32Array([languageId]), [1]),
4781
+ text_norm: new ort.Tensor('int32', new Int32Array([textNormId]), [1]),
4782
+ };
4783
+
4784
+ // 5. Run inference
4785
+ var results = await session.run(feeds);
4786
+
4787
+ var logitsOutput = results['logits'];
4788
+ if (!logitsOutput) {
4789
+ throw new Error('Model output missing "logits" tensor');
4790
+ }
4791
+
4792
+ var logitsData = logitsOutput.data;
4793
+ var logitsDims = logitsOutput.dims;
4794
+ var seqLen = logitsDims[1];
4795
+ var modelVocabSize = logitsDims[2];
4796
+
4797
+ // 6. CTC decode
4798
+ var decoded = ctcGreedyDecode(logitsData, seqLen, modelVocabSize, tokenMap);
4799
+
4800
+ var totalTimeMs = performance.now() - preprocessStart;
4801
+
4802
+ return {
4803
+ text: decoded.text,
4804
+ language: decoded.language,
4805
+ emotion: decoded.emotion,
4806
+ event: decoded.event,
4807
+ inferenceTimeMs: totalTimeMs,
4808
+ preprocessTimeMs: preprocessTimeMs,
4809
+ };
4810
+ }
4811
+
4812
+ // Message handler
4813
+ self.onmessage = async function(e) {
4814
+ var msg = e.data;
4815
+
4816
+ try {
4817
+ switch (msg.type) {
4818
+ case 'load': {
4819
+ var startTime = performance.now();
4820
+ await loadOrt(msg.wasmPaths);
4821
+ var info = await loadModel(msg.modelUrl, msg.tokensUrl, msg.isIOS, msg.language, msg.textNorm);
4822
+ var loadTimeMs = performance.now() - startTime;
4823
+
4824
+ self.postMessage({
4825
+ type: 'loaded',
4826
+ vocabSize: info.vocabSize,
4827
+ inputNames: info.inputNames,
4828
+ outputNames: info.outputNames,
4829
+ loadTimeMs: loadTimeMs,
4830
+ });
4831
+ break;
4832
+ }
4833
+
4834
+ case 'transcribe': {
4835
+ var result = await runTranscription(msg.audio);
4836
+
4837
+ self.postMessage({
4838
+ type: 'result',
4839
+ text: result.text,
4840
+ language: result.language,
4841
+ emotion: result.emotion,
4842
+ event: result.event,
4843
+ inferenceTimeMs: result.inferenceTimeMs,
4844
+ preprocessTimeMs: result.preprocessTimeMs,
4845
+ });
4846
+ break;
4847
+ }
4848
+
4849
+ case 'dispose': {
4850
+ if (session) {
4851
+ await session.release();
4852
+ session = null;
4853
+ }
4854
+ ort = null;
4855
+ tokenMap = null;
4856
+ negMean = null;
4857
+ invStddev = null;
4858
+ self.postMessage({ type: 'disposed' });
4859
+ break;
4860
+ }
4861
+
4862
+ default:
4863
+ self.postMessage({
4864
+ type: 'error',
4865
+ error: 'Unknown message type: ' + msg.type,
4866
+ });
4867
+ }
4868
+ } catch (err) {
4869
+ var errorMsg = err.message || String(err);
4870
+ // Handle raw C++ exception pointers from ORT WASM
4871
+ if (typeof err === 'number') {
4872
+ errorMsg = 'Raw C++ exception pointer (0x' + err.toString(16) + '). Likely OOM in WASM.';
4873
+ }
4874
+ self.postMessage({
4875
+ type: 'error',
4876
+ error: errorMsg,
4877
+ });
4878
+ }
4879
+ };
4880
+
4881
+ // Error handler
4882
+ self.onerror = function(err) {
4883
+ self.postMessage({
4884
+ type: 'error',
4885
+ error: 'Worker error: ' + (err.message || String(err)),
4886
+ });
4887
+ };
4888
+ `;
4889
/**
 * Runs SenseVoice ASR inside a dedicated Web Worker (WASM backend only).
 *
 * The worker is created from an inline script blob (`WORKER_SCRIPT`); requests
 * are matched to responses by message `type` via the `pendingResolvers` map,
 * and transcriptions are serialized through `inferenceQueue` so only one is in
 * flight at a time. A worker that times out is marked `poisoned` and refuses
 * further inference until the page reloads.
 */
var SenseVoiceWorker = class {
  /**
   * @param config - { modelUrl, tokensUrl?, language?, textNorm? }.
   *   `tokensUrl` defaults to `tokens.txt` next to the model file;
   *   `language` defaults to "auto", `textNorm` to "with_itn".
   */
  constructor(config) {
    this.worker = null;
    this.isLoading = false;
    this._isLoaded = false;
    // Inference queue for serialization
    this.inferenceQueue = Promise.resolve();
    // Session health: set to true if worker operation times out
    this.poisoned = false;
    // Pending message handlers
    this.pendingResolvers = /* @__PURE__ */ new Map();
    // Default tokens.txt to sit alongside the model file.
    const modelDir = config.modelUrl.substring(0, config.modelUrl.lastIndexOf("/"));
    const tokensUrl = config.tokensUrl ?? `${modelDir}/tokens.txt`;
    this.config = {
      modelUrl: config.modelUrl,
      tokensUrl,
      language: config.language ?? "auto",
      textNorm: config.textNorm ?? "with_itn"
    };
    // Pre-resolve the numeric IDs the worker expects in the "load" message.
    this.languageId = resolveLanguageId(this.config.language);
    this.textNormId = resolveTextNormId(this.config.textNorm);
  }
  /** Whether load() has completed successfully. */
  get isLoaded() {
    return this._isLoaded;
  }
  /**
   * Backend type (always 'wasm' for Worker, WebGPU not supported in Workers)
   */
  get backend() {
    return this._isLoaded ? "wasm" : null;
  }
  /**
   * Create the worker from inline script
   */
  createWorker() {
    const blob = new Blob([WORKER_SCRIPT], { type: "application/javascript" });
    const blobUrl = URL.createObjectURL(blob);
    const worker = new Worker(blobUrl);
    // Safe to revoke immediately: the Worker constructor has already
    // dereferenced the blob URL.
    URL.revokeObjectURL(blobUrl);
    worker.onmessage = (event) => {
      this.handleWorkerMessage(event.data);
    };
    worker.onerror = (error) => {
      // Fatal worker-level error: fail every in-flight request.
      logger5.error("Worker error", { error: error.message });
      for (const [, resolver] of this.pendingResolvers) {
        resolver.reject(new Error(`Worker error: ${error.message}`));
      }
      this.pendingResolvers.clear();
    };
    return worker;
  }
  /**
   * Handle messages from worker
   */
  handleWorkerMessage(result) {
    // Responses are keyed by message type ("loaded", "result", "disposed",
    // "error"); requests are serialized, so one resolver per type suffices.
    // NOTE(review): the companion "error" resolver registered by sendMessage
    // is not deleted here on success; it is overwritten on the next call —
    // confirm an unsolicited late "error" message cannot hit a stale entry.
    const resolver = this.pendingResolvers.get(result.type);
    if (resolver) {
      this.pendingResolvers.delete(result.type);
      if (result.type === "error") {
        resolver.reject(new Error(result.error));
      } else {
        resolver.resolve(result);
      }
    }
  }
  /**
   * Send message to worker and wait for response
   */
  sendMessage(message, expectedType, timeoutMs) {
    return new Promise((resolve, reject) => {
      if (!this.worker) {
        reject(new Error("Worker not initialized"));
        return;
      }
      // A timeout poisons the whole session: the worker is presumed dead.
      const timeoutId = setTimeout(() => {
        this.pendingResolvers.delete(expectedType);
        this.poisoned = true;
        reject(new Error(`Worker operation timed out after ${timeoutMs}ms`));
      }, timeoutMs);
      this.pendingResolvers.set(expectedType, {
        resolve: (value) => {
          clearTimeout(timeoutId);
          resolve(value);
        },
        reject: (error) => {
          clearTimeout(timeoutId);
          reject(error);
        }
      });
      // Also route a worker-posted {type:"error"} reply to this request.
      this.pendingResolvers.set("error", {
        resolve: () => {
        },
        // Never called for errors
        reject: (error) => {
          clearTimeout(timeoutId);
          this.pendingResolvers.delete(expectedType);
          reject(error);
        }
      });
      this.worker.postMessage(message);
    });
  }
  /**
   * Load the ONNX model in the worker
   *
   * @param onProgress - Optional progress callback. Fires once at 100% when load completes
   * (the worker downloads and loads the model internally, so granular progress is not available).
   */
  async load(onProgress) {
    if (this.isLoading) {
      throw new Error("Model is already loading");
    }
    if (this._isLoaded) {
      throw new Error("Model already loaded. Call dispose() first.");
    }
    this.isLoading = true;
    const startTime = performance.now();
    const telemetry = getTelemetry();
    const span = telemetry?.startSpan("SenseVoiceWorker.load", {
      "model.url": this.config.modelUrl,
      "model.language": this.config.language
    });
    try {
      logger5.info("Creating SenseVoice worker...");
      this.worker = this.createWorker();
      logger5.info("Loading model in worker...", {
        modelUrl: this.config.modelUrl,
        tokensUrl: this.config.tokensUrl,
        language: this.config.language,
        textNorm: this.config.textNorm
      });
      const result = await this.sendMessage(
        {
          type: "load",
          // URLs must be absolute: the worker runs from a blob: origin.
          modelUrl: resolveUrl(this.config.modelUrl),
          tokensUrl: resolveUrl(this.config.tokensUrl),
          wasmPaths: WASM_CDN_PATH2,
          isIOS: isIOS(),
          language: this.languageId,
          textNorm: this.textNormId
        },
        "loaded",
        LOAD_TIMEOUT_MS
      );
      this._isLoaded = true;
      const loadTimeMs = performance.now() - startTime;
      onProgress?.(1, 1);
      logger5.info("SenseVoice worker loaded successfully", {
        backend: "wasm",
        loadTimeMs: Math.round(loadTimeMs),
        workerLoadTimeMs: Math.round(result.loadTimeMs),
        vocabSize: result.vocabSize,
        language: this.config.language,
        textNorm: this.config.textNorm
      });
      span?.setAttributes({
        "model.backend": "wasm",
        "model.load_time_ms": loadTimeMs,
        "model.worker_load_time_ms": result.loadTimeMs,
        "model.vocab_size": result.vocabSize
      });
      span?.end();
      telemetry?.recordHistogram("omote.model.load_time", loadTimeMs, {
        model: "sensevoice-worker",
        backend: "wasm"
      });
      return {
        backend: "wasm",
        loadTimeMs,
        inputNames: result.inputNames,
        outputNames: result.outputNames,
        vocabSize: result.vocabSize
      };
    } catch (error) {
      span?.endWithError(error instanceof Error ? error : new Error(String(error)));
      telemetry?.incrementCounter("omote.errors.total", 1, {
        model: "sensevoice-worker",
        error_type: "load_failed"
      });
      // A failed load leaves no usable worker behind.
      if (this.worker) {
        this.worker.terminate();
        this.worker = null;
      }
      throw error;
    } finally {
      this.isLoading = false;
    }
  }
  /**
   * Transcribe audio samples to text
   *
   * @param audioSamples Float32Array of audio samples at 16kHz, [-1, 1] range
   * @returns Transcription result with text, emotion, language, and event
   */
  async transcribe(audioSamples) {
    if (!this._isLoaded || !this.worker) {
      throw new Error("Worker not loaded. Call load() first.");
    }
    if (this.poisoned) {
      throw new Error("SenseVoice worker timed out \u2014 inference unavailable until page reload");
    }
    // Copy so the caller's buffer is never shared with the queued request.
    const audio = new Float32Array(audioSamples);
    return this.queueInference(audio);
  }
  /**
   * Queue inference to serialize worker calls
   */
  queueInference(audio) {
    return new Promise((resolve, reject) => {
      // Chain onto the tail of the queue; the catch below keeps the chain
      // alive after a failure so later requests still run.
      this.inferenceQueue = this.inferenceQueue.then(async () => {
        const telemetry = getTelemetry();
        const span = telemetry?.startSpan("SenseVoiceWorker.transcribe", {
          "inference.backend": "wasm",
          "inference.input_samples": audio.length
        });
        try {
          const startTime = performance.now();
          const result = await this.sendMessage(
            {
              type: "transcribe",
              audio
            },
            "result",
            INFERENCE_TIMEOUT_MS
          );
          const totalTimeMs = performance.now() - startTime;
          logger5.trace("Worker transcription complete", {
            text: result.text.substring(0, 50),
            language: result.language,
            emotion: result.emotion,
            event: result.event,
            preprocessTimeMs: Math.round(result.preprocessTimeMs * 100) / 100,
            inferenceTimeMs: Math.round(result.inferenceTimeMs * 100) / 100,
            roundTripMs: Math.round(totalTimeMs * 100) / 100
          });
          span?.setAttributes({
            "inference.duration_ms": totalTimeMs,
            "inference.worker_duration_ms": result.inferenceTimeMs,
            "inference.preprocess_ms": result.preprocessTimeMs,
            "inference.text_length": result.text.length
          });
          span?.end();
          telemetry?.recordHistogram("omote.inference.latency", totalTimeMs, {
            model: "sensevoice-worker",
            backend: "wasm"
          });
          telemetry?.incrementCounter("omote.inference.total", 1, {
            model: "sensevoice-worker",
            backend: "wasm",
            status: "success"
          });
          resolve({
            text: result.text,
            language: result.language,
            emotion: result.emotion,
            event: result.event,
            inferenceTimeMs: result.inferenceTimeMs,
            preprocessTimeMs: result.preprocessTimeMs
          });
        } catch (err) {
          const errMsg = err instanceof Error ? err.message : String(err);
          if (errMsg.includes("timed out")) {
            logger5.error("CRITICAL: Worker inference timed out \u2014 SenseVoice worker is dead. Page reload required.", {
              timeoutMs: INFERENCE_TIMEOUT_MS
            });
          } else {
            logger5.error("Worker inference failed", { error: errMsg });
          }
          span?.endWithError(err instanceof Error ? err : new Error(String(err)));
          telemetry?.incrementCounter("omote.inference.total", 1, {
            model: "sensevoice-worker",
            backend: "wasm",
            status: "error"
          });
          reject(err);
        }
      });
    });
  }
  /**
   * Dispose of the worker and free resources
   */
  async dispose() {
    if (this.worker) {
      try {
        await this.sendMessage({ type: "dispose" }, "disposed", INFERENCE_TIMEOUT_MS);
      } catch {
      }
      // Best-effort above: the worker is terminated regardless of whether
      // the dispose handshake succeeded.
      this.worker.terminate();
      this.worker = null;
    }
    this._isLoaded = false;
    this.poisoned = false;
    this.pendingResolvers.clear();
  }
  /**
   * Check if Web Workers are supported
   */
  static isSupported() {
    return typeof Worker !== "undefined";
  }
};
5191
+
5192
// src/inference/UnifiedInferenceWorker.ts
var logger6 = createLogger("UnifiedInferenceWorker");
// CDN directory the ONNX Runtime WASM assets are fetched from (version-pinned).
var WASM_CDN_PATH3 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
// Round-trip timeouts (ms) for unified-worker operations. Prefixes match the
// models hosted by the worker script below: SV = SenseVoice ASR,
// CPU = Wav2ArkitCpu, VAD = Silero VAD.
var INIT_TIMEOUT_MS = 15e3;
var SV_LOAD_TIMEOUT_MS = 3e4;
var SV_INFER_TIMEOUT_MS = 1e4;
var CPU_LOAD_TIMEOUT_MS = 6e4;
var CPU_INFER_TIMEOUT_MS = 5e3;
var VAD_LOAD_TIMEOUT_MS = 1e4;
var VAD_INFER_TIMEOUT_MS = 1e3;
var DISPOSE_TIMEOUT_MS = 5e3;
5203
/**
 * Resolve a possibly-relative URL to an absolute one.
 *
 * Absolute http(s) and blob: URLs pass through untouched. Anything else is
 * resolved against the page origin, or against a "https://localhost"
 * placeholder when no `location` exists (e.g. outside a browser).
 * Input that cannot be parsed as a URL is returned unchanged.
 */
function resolveUrl2(url) {
  const isAbsolute = /^https?:\/\//i.test(url) || /^blob:/i.test(url);
  if (isAbsolute) {
    return url;
  }
  const base = globalThis.location?.origin ?? "https://localhost";
  try {
    return new URL(url, base).href;
  } catch {
    return url;
  }
}
5211
// Monotonic counter used to build unique worker request IDs.
var requestCounter = 0;
/**
 * Generate a unique request ID: a monotonically increasing counter combined
 * with the current wall-clock time, e.g. "req_7_1700000000000".
 */
function nextRequestId() {
  requestCounter += 1;
  return "req_" + requestCounter + "_" + Date.now();
}
5215
+ var WORKER_SCRIPT2 = `
5216
+ // Unified Inference Worker Script
5217
+ // Hosts SenseVoice + Wav2ArkitCpu + Silero VAD in a single ORT instance
5218
+
5219
+ var ort = null;
5220
+
5221
+ // SenseVoice state
5222
+ var svSession = null;
5223
+ var svTokenMap = null;
5224
+ var svNegMean = null;
5225
+ var svInvStddev = null;
5226
+ var svLanguageId = 0;
5227
+ var svTextNormId = 14;
5228
+ var svVocabSize = 0;
5229
+
5230
+ // Wav2ArkitCpu state
5231
+ var cpuSession = null;
5232
+
5233
+ // Silero VAD state
5234
+ var vadSession = null;
5235
+ var vadSampleRate = 16000;
5236
+ var vadChunkSize = 512;
5237
+ var vadContextSize = 64;
5238
+
5239
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
5240
+ // kaldiFbank.ts \u2014 inlined as plain JavaScript
5241
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
5242
+
5243
+ function fft(re, im) {
5244
+ var n = re.length;
5245
+ for (var i = 1, j = 0; i < n; i++) {
5246
+ var bit = n >> 1;
5247
+ while (j & bit) { j ^= bit; bit >>= 1; }
5248
+ j ^= bit;
5249
+ if (i < j) {
5250
+ var tmp = re[i]; re[i] = re[j]; re[j] = tmp;
5251
+ tmp = im[i]; im[i] = im[j]; im[j] = tmp;
5252
+ }
5253
+ }
5254
+ for (var len = 2; len <= n; len *= 2) {
5255
+ var halfLen = len / 2;
5256
+ var angle = -2 * Math.PI / len;
5257
+ var wRe = Math.cos(angle);
5258
+ var wIm = Math.sin(angle);
5259
+ for (var i = 0; i < n; i += len) {
5260
+ var curRe = 1, curIm = 0;
5261
+ for (var j = 0; j < halfLen; j++) {
5262
+ var a = i + j, b = a + halfLen;
5263
+ var tRe = curRe * re[b] - curIm * im[b];
5264
+ var tIm = curRe * im[b] + curIm * re[b];
5265
+ re[b] = re[a] - tRe; im[b] = im[a] - tIm;
5266
+ re[a] += tRe; im[a] += tIm;
5267
+ var nextRe = curRe * wRe - curIm * wIm;
5268
+ curIm = curRe * wIm + curIm * wRe;
5269
+ curRe = nextRe;
5270
+ }
5271
+ }
5272
+ }
5273
+ }
5274
+
5275
+ function htkMel(freq) { return 1127.0 * Math.log(1.0 + freq / 700.0); }
5276
+ function htkMelInverse(mel) { return 700.0 * (Math.exp(mel / 1127.0) - 1.0); }
5277
+
5278
+ function buildMelFilterbank(numBins, fftSize, sampleRate, lowFreq, highFreq) {
5279
+ var numFftBins = fftSize / 2 + 1;
5280
+ var lowMel = htkMel(lowFreq);
5281
+ var highMel = htkMel(highFreq);
5282
+ var melPoints = new Float64Array(numBins + 2);
5283
+ for (var i = 0; i < numBins + 2; i++) {
5284
+ melPoints[i] = lowMel + (highMel - lowMel) * i / (numBins + 1);
5285
+ }
5286
+ var binFreqs = new Float64Array(numBins + 2);
5287
+ for (var i = 0; i < numBins + 2; i++) {
5288
+ binFreqs[i] = htkMelInverse(melPoints[i]) * fftSize / sampleRate;
5289
+ }
5290
+ var filters = [];
5291
+ for (var m = 0; m < numBins; m++) {
5292
+ var left = binFreqs[m], center = binFreqs[m + 1], right = binFreqs[m + 2];
5293
+ var startBin = Math.max(0, Math.ceil(left));
5294
+ var endBin = Math.min(numFftBins - 1, Math.floor(right));
5295
+ var weights = new Float32Array(endBin - startBin + 1);
5296
+ for (var k = startBin; k <= endBin; k++) {
5297
+ if (k <= center) {
5298
+ weights[k - startBin] = (center - left) > 0 ? (k - left) / (center - left) : 0;
5299
+ } else {
5300
+ weights[k - startBin] = (right - center) > 0 ? (right - k) / (right - center) : 0;
5301
+ }
5302
+ }
5303
+ filters.push({ startBin: startBin, weights: weights });
5304
+ }
5305
+ return filters;
5306
+ }
5307
+
5308
+ function createHammingWindow(length) {
5309
+ var w = new Float32Array(length);
5310
+ for (var i = 0; i < length; i++) {
5311
+ w[i] = 0.54 - 0.46 * Math.cos(2 * Math.PI * i / (length - 1));
5312
+ }
5313
+ return w;
5314
+ }
5315
+
5316
+ function computeKaldiFbank(audio, sampleRate, numMelBins, opts) {
5317
+ var frameLengthMs = (opts && opts.frameLengthMs !== undefined) ? opts.frameLengthMs : 25;
5318
+ var frameShiftMs = (opts && opts.frameShiftMs !== undefined) ? opts.frameShiftMs : 10;
5319
+ var lowFreq = (opts && opts.lowFreq !== undefined) ? opts.lowFreq : 20;
5320
+ var highFreq = (opts && opts.highFreq !== undefined) ? opts.highFreq : (sampleRate / 2);
5321
+ var dither = (opts && opts.dither !== undefined) ? opts.dither : 0;
5322
+ var preemphasis = (opts && opts.preemphasis !== undefined) ? opts.preemphasis : 0.97;
5323
+
5324
+ var frameLengthSamples = Math.round(sampleRate * frameLengthMs / 1000);
5325
+ var frameShiftSamples = Math.round(sampleRate * frameShiftMs / 1000);
5326
+
5327
+ var scaled = new Float32Array(audio.length);
5328
+ for (var i = 0; i < audio.length; i++) { scaled[i] = audio[i] * 32768; }
5329
+
5330
+ if (dither > 0) {
5331
+ for (var i = 0; i < scaled.length; i++) {
5332
+ var u1 = Math.random(), u2 = Math.random();
5333
+ scaled[i] += dither * Math.sqrt(-2 * Math.log(u1 + 1e-10)) * Math.cos(2 * Math.PI * u2);
5334
+ }
5335
+ }
5336
+
5337
+ var numFrames = Math.max(0, Math.floor((scaled.length - frameLengthSamples) / frameShiftSamples) + 1);
5338
+ if (numFrames === 0) return new Float32Array(0);
5339
+
5340
+ var fftSize = 1;
5341
+ while (fftSize < frameLengthSamples) fftSize *= 2;
5342
+ var numFftBins = fftSize / 2 + 1;
5343
+
5344
+ var window = createHammingWindow(frameLengthSamples);
5345
+ var filters = buildMelFilterbank(numMelBins, fftSize, sampleRate, lowFreq, highFreq);
5346
+ var output = new Float32Array(numFrames * numMelBins);
5347
+ var fftRe = new Float64Array(fftSize);
5348
+ var fftIm = new Float64Array(fftSize);
5349
+
5350
+ for (var f = 0; f < numFrames; f++) {
5351
+ var offset = f * frameShiftSamples;
5352
+ fftRe.fill(0); fftIm.fill(0);
5353
+ for (var i = 0; i < frameLengthSamples; i++) {
5354
+ var sample = scaled[offset + i];
5355
+ if (preemphasis > 0 && i > 0) {
5356
+ sample -= preemphasis * scaled[offset + i - 1];
5357
+ } else if (preemphasis > 0 && i === 0 && offset > 0) {
5358
+ sample -= preemphasis * scaled[offset - 1];
5359
+ }
5360
+ fftRe[i] = sample * window[i];
5361
+ }
5362
+ fft(fftRe, fftIm);
5363
+ var outOffset = f * numMelBins;
5364
+ for (var m = 0; m < numMelBins; m++) {
5365
+ var filter = filters[m];
5366
+ var energy = 0;
5367
+ for (var k = 0; k < filter.weights.length; k++) {
5368
+ var bin = filter.startBin + k;
5369
+ if (bin < numFftBins) {
5370
+ var powerSpec = fftRe[bin] * fftRe[bin] + fftIm[bin] * fftIm[bin];
5371
+ energy += filter.weights[k] * powerSpec;
5372
+ }
5373
+ }
5374
+ output[outOffset + m] = Math.log(Math.max(energy, 1e-10));
5375
+ }
5376
+ }
5377
+ return output;
5378
+ }
5379
+
5380
+ function applyLFR(features, featureDim, lfrM, lfrN) {
5381
+ var numFrames = features.length / featureDim;
5382
+ if (numFrames === 0) return new Float32Array(0);
5383
+ var leftPad = Math.floor((lfrM - 1) / 2);
5384
+ var paddedLen = numFrames + leftPad;
5385
+ var numOutputFrames = Math.ceil(paddedLen / lfrN);
5386
+ var outputDim = featureDim * lfrM;
5387
+ var output = new Float32Array(numOutputFrames * outputDim);
5388
+ for (var i = 0; i < numOutputFrames; i++) {
5389
+ var startFrame = i * lfrN - leftPad;
5390
+ for (var j = 0; j < lfrM; j++) {
5391
+ var srcFrame = startFrame + j;
5392
+ if (srcFrame < 0) srcFrame = 0;
5393
+ if (srcFrame >= numFrames) srcFrame = numFrames - 1;
5394
+ var srcOffset = srcFrame * featureDim;
5395
+ var dstOffset = i * outputDim + j * featureDim;
5396
+ for (var k = 0; k < featureDim; k++) {
5397
+ output[dstOffset + k] = features[srcOffset + k];
5398
+ }
5399
+ }
5400
+ }
5401
+ return output;
5402
+ }
5403
+
5404
+ function applyCMVN(features, dim, negMeanVec, invStddevVec) {
5405
+ for (var i = 0; i < features.length; i++) {
5406
+ var d = i % dim;
5407
+ features[i] = (features[i] + negMeanVec[d]) * invStddevVec[d];
5408
+ }
5409
+ return features;
5410
+ }
5411
+
5412
+ function parseCMVNFromMetadata(negMeanStr, invStddevStr) {
5413
+ var negMeanArr = new Float32Array(
5414
+ negMeanStr.split(',').map(function(s) { return parseFloat(s.trim()); })
5415
+ );
5416
+ var invStddevArr = new Float32Array(
5417
+ invStddevStr.split(',').map(function(s) { return parseFloat(s.trim()); })
5418
+ );
5419
+ return { negMean: negMeanArr, invStddev: invStddevArr };
5420
+ }
5421
+
5422
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
5423
+ // ctcDecoder.ts \u2014 inlined as plain JavaScript
5424
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
5425
+
5426
+ var LANGUAGE_IDS = { 0: 'auto', 3: 'zh', 4: 'en', 7: 'yue', 11: 'ja', 12: 'ko', 13: 'nospeech' };
5427
+ var TEXT_NORM_IDS = { 14: 'with_itn', 15: 'without_itn' };
5428
+
5429
+ function resolveLanguageIdW(language) {
5430
+ var map = { auto: 0, zh: 3, en: 4, yue: 7, ja: 11, ko: 12 };
5431
+ return map[language] !== undefined ? map[language] : 0;
5432
+ }
5433
+
5434
+ function resolveTextNormIdW(textNorm) {
5435
+ return textNorm === 'without_itn' ? 15 : 14;
5436
+ }
5437
+
5438
+ function parseTokensFile(content) {
5439
+ var map = new Map();
5440
+ var lines = content.split('\\n');
5441
+ for (var idx = 0; idx < lines.length; idx++) {
5442
+ var trimmed = lines[idx].trim();
5443
+ if (!trimmed) continue;
5444
+ var lastSpace = trimmed.lastIndexOf(' ');
5445
+ if (lastSpace === -1) continue;
5446
+ var token = trimmed.substring(0, lastSpace);
5447
+ var id = parseInt(trimmed.substring(lastSpace + 1), 10);
5448
+ if (!isNaN(id)) map.set(id, token);
5449
+ }
5450
+ return map;
5451
+ }
5452
+
5453
+ function parseStructuredToken(token) {
5454
+ var match = token.match(/^<\\|(.+)\\|>$/);
5455
+ if (!match) return null;
5456
+ var value = match[1];
5457
+ if (value === 'zh' || value === 'en' || value === 'ja' || value === 'ko' || value === 'yue' || value === 'nospeech') {
5458
+ return { type: 'language', value: value };
5459
+ }
5460
+ var emotions = ['HAPPY', 'SAD', 'ANGRY', 'NEUTRAL', 'FEARFUL', 'DISGUSTED', 'SURPRISED', 'EMO_UNKNOWN'];
5461
+ if (emotions.indexOf(value) !== -1) return { type: 'emotion', value: value };
5462
+ var events = ['Speech', 'BGM', 'Applause', 'Laughter', 'Crying', 'Coughing', 'Sneezing', 'EVENT_UNKNOWN'];
5463
+ if (events.indexOf(value) !== -1) return { type: 'event', value: value };
5464
+ if (value === 'withitn' || value === 'woitn' || value === 'with_itn' || value === 'without_itn') {
5465
+ return { type: 'textnorm', value: value };
5466
+ }
5467
+ return null;
5468
+ }
5469
+
5470
+ function ctcGreedyDecode(logits, seqLen, vocabSz, tokenMapLocal) {
5471
+ var tokenIds = [];
5472
+ for (var t = 0; t < seqLen; t++) {
5473
+ var offset = t * vocabSz;
5474
+ var maxIdx = 0, maxVal = logits[offset];
5475
+ for (var v = 1; v < vocabSz; v++) {
5476
+ if (logits[offset + v] > maxVal) { maxVal = logits[offset + v]; maxIdx = v; }
5477
+ }
5478
+ tokenIds.push(maxIdx);
5479
+ }
5480
+ var collapsed = [], prev = -1;
5481
+ for (var idx = 0; idx < tokenIds.length; idx++) {
5482
+ var id = tokenIds[idx];
5483
+ if (id !== prev) { collapsed.push(id); prev = id; }
5484
+ }
5485
+ var filtered = collapsed.filter(function(id) { return id !== 0 && id !== 1 && id !== 2; });
5486
+ var language = undefined, emotion = undefined, event = undefined;
5487
+ var textTokens = [];
5488
+ for (var idx = 0; idx < filtered.length; idx++) {
5489
+ var id = filtered[idx];
5490
+ var token = tokenMapLocal.get(id);
5491
+ if (!token) continue;
5492
+ var structured = parseStructuredToken(token);
5493
+ if (structured) {
5494
+ if (structured.type === 'language') language = structured.value;
5495
+ else if (structured.type === 'emotion') emotion = structured.value;
5496
+ else if (structured.type === 'event') event = structured.value;
5497
+ } else {
5498
+ textTokens.push(token);
5499
+ }
5500
+ }
5501
+ var text = textTokens.join('');
5502
+ text = text.replace(/\\u2581/g, ' ').trim();
5503
+ return { text: text, language: language, emotion: emotion, event: event };
5504
+ }
5505
+
5506
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
5507
+ // blendshapeUtils.ts \u2014 inlined
5508
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
5509
+
5510
+ var SYMMETRIC_INDEX_PAIRS = [
5511
+ [23, 25], [32, 38], [43, 44], [29, 30], [27, 28], [45, 46],
5512
+ [35, 36], [47, 48], [33, 34], [49, 50], [6, 7], [0, 1],
5513
+ [3, 4], [8, 9], [16, 17], [10, 11], [12, 13], [14, 15],
5514
+ [18, 19], [20, 21],
5515
+ ];
5516
+
5517
+ function symmetrizeBlendshapes(frame) {
5518
+ var result = new Float32Array(frame);
5519
+ for (var p = 0; p < SYMMETRIC_INDEX_PAIRS.length; p++) {
5520
+ var lIdx = SYMMETRIC_INDEX_PAIRS[p][0], rIdx = SYMMETRIC_INDEX_PAIRS[p][1];
5521
+ var avg = (frame[lIdx] + frame[rIdx]) / 2;
5522
+ result[lIdx] = avg;
5523
+ result[rIdx] = avg;
5524
+ }
5525
+ return result;
5526
+ }
5527
+
5528
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
5529
+ // Shared ORT loader
5530
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
5531
+
5532
+ async function loadOrt(wasmPaths, isIOSDevice) {
5533
+ if (ort) return;
5534
+ var ortUrl = wasmPaths + 'ort.wasm.min.js';
5535
+ var response = await fetch(ortUrl);
5536
+ var scriptText = await response.text();
5537
+ var blob = new Blob([scriptText], { type: 'application/javascript' });
5538
+ var blobUrl = URL.createObjectURL(blob);
5539
+ importScripts(blobUrl);
5540
+ URL.revokeObjectURL(blobUrl);
5541
+ ort = self.ort;
5542
+ ort.env.wasm.wasmPaths = wasmPaths;
5543
+ ort.env.wasm.numThreads = isIOSDevice ? 1 : 4;
5544
+ ort.env.wasm.simd = true;
5545
+ ort.env.wasm.proxy = false;
5546
+ }
5547
+
5548
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
5549
+ // SenseVoice handlers
5550
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
5551
+
5552
+ async function svLoad(msg) {
5553
+ var tokensResponse = await fetch(msg.tokensUrl);
5554
+ if (!tokensResponse.ok) throw new Error('Failed to fetch tokens.txt: ' + tokensResponse.status);
5555
+ var tokensText = await tokensResponse.text();
5556
+ svTokenMap = parseTokensFile(tokensText);
5557
+ svLanguageId = msg.language;
5558
+ svTextNormId = msg.textNorm;
5559
+
5560
+ var sessionOptions = { executionProviders: ['wasm'], graphOptimizationLevel: 'all' };
5561
+ if (msg.isIOS) {
5562
+ svSession = await ort.InferenceSession.create(msg.modelUrl, sessionOptions);
5563
+ } else {
5564
+ var modelResponse = await fetch(msg.modelUrl);
5565
+ if (!modelResponse.ok) throw new Error('Failed to fetch model: ' + modelResponse.status);
5566
+ var modelBuffer = await modelResponse.arrayBuffer();
5567
+ svSession = await ort.InferenceSession.create(new Uint8Array(modelBuffer), sessionOptions);
5568
+ }
5569
+
5570
+ try {
5571
+ var metadata = svSession.handler && svSession.handler.metadata;
5572
+ if (metadata && metadata.neg_mean && metadata.inv_stddev) {
5573
+ var cmvn = parseCMVNFromMetadata(metadata.neg_mean, metadata.inv_stddev);
5574
+ svNegMean = cmvn.negMean;
5575
+ svInvStddev = cmvn.invStddev;
5576
+ }
5577
+ } catch (e) { /* CMVN not available */ }
5578
+
5579
+ svVocabSize = 0;
5580
+ svTokenMap.forEach(function(val, key) { if (key >= svVocabSize) svVocabSize = key + 1; });
5581
+
5582
+ return {
5583
+ vocabSize: svVocabSize,
5584
+ inputNames: svSession.inputNames.slice(),
5585
+ outputNames: svSession.outputNames.slice(),
5586
+ };
5587
+ }
5588
+
5589
+ async function svTranscribe(audio) {
5590
+ var preprocessStart = performance.now();
5591
+ var fbank = computeKaldiFbank(audio, 16000, 80);
5592
+ var numFrames = fbank.length / 80;
5593
+ if (numFrames === 0) {
5594
+ return { text: '', inferenceTimeMs: performance.now() - preprocessStart, preprocessTimeMs: performance.now() - preprocessStart };
5595
+ }
5596
+ var lfrFeatures = applyLFR(fbank, 80, 7, 6);
5597
+ var numLfrFrames = lfrFeatures.length / 560;
5598
+ if (svNegMean && svInvStddev) applyCMVN(lfrFeatures, 560, svNegMean, svInvStddev);
5599
+ var preprocessTimeMs = performance.now() - preprocessStart;
5600
+
5601
+ var feeds = {
5602
+ x: new ort.Tensor('float32', lfrFeatures, [1, numLfrFrames, 560]),
5603
+ x_length: new ort.Tensor('int32', new Int32Array([numLfrFrames]), [1]),
5604
+ language: new ort.Tensor('int32', new Int32Array([svLanguageId]), [1]),
5605
+ text_norm: new ort.Tensor('int32', new Int32Array([svTextNormId]), [1]),
5606
+ };
5607
+ var results = await svSession.run(feeds);
5608
+ var logitsOutput = results['logits'];
5609
+ if (!logitsOutput) throw new Error('Model output missing "logits" tensor');
5610
+
5611
+ var decoded = ctcGreedyDecode(logitsOutput.data, logitsOutput.dims[1], logitsOutput.dims[2], svTokenMap);
5612
+ var totalTimeMs = performance.now() - preprocessStart;
5613
+
5614
+ return {
5615
+ text: decoded.text, language: decoded.language, emotion: decoded.emotion, event: decoded.event,
5616
+ inferenceTimeMs: totalTimeMs, preprocessTimeMs: preprocessTimeMs,
5617
+ };
5618
+ }
5619
+
5620
+ async function svDispose() {
5621
+ if (svSession) { await svSession.release(); svSession = null; }
5622
+ svTokenMap = null; svNegMean = null; svInvStddev = null;
5623
+ }
5624
+
5625
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
5626
+ // Wav2ArkitCpu handlers
5627
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
5628
+
5629
+ async function cpuLoad(msg) {
5630
+ var sessionOptions = { executionProviders: ['wasm'], graphOptimizationLevel: 'all' };
5631
+ var dataFilename = msg.externalDataUrl ? msg.externalDataUrl.split('/').pop() : null;
5632
+
5633
+ if (msg.isIOS) {
5634
+ if (msg.externalDataUrl && dataFilename) {
5635
+ sessionOptions.externalData = [{ path: dataFilename, data: msg.externalDataUrl }];
5636
+ }
5637
+ cpuSession = await ort.InferenceSession.create(msg.modelUrl, sessionOptions);
5638
+ } else {
5639
+ var graphResponse = await fetch(msg.modelUrl);
5640
+ if (!graphResponse.ok) throw new Error('Failed to fetch model graph: ' + graphResponse.status);
5641
+ var graphBuffer = await graphResponse.arrayBuffer();
5642
+ if (msg.externalDataUrl && dataFilename) {
5643
+ var dataResponse = await fetch(msg.externalDataUrl);
5644
+ if (!dataResponse.ok) throw new Error('Failed to fetch external data: ' + dataResponse.status);
5645
+ var dataBuffer = await dataResponse.arrayBuffer();
5646
+ sessionOptions.externalData = [{ path: dataFilename, data: new Uint8Array(dataBuffer) }];
5647
+ }
5648
+ cpuSession = await ort.InferenceSession.create(new Uint8Array(graphBuffer), sessionOptions);
5649
+ }
5650
+
5651
+ // Warmup
5652
+ var warmupAudio = new Float32Array(16000);
5653
+ var warmupTensor = new ort.Tensor('float32', warmupAudio, [1, warmupAudio.length]);
5654
+ await cpuSession.run({ audio_waveform: warmupTensor });
5655
+
5656
+ return {
5657
+ inputNames: cpuSession.inputNames.slice(),
5658
+ outputNames: cpuSession.outputNames.slice(),
5659
+ };
5660
+ }
5661
+
5662
+ async function cpuInfer(audio) {
5663
+ var tensor = new ort.Tensor('float32', audio, [1, audio.length]);
5664
+ var results = await cpuSession.run({ audio_waveform: tensor });
5665
+ var blendshapeOutput = results['blendshapes'];
5666
+ if (!blendshapeOutput) throw new Error('Missing blendshapes output from model');
5667
+
5668
+ var blendshapeData = blendshapeOutput.data;
5669
+ var numFrames = blendshapeOutput.dims[1];
5670
+ var numBlendshapes = blendshapeOutput.dims[2];
5671
+
5672
+ var flatBuffer = new Float32Array(numFrames * numBlendshapes);
5673
+ for (var f = 0; f < numFrames; f++) {
5674
+ var offset = f * numBlendshapes;
5675
+ var rawFrame = blendshapeData.slice(offset, offset + numBlendshapes);
5676
+ var symmetrized = symmetrizeBlendshapes(rawFrame);
5677
+ flatBuffer.set(symmetrized, offset);
5678
+ }
5679
+ return { flatBuffer: flatBuffer, numFrames: numFrames, numBlendshapes: numBlendshapes };
5680
+ }
5681
+
5682
+ async function cpuDispose() {
5683
+ if (cpuSession) { await cpuSession.release(); cpuSession = null; }
5684
+ }
5685
+
5686
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
5687
+ // Silero VAD handlers
5688
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
5689
+
5690
+ async function vadLoad(msg) {
5691
+ vadSampleRate = msg.sampleRate;
5692
+ vadChunkSize = vadSampleRate === 16000 ? 512 : 256;
5693
+ vadContextSize = vadSampleRate === 16000 ? 64 : 32;
5694
+
5695
+ var response = await fetch(msg.modelUrl);
5696
+ if (!response.ok) throw new Error('Failed to fetch VAD model: ' + response.status);
5697
+ var modelBuffer = await response.arrayBuffer();
5698
+ vadSession = await ort.InferenceSession.create(new Uint8Array(modelBuffer), {
5699
+ executionProviders: ['wasm'],
5700
+ graphOptimizationLevel: 'all',
5701
+ });
5702
+
5703
+ return {
5704
+ inputNames: vadSession.inputNames.slice(),
5705
+ outputNames: vadSession.outputNames.slice(),
5706
+ };
5707
+ }
5708
+
5709
+ async function vadProcess(audio, state, context) {
5710
+ var inputSize = vadContextSize + vadChunkSize;
5711
+ var inputBuffer = new Float32Array(inputSize);
5712
+ inputBuffer.set(context, 0);
5713
+ inputBuffer.set(audio, vadContextSize);
5714
+
5715
+ var inputTensor = new ort.Tensor('float32', new Float32Array(inputBuffer), [1, inputSize]);
5716
+ var stateTensor = new ort.Tensor('float32', new Float32Array(state), [2, 1, 128]);
5717
+ var srTensor;
5718
+ try {
5719
+ srTensor = new ort.Tensor('int64', new BigInt64Array([BigInt(vadSampleRate)]), []);
5720
+ } catch (e) {
5721
+ srTensor = new ort.Tensor('int64', [BigInt(vadSampleRate)], []);
5722
+ }
5723
+
5724
+ var feeds = { 'input': inputTensor, 'state': stateTensor, 'sr': srTensor };
5725
+ var results = await vadSession.run(feeds);
5726
+ var outputTensor = results['output'];
5727
+ var newStateTensor = results['stateN'] || results['state'];
5728
+ if (!outputTensor) throw new Error('Missing output tensor from VAD model');
5729
+
5730
+ return { probability: outputTensor.data[0], newState: new Float32Array(newStateTensor.data) };
5731
+ }
5732
+
5733
+ function vadCreateInitialState() {
5734
+ return new Float32Array(2 * 1 * 128);
5735
+ }
5736
+
5737
+ async function vadDispose() {
5738
+ if (vadSession) { await vadSession.release(); vadSession = null; }
5739
+ }
5740
+
5741
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
5742
+ // Message handler
5743
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
5744
+
5745
+ self.onmessage = async function(e) {
5746
+ var msg = e.data;
5747
+ var requestId = msg.requestId;
5748
+
5749
+ try {
5750
+ switch (msg.type) {
5751
+ case 'init': {
5752
+ var startTime = performance.now();
5753
+ await loadOrt(msg.wasmPaths, msg.isIOS);
5754
+ self.postMessage({ type: 'init:done', requestId: requestId, loadTimeMs: performance.now() - startTime });
5755
+ break;
5756
+ }
5757
+
5758
+ case 'sv:load': {
5759
+ var startTime = performance.now();
5760
+ var info = await svLoad(msg);
5761
+ self.postMessage({
5762
+ type: 'sv:loaded', requestId: requestId, vocabSize: info.vocabSize,
5763
+ inputNames: info.inputNames, outputNames: info.outputNames,
5764
+ loadTimeMs: performance.now() - startTime,
5765
+ });
5766
+ break;
5767
+ }
5768
+
5769
+ case 'sv:transcribe': {
5770
+ var result = await svTranscribe(msg.audio);
5771
+ self.postMessage({
5772
+ type: 'sv:result', requestId: requestId,
5773
+ text: result.text, language: result.language, emotion: result.emotion, event: result.event,
5774
+ inferenceTimeMs: result.inferenceTimeMs, preprocessTimeMs: result.preprocessTimeMs,
5775
+ });
5776
+ break;
5777
+ }
5778
+
5779
+ case 'sv:dispose': {
5780
+ await svDispose();
5781
+ self.postMessage({ type: 'sv:disposed', requestId: requestId });
5782
+ break;
5783
+ }
5784
+
5785
+ case 'cpu:load': {
5786
+ var startTime = performance.now();
5787
+ var info = await cpuLoad(msg);
5788
+ self.postMessage({
5789
+ type: 'cpu:loaded', requestId: requestId,
5790
+ inputNames: info.inputNames, outputNames: info.outputNames,
5791
+ loadTimeMs: performance.now() - startTime,
5792
+ });
5793
+ break;
5794
+ }
5795
+
5796
+ case 'cpu:infer': {
5797
+ var startTime = performance.now();
5798
+ var result = await cpuInfer(msg.audio);
5799
+ var inferenceTimeMs = performance.now() - startTime;
5800
+ self.postMessage({
5801
+ type: 'cpu:result', requestId: requestId,
5802
+ blendshapes: result.flatBuffer, numFrames: result.numFrames,
5803
+ numBlendshapes: result.numBlendshapes, inferenceTimeMs: inferenceTimeMs,
5804
+ }, [result.flatBuffer.buffer]);
5805
+ break;
5806
+ }
5807
+
5808
+ case 'cpu:dispose': {
5809
+ await cpuDispose();
5810
+ self.postMessage({ type: 'cpu:disposed', requestId: requestId });
5811
+ break;
5812
+ }
5813
+
5814
+ case 'vad:load': {
5815
+ var startTime = performance.now();
5816
+ var info = await vadLoad(msg);
5817
+ self.postMessage({
5818
+ type: 'vad:loaded', requestId: requestId,
5819
+ inputNames: info.inputNames, outputNames: info.outputNames,
5820
+ loadTimeMs: performance.now() - startTime,
5821
+ });
5822
+ break;
5823
+ }
5824
+
5825
+ case 'vad:process': {
5826
+ var startTime = performance.now();
5827
+ var result = await vadProcess(msg.audio, msg.state, msg.context);
5828
+ self.postMessage({
5829
+ type: 'vad:result', requestId: requestId,
5830
+ probability: result.probability, state: result.newState,
5831
+ inferenceTimeMs: performance.now() - startTime,
5832
+ });
5833
+ break;
5834
+ }
5835
+
5836
+ case 'vad:reset': {
5837
+ var state = vadCreateInitialState();
5838
+ self.postMessage({ type: 'vad:reset', requestId: requestId, state: state });
5839
+ break;
5840
+ }
5841
+
5842
+ case 'vad:dispose': {
5843
+ await vadDispose();
5844
+ self.postMessage({ type: 'vad:disposed', requestId: requestId });
5845
+ break;
5846
+ }
5847
+
5848
+ case 'dispose-all': {
5849
+ await svDispose();
5850
+ await cpuDispose();
5851
+ await vadDispose();
5852
+ ort = null;
5853
+ self.postMessage({ type: 'dispose-all:done', requestId: requestId });
5854
+ break;
5855
+ }
5856
+
5857
+ default:
5858
+ self.postMessage({ type: 'error', requestId: requestId, error: 'Unknown message type: ' + msg.type });
5859
+ }
5860
+ } catch (err) {
5861
+ var errorMsg = err.message || String(err);
5862
+ if (typeof err === 'number') {
5863
+ errorMsg = 'Raw C++ exception pointer (0x' + err.toString(16) + '). Likely OOM in WASM.';
5864
+ }
5865
+ self.postMessage({ type: 'error', requestId: requestId, error: errorMsg });
5866
+ }
5867
+ };
5868
+
5869
+ self.onerror = function(err) {
5870
+ self.postMessage({ type: 'error', requestId: null, error: 'Worker error: ' + (err.message || String(err)) });
5871
+ };
5872
+ `;
5873
+ var UnifiedInferenceWorker = class {
5874
+ constructor() {
5875
+ this.worker = null;
5876
+ this.pendingRequests = /* @__PURE__ */ new Map();
5877
+ this.initialized = false;
5878
+ this.poisoned = false;
5879
+ }
5880
+ /**
5881
+ * Initialize the worker (load ORT WASM from CDN)
5882
+ */
5883
+ async init() {
5884
+ if (this.initialized) return;
5885
+ const startTime = performance.now();
5886
+ const telemetry = getTelemetry();
5887
+ const span = telemetry?.startSpan("UnifiedInferenceWorker.init");
5888
+ try {
5889
+ logger6.info("Creating unified inference worker...");
5890
+ this.worker = this.createWorker();
5891
+ await this.sendMessage(
5892
+ { type: "init", wasmPaths: WASM_CDN_PATH3, isIOS: isIOS() },
5893
+ "init:done",
5894
+ INIT_TIMEOUT_MS
5895
+ );
5896
+ this.initialized = true;
5897
+ const loadTimeMs = performance.now() - startTime;
5898
+ logger6.info("Unified worker initialized", { loadTimeMs: Math.round(loadTimeMs) });
5899
+ span?.setAttributes({ "worker.init_time_ms": loadTimeMs });
5900
+ span?.end();
5901
+ } catch (error) {
5902
+ span?.endWithError(error instanceof Error ? error : new Error(String(error)));
5903
+ this.cleanup();
5904
+ throw error;
5905
+ }
5906
+ }
5907
+ // ── SenseVoice ────────────────────────────────────────────────────────
5908
+ async loadSenseVoice(config) {
5909
+ this.assertReady();
5910
+ const startTime = performance.now();
5911
+ const result = await this.sendMessage(
5912
+ {
5913
+ type: "sv:load",
5914
+ modelUrl: resolveUrl2(config.modelUrl),
5915
+ tokensUrl: resolveUrl2(config.tokensUrl),
5916
+ isIOS: isIOS(),
5917
+ language: config.language,
5918
+ textNorm: config.textNorm
5919
+ },
5920
+ "sv:loaded",
5921
+ SV_LOAD_TIMEOUT_MS
5922
+ );
5923
+ const loadTimeMs = performance.now() - startTime;
5924
+ return {
5925
+ backend: "wasm",
5926
+ loadTimeMs,
5927
+ inputNames: result.inputNames,
5928
+ outputNames: result.outputNames,
5929
+ vocabSize: result.vocabSize
5930
+ };
5931
+ }
5932
+ async transcribe(audio) {
5933
+ this.assertReady();
5934
+ const result = await this.sendMessage(
5935
+ { type: "sv:transcribe", audio },
5936
+ "sv:result",
5937
+ SV_INFER_TIMEOUT_MS
5938
+ );
5939
+ return {
5940
+ text: result.text,
5941
+ language: result.language,
5942
+ emotion: result.emotion,
5943
+ event: result.event,
5944
+ inferenceTimeMs: result.inferenceTimeMs,
5945
+ preprocessTimeMs: result.preprocessTimeMs
5946
+ };
5947
+ }
5948
+ async disposeSenseVoice() {
5949
+ if (!this.worker) return;
5950
+ await this.sendMessage({ type: "sv:dispose" }, "sv:disposed", DISPOSE_TIMEOUT_MS);
5951
+ }
5952
+ // ── Wav2ArkitCpu (Lip Sync) ──────────────────────────────────────────
5953
+ async loadLipSync(config) {
5954
+ this.assertReady();
5955
+ const startTime = performance.now();
5956
+ const result = await this.sendMessage(
5957
+ {
5958
+ type: "cpu:load",
5959
+ modelUrl: resolveUrl2(config.modelUrl),
5960
+ externalDataUrl: config.externalDataUrl ? resolveUrl2(config.externalDataUrl) : null,
5961
+ isIOS: isIOS()
5962
+ },
5963
+ "cpu:loaded",
5964
+ CPU_LOAD_TIMEOUT_MS
5965
+ );
5966
+ const loadTimeMs = performance.now() - startTime;
5967
+ return {
5968
+ backend: "wasm",
5969
+ loadTimeMs,
5970
+ inputNames: result.inputNames,
5971
+ outputNames: result.outputNames
5972
+ };
5973
+ }
5974
+ async inferLipSync(audio) {
5975
+ this.assertReady();
5976
+ return this.sendMessage(
5977
+ { type: "cpu:infer", audio },
5978
+ "cpu:result",
5979
+ CPU_INFER_TIMEOUT_MS
5980
+ );
5981
+ }
5982
+ async disposeLipSync() {
5983
+ if (!this.worker) return;
5984
+ await this.sendMessage({ type: "cpu:dispose" }, "cpu:disposed", DISPOSE_TIMEOUT_MS);
5985
+ }
5986
+ // ── Silero VAD ────────────────────────────────────────────────────────
5987
+ async loadVAD(config) {
5988
+ this.assertReady();
5989
+ const startTime = performance.now();
5990
+ const chunkSize = config.sampleRate === 16e3 ? 512 : 256;
5991
+ const result = await this.sendMessage(
5992
+ {
5993
+ type: "vad:load",
5994
+ modelUrl: resolveUrl2(config.modelUrl),
5995
+ sampleRate: config.sampleRate
5996
+ },
5997
+ "vad:loaded",
5998
+ VAD_LOAD_TIMEOUT_MS
5999
+ );
6000
+ const loadTimeMs = performance.now() - startTime;
6001
+ return {
6002
+ backend: "wasm",
6003
+ loadTimeMs,
6004
+ inputNames: result.inputNames,
6005
+ outputNames: result.outputNames,
6006
+ sampleRate: config.sampleRate,
6007
+ chunkSize
6008
+ };
6009
+ }
6010
+ async processVAD(audio, state, context) {
6011
+ this.assertReady();
6012
+ return this.sendMessage(
6013
+ { type: "vad:process", audio, state, context },
6014
+ "vad:result",
6015
+ VAD_INFER_TIMEOUT_MS
6016
+ );
6017
+ }
6018
+ async resetVAD() {
6019
+ this.assertReady();
6020
+ const result = await this.sendMessage(
6021
+ { type: "vad:reset" },
6022
+ "vad:reset",
6023
+ VAD_INFER_TIMEOUT_MS
6024
+ );
6025
+ return result.state;
6026
+ }
6027
+ async disposeVAD() {
6028
+ if (!this.worker) return;
6029
+ await this.sendMessage({ type: "vad:dispose" }, "vad:disposed", DISPOSE_TIMEOUT_MS);
6030
+ }
6031
+ // ── Lifecycle ─────────────────────────────────────────────────────────
6032
+ async dispose() {
6033
+ if (this.worker) {
6034
+ try {
6035
+ await this.sendMessage({ type: "dispose-all" }, "dispose-all:done", DISPOSE_TIMEOUT_MS);
6036
+ } catch {
6037
+ }
6038
+ this.worker.terminate();
6039
+ this.worker = null;
6040
+ }
6041
+ this.initialized = false;
6042
+ this.poisoned = false;
6043
+ this.rejectAllPending("Worker disposed");
6044
+ this.pendingRequests.clear();
6045
+ }
6046
+ /** Check if the worker is initialized and not poisoned */
6047
+ get isReady() {
6048
+ return this.initialized && !this.poisoned && this.worker !== null;
6049
+ }
6050
+ /** Check if Web Workers are supported */
6051
+ static isSupported() {
6052
+ return typeof Worker !== "undefined";
6053
+ }
6054
+ // ── Private ───────────────────────────────────────────────────────────
6055
+ assertReady() {
6056
+ if (!this.initialized || !this.worker) {
6057
+ throw new Error("UnifiedInferenceWorker not initialized. Call init() first.");
6058
+ }
6059
+ if (this.poisoned) {
6060
+ throw new Error("UnifiedInferenceWorker timed out \u2014 unavailable until page reload");
6061
+ }
6062
+ }
6063
+ createWorker() {
6064
+ const blob = new Blob([WORKER_SCRIPT2], { type: "application/javascript" });
6065
+ const blobUrl = URL.createObjectURL(blob);
6066
+ const worker = new Worker(blobUrl);
6067
+ URL.revokeObjectURL(blobUrl);
6068
+ worker.onmessage = (event) => {
6069
+ this.handleWorkerMessage(event.data);
6070
+ };
6071
+ worker.onerror = (error) => {
6072
+ logger6.error("Unified worker error", { error: error.message });
6073
+ this.rejectAllPending(`Worker error: ${error.message}`);
6074
+ };
6075
+ return worker;
6076
+ }
6077
+ handleWorkerMessage(data) {
6078
+ const requestId = data.requestId;
6079
+ if (data.type === "error") {
6080
+ if (requestId && this.pendingRequests.has(requestId)) {
6081
+ const pending = this.pendingRequests.get(requestId);
6082
+ clearTimeout(pending.timeout);
6083
+ this.pendingRequests.delete(requestId);
6084
+ pending.reject(new Error(data.error));
6085
+ } else {
6086
+ logger6.error("Worker broadcast error", { error: data.error });
6087
+ this.rejectAllPending(data.error);
6088
+ }
6089
+ return;
6090
+ }
6091
+ if (requestId && this.pendingRequests.has(requestId)) {
6092
+ const pending = this.pendingRequests.get(requestId);
6093
+ clearTimeout(pending.timeout);
6094
+ this.pendingRequests.delete(requestId);
6095
+ pending.resolve(data);
6096
+ }
6097
+ }
6098
+ sendMessage(message, expectedType, timeoutMs) {
6099
+ return new Promise((resolve, reject) => {
6100
+ if (!this.worker) {
6101
+ reject(new Error("Worker not initialized"));
6102
+ return;
6103
+ }
6104
+ const requestId = nextRequestId();
6105
+ const timeout = setTimeout(() => {
6106
+ this.pendingRequests.delete(requestId);
6107
+ this.poisoned = true;
6108
+ logger6.error("CRITICAL: Worker operation timed out \u2014 worker is dead", {
6109
+ type: message.type,
6110
+ timeoutMs
6111
+ });
6112
+ reject(new Error(`Worker operation '${message.type}' timed out after ${timeoutMs}ms`));
6113
+ }, timeoutMs);
6114
+ this.pendingRequests.set(requestId, {
6115
+ resolve,
6116
+ reject,
6117
+ timeout
6118
+ });
6119
+ this.worker.postMessage({ ...message, requestId });
6120
+ });
6121
+ }
6122
+ rejectAllPending(reason) {
6123
+ for (const [, pending] of this.pendingRequests) {
6124
+ clearTimeout(pending.timeout);
6125
+ pending.reject(new Error(reason));
6126
+ }
6127
+ this.pendingRequests.clear();
6128
+ }
6129
+ cleanup() {
6130
+ if (this.worker) {
6131
+ this.worker.terminate();
6132
+ this.worker = null;
6133
+ }
6134
+ this.initialized = false;
6135
+ this.rejectAllPending("Worker cleanup");
6136
+ this.pendingRequests.clear();
6137
+ }
6138
+ };
6139
+ var SenseVoiceUnifiedAdapter = class {
6140
+ constructor(worker, config) {
6141
+ this._isLoaded = false;
6142
+ this.inferenceQueue = Promise.resolve();
6143
+ this.worker = worker;
6144
+ const modelDir = config.modelUrl.substring(0, config.modelUrl.lastIndexOf("/"));
6145
+ this.config = {
6146
+ modelUrl: config.modelUrl,
6147
+ tokensUrl: config.tokensUrl ?? `${modelDir}/tokens.txt`,
6148
+ language: config.language ?? "auto",
6149
+ textNorm: config.textNorm ?? "with_itn"
6150
+ };
6151
+ this.languageId = resolveLanguageId(this.config.language);
6152
+ this.textNormId = resolveTextNormId(this.config.textNorm);
6153
+ }
6154
+ get isLoaded() {
6155
+ return this._isLoaded;
6156
+ }
6157
+ get backend() {
6158
+ return this._isLoaded ? "wasm" : null;
6159
+ }
6160
+ async load(onProgress) {
6161
+ const telemetry = getTelemetry();
6162
+ const span = telemetry?.startSpan("SenseVoiceUnifiedAdapter.load", {
6163
+ "model.url": this.config.modelUrl
6164
+ });
6165
+ try {
6166
+ const result = await this.worker.loadSenseVoice({
6167
+ modelUrl: this.config.modelUrl,
6168
+ tokensUrl: this.config.tokensUrl,
6169
+ language: this.languageId,
6170
+ textNorm: this.textNormId
6171
+ });
6172
+ this._isLoaded = true;
6173
+ onProgress?.(1, 1);
6174
+ logger6.info("SenseVoice loaded via unified worker", {
6175
+ backend: "wasm",
6176
+ loadTimeMs: Math.round(result.loadTimeMs),
6177
+ vocabSize: result.vocabSize
6178
+ });
6179
+ span?.setAttributes({ "model.backend": "wasm", "model.load_time_ms": result.loadTimeMs });
6180
+ span?.end();
6181
+ telemetry?.recordHistogram("omote.model.load_time", result.loadTimeMs, {
6182
+ model: "sensevoice-unified",
6183
+ backend: "wasm"
6184
+ });
6185
+ return result;
6186
+ } catch (error) {
6187
+ span?.endWithError(error instanceof Error ? error : new Error(String(error)));
6188
+ throw error;
6189
+ }
6190
+ }
6191
+ async transcribe(audioSamples) {
6192
+ if (!this._isLoaded) throw new Error("Model not loaded. Call load() first.");
6193
+ const audio = new Float32Array(audioSamples);
6194
+ return new Promise((resolve, reject) => {
6195
+ this.inferenceQueue = this.inferenceQueue.then(async () => {
6196
+ try {
6197
+ const result = await this.worker.transcribe(audio);
6198
+ resolve(result);
6199
+ } catch (err) {
6200
+ reject(err);
6201
+ }
6202
+ });
6203
+ });
6204
+ }
6205
+ async dispose() {
6206
+ if (this._isLoaded) {
6207
+ await this.worker.disposeSenseVoice();
6208
+ this._isLoaded = false;
6209
+ }
6210
+ }
6211
+ };
6212
// Lip-sync adapter backed by the shared unified worker. Presents the same
// interface as the main-thread Wav2ArkitCpu implementations but delegates
// all ONNX work to the worker.
var Wav2ArkitCpuUnifiedAdapter = class {
  constructor(worker, config) {
    this.modelId = "wav2arkit_cpu";
    this._isLoaded = false;
    // Serializes infer() calls so the worker never sees concurrent requests.
    this.inferenceQueue = Promise.resolve();
    this.worker = worker;
    this.config = config;
  }
  /** True once load() has completed successfully. */
  get isLoaded() {
    return this._isLoaded;
  }
  /** Backend identifier: "wasm" while loaded, otherwise null. */
  get backend() {
    if (!this._isLoaded) return null;
    return "wasm";
  }
  /**
   * Load the lip-sync model in the worker.
   * External data defaults to `<modelUrl>.data` unless disabled by passing
   * `externalDataUrl: false` in the config.
   */
  async load() {
    const telemetry = getTelemetry();
    const span = telemetry?.startSpan("Wav2ArkitCpuUnifiedAdapter.load", {
      "model.url": this.config.modelUrl
    });
    try {
      let externalDataUrl = null;
      if (this.config.externalDataUrl !== false) {
        externalDataUrl = this.config.externalDataUrl || `${this.config.modelUrl}.data`;
      }
      const result = await this.worker.loadLipSync({
        modelUrl: this.config.modelUrl,
        externalDataUrl: externalDataUrl || null
      });
      this._isLoaded = true;
      logger6.info("Wav2ArkitCpu loaded via unified worker", {
        backend: "wasm",
        loadTimeMs: Math.round(result.loadTimeMs)
      });
      span?.setAttributes({ "model.backend": "wasm", "model.load_time_ms": result.loadTimeMs });
      span?.end();
      telemetry?.recordHistogram("omote.model.load_time", result.loadTimeMs, {
        model: "wav2arkit_cpu-unified",
        backend: "wasm"
      });
      return result;
    } catch (error) {
      const wrapped = error instanceof Error ? error : new Error(String(error));
      span?.endWithError(wrapped);
      throw error;
    }
  }
  /**
   * Run lip-sync inference on raw audio samples.
   *
   * The worker returns a flat blendshape buffer; it is re-split here into one
   * Float32Array per frame. Calls are serialized through the internal queue.
   *
   * @param audioSamples - Raw audio; copied defensively before queueing.
   * @param _identityIndex - Ignored (kept for interface compatibility).
   * @throws Error if load() has not completed.
   */
  async infer(audioSamples, _identityIndex) {
    if (!this._isLoaded) throw new Error("Model not loaded. Call load() first.");
    const audioCopy = new Float32Array(audioSamples);
    return new Promise((resolve, reject) => {
      this.inferenceQueue = this.inferenceQueue.then(async () => {
        const telemetry = getTelemetry();
        const span = telemetry?.startSpan("Wav2ArkitCpuUnifiedAdapter.infer", {
          "inference.input_samples": audioCopy.length
        });
        try {
          const t0 = performance.now();
          const result = await this.worker.inferLipSync(audioCopy);
          const inferenceTimeMs = performance.now() - t0;
          const { numFrames, numBlendshapes } = result;
          const flat = result.blendshapes;
          const frames = [];
          for (let i = 0; i < numFrames; i++) {
            const start = i * numBlendshapes;
            frames.push(flat.slice(start, start + numBlendshapes));
          }
          span?.setAttributes({
            "inference.duration_ms": inferenceTimeMs,
            "inference.frames": numFrames
          });
          span?.end();
          resolve({ blendshapes: frames, numFrames, inferenceTimeMs });
        } catch (err) {
          span?.endWithError(err instanceof Error ? err : new Error(String(err)));
          reject(err);
        }
      });
    });
  }
  /** Release the worker-side lip-sync session, if loaded. Safe to call repeatedly. */
  async dispose() {
    if (!this._isLoaded) return;
    await this.worker.disposeLipSync();
    this._isLoaded = false;
  }
};
6293
// Voice-activity-detection adapter running inside the shared unified worker.
// The recurrent model state and the audio "context" tail are kept on this
// side and shipped to the worker with every chunk; the adapter also maintains
// a rolling pre-speech buffer so the start of an utterance is not clipped.
var SileroVADUnifiedAdapter = class {
  constructor(worker, config) {
    this._isLoaded = false;
    // Inference queue: serializes processVAD calls so state updates stay ordered
    this.inferenceQueue = Promise.resolve();
    // Pre-speech buffer: recent non-speech chunks, handed to the caller when
    // speech is first detected
    this.preSpeechBuffer = [];
    this.wasSpeaking = false;
    this.worker = worker;
    const sr = config.sampleRate ?? 16e3;
    this.config = {
      modelUrl: config.modelUrl,
      backend: config.backend ?? "wasm",
      sampleRate: sr,
      threshold: config.threshold ?? 0.5,
      preSpeechBufferChunks: config.preSpeechBufferChunks ?? 10
    };
    // Chunk/context sizes depend on sample rate: 512/64 samples at 16 kHz,
    // 256/32 otherwise (presumably 8 kHz — the only other supported rate?
    // TODO confirm against the worker implementation).
    this.chunkSize = sr === 16e3 ? 512 : 256;
    this.contextSize = sr === 16e3 ? 64 : 32;
    // Recurrent model state: 2 * 1 * 128 floats, carried across chunks.
    this.state = new Float32Array(2 * 1 * 128);
    this.context = new Float32Array(this.contextSize);
  }
  // True once load() has completed successfully.
  get isLoaded() {
    return this._isLoaded;
  }
  // Backend identifier: "wasm" while loaded, otherwise null.
  get backend() {
    return this._isLoaded ? "wasm" : null;
  }
  // Effective sample rate (after defaulting), in Hz.
  get sampleRate() {
    return this.config.sampleRate;
  }
  // Speech probability threshold used by process() to derive isSpeech.
  get threshold() {
    return this.config.threshold;
  }
  // Required length, in samples, of every chunk passed to process().
  getChunkSize() {
    return this.chunkSize;
  }
  // Duration of one chunk in milliseconds, derived from chunk size and rate.
  getChunkDurationMs() {
    return this.chunkSize / this.config.sampleRate * 1e3;
  }
  /**
   * Load the VAD model in the worker. Records load time telemetry and marks
   * the adapter loaded on success; load failures are re-thrown.
   */
  async load() {
    const telemetry = getTelemetry();
    const span = telemetry?.startSpan("SileroVADUnifiedAdapter.load", {
      "model.url": this.config.modelUrl
    });
    try {
      const result = await this.worker.loadVAD({
        modelUrl: this.config.modelUrl,
        sampleRate: this.config.sampleRate
      });
      this._isLoaded = true;
      logger6.info("SileroVAD loaded via unified worker", {
        backend: "wasm",
        loadTimeMs: Math.round(result.loadTimeMs),
        sampleRate: this.config.sampleRate,
        chunkSize: this.chunkSize
      });
      span?.setAttributes({ "model.backend": "wasm", "model.load_time_ms": result.loadTimeMs });
      span?.end();
      telemetry?.recordHistogram("omote.model.load_time", result.loadTimeMs, {
        model: "silero-vad-unified",
        backend: "wasm"
      });
      return result;
    } catch (error) {
      span?.endWithError(error instanceof Error ? error : new Error(String(error)));
      throw error;
    }
  }
  /**
   * Run VAD on one audio chunk (must be exactly getChunkSize() samples).
   *
   * Returns the raw speech probability, the thresholded isSpeech flag, and —
   * only on a silence -> speech transition — the buffered pre-speech chunks.
   * Calls are serialized so the recurrent state is updated in order.
   *
   * @param audioChunk - Exactly chunkSize samples; copied before queueing.
   * @throws Error if not loaded or if the chunk length is wrong.
   */
  async process(audioChunk) {
    if (!this._isLoaded) throw new Error("Model not loaded. Call load() first.");
    if (audioChunk.length !== this.chunkSize) {
      throw new Error(
        `Audio chunk must be exactly ${this.chunkSize} samples (got ${audioChunk.length}). Use getChunkSize() to get required size.`
      );
    }
    // Copy so caller-side reuse of the buffer cannot race the queued job.
    const audioChunkCopy = new Float32Array(audioChunk);
    return new Promise((resolve, reject) => {
      this.inferenceQueue = this.inferenceQueue.then(async () => {
        try {
          const startTime = performance.now();
          const result = await this.worker.processVAD(audioChunkCopy, this.state, this.context);
          // Carry recurrent state forward; next chunk's context is the tail
          // of this chunk.
          this.state = result.state;
          this.context = audioChunkCopy.slice(-this.contextSize);
          const inferenceTimeMs = performance.now() - startTime;
          const isSpeech = result.probability > this.config.threshold;
          let preSpeechChunks;
          if (isSpeech && !this.wasSpeaking) {
            // Speech onset: hand the buffered lead-in audio to the caller.
            preSpeechChunks = [...this.preSpeechBuffer];
            this.preSpeechBuffer = [];
          } else if (!isSpeech && !this.wasSpeaking) {
            // Ongoing silence: keep a bounded rolling window of recent chunks.
            this.preSpeechBuffer.push(new Float32Array(audioChunkCopy));
            if (this.preSpeechBuffer.length > this.config.preSpeechBufferChunks) {
              this.preSpeechBuffer.shift();
            }
          } else if (!isSpeech && this.wasSpeaking) {
            // Speech just ended: restart buffering from scratch.
            this.preSpeechBuffer = [];
          }
          this.wasSpeaking = isSpeech;
          resolve({
            probability: result.probability,
            isSpeech,
            inferenceTimeMs,
            preSpeechChunks
          });
        } catch (err) {
          reject(err);
        }
      });
    });
  }
  /**
   * Reset the VAD to a pristine state: worker-side recurrent state is reset,
   * local context and pre-speech buffer are cleared.
   * @throws Error if not loaded.
   */
  async reset() {
    if (!this._isLoaded) throw new Error("Model not loaded. Call load() first.");
    const newState = await this.worker.resetVAD();
    this.state = newState;
    this.context = new Float32Array(this.contextSize);
    this.preSpeechBuffer = [];
    this.wasSpeaking = false;
  }
  /**
   * Release the worker-side VAD session (if loaded) and clear all local
   * state so the adapter could be reloaded cleanly.
   */
  async dispose() {
    if (this._isLoaded) {
      await this.worker.disposeVAD();
      this._isLoaded = false;
    }
    this.state = new Float32Array(2 * 1 * 128);
    this.context = new Float32Array(this.contextSize);
    this.preSpeechBuffer = [];
    this.wasSpeaking = false;
  }
};
6423
+
6424
+ // src/inference/createSenseVoice.ts
6425
var logger7 = createLogger("createSenseVoice");
/**
 * Factory for a SenseVoice ASR instance.
 *
 * Selection order:
 *   1. `config.unifiedWorker` set   -> adapter on the shared unified worker
 *   2. `config.useWorker === true`  -> dedicated worker (throws if unsupported)
 *   3. `config.useWorker === false` -> main-thread inference
 *   4. "auto" (default)             -> worker when supported and not iOS,
 *                                      otherwise main thread
 */
function createSenseVoice(config) {
  const baseConfig = {
    modelUrl: config.modelUrl,
    tokensUrl: config.tokensUrl,
    language: config.language,
    textNorm: config.textNorm
  };
  if (config.unifiedWorker) {
    logger7.info("Creating SenseVoiceUnifiedAdapter (shared unified worker)");
    return new SenseVoiceUnifiedAdapter(config.unifiedWorker, baseConfig);
  }
  const useWorker = config.useWorker ?? "auto";
  if (useWorker === true) {
    if (!SenseVoiceWorker.isSupported()) {
      throw new Error("Web Workers are not supported in this environment");
    }
    logger7.info("Creating SenseVoiceWorker (off-main-thread)");
    return new SenseVoiceWorker(baseConfig);
  }
  if (useWorker === false) {
    logger7.info("Creating SenseVoiceInference (main thread)");
    return new SenseVoiceInference(baseConfig);
  }
  // "auto": prefer the dedicated worker except on iOS, where the shared ORT
  // instance on the main thread is used instead.
  if (SenseVoiceWorker.isSupported() && !isIOS()) {
    logger7.info("Auto-detected: creating SenseVoiceWorker (off-main-thread)");
    return new SenseVoiceWorker(baseConfig);
  }
  logger7.info("Auto-detected: creating SenseVoiceInference (main thread)", {
    reason: isIOS() ? "iOS (shared ORT instance)" : "Worker unsupported"
  });
  return new SenseVoiceInference(baseConfig);
}
6477
+
6478
+ // src/inference/Wav2ArkitCpuInference.ts
6479
var logger8 = createLogger("Wav2ArkitCpu");
// Main-thread CPU lip-sync inference: raw audio in, ARKit blendshape frames
// out. Handles model-cache reads (with corruption fallback), an iOS
// low-memory load path that streams URLs straight into ORT, and a watchdog
// timeout that permanently "poisons" the session if session.run() hangs.
var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
  constructor(config) {
    this.modelId = "wav2arkit_cpu";
    this.session = null;
    this.ort = null;
    this._backend = "wasm";
    this.isLoading = false;
    // Inference queue for handling concurrent calls
    this.inferenceQueue = Promise.resolve();
    // Session health: set to true if session.run() times out.
    // A timed-out session may have a zombie WASM dispatch still running,
    // so all future infer() calls reject immediately to prevent concurrent access.
    this.poisoned = false;
    this.config = config;
  }
  // Resolved backend ("wasm" or whatever getOnnxRuntimeForPreference picked);
  // null until a session exists.
  get backend() {
    return this.session ? this._backend : null;
  }
  // True while an ONNX session is held.
  get isLoaded() {
    return this.session !== null;
  }
  /**
   * Load the ONNX model.
   *
   * Desktop path: fetches the model graph (and optional external-data file)
   * through the model cache, retrying once on cache corruption. iOS path:
   * passes URLs directly to ORT so the large external-data file never lands
   * in the JS heap. Ends with a warmup inference on 16000 silent samples.
   *
   * @throws Error if already loading or already loaded, or on any load failure.
   */
  async load() {
    if (this.isLoading) {
      throw new Error("Model is already loading");
    }
    if (this.session) {
      throw new Error("Model already loaded. Call dispose() first.");
    }
    this.isLoading = true;
    const startTime = performance.now();
    const telemetry = getTelemetry();
    const span = telemetry?.startSpan("Wav2ArkitCpu.load", {
      "model.url": this.config.modelUrl,
      "model.backend_requested": this.config.backend || "wasm"
    });
    try {
      const preference = this.config.backend || "wasm";
      logger8.info("Loading ONNX Runtime...", { preference });
      const { ort, backend } = await getOnnxRuntimeForPreference(preference);
      this.ort = ort;
      this._backend = backend;
      logger8.info("ONNX Runtime loaded", { backend: this._backend });
      const modelUrl = this.config.modelUrl;
      // External data defaults to `<modelUrl>.data`; `externalDataUrl: false`
      // disables it entirely.
      const dataUrl = this.config.externalDataUrl !== false ? this.config.externalDataUrl || `${modelUrl}.data` : null;
      const sessionOptions = getSessionOptions(this._backend);
      if (isIOS()) {
        logger8.info("iOS: passing model URLs directly to ORT (low-memory path)", {
          modelUrl,
          dataUrl
        });
        if (dataUrl) {
          const dataFilename = dataUrl.split("/").pop();
          sessionOptions.externalData = [{
            path: dataFilename,
            data: dataUrl
            // URL string — ORT fetches directly into WASM
          }];
        }
        this.session = await this.ort.InferenceSession.create(modelUrl, sessionOptions);
      } else {
        const cache = getModelCache();
        const isCached = await cache.has(modelUrl);
        let modelBuffer;
        if (isCached) {
          logger8.debug("Loading model from cache", { modelUrl });
          modelBuffer = await cache.get(modelUrl);
          // has() succeeded but get() returned nothing: treat as corruption,
          // purge the entry and re-fetch.
          if (!modelBuffer) {
            logger8.warn("Cache corruption detected, clearing and retrying", { modelUrl });
            await cache.delete(modelUrl);
            modelBuffer = await fetchWithCache(modelUrl);
          }
        } else {
          logger8.debug("Fetching and caching model graph", { modelUrl });
          modelBuffer = await fetchWithCache(modelUrl);
        }
        if (!modelBuffer) {
          throw new Error(`Failed to load model: ${modelUrl}`);
        }
        let externalDataBuffer = null;
        if (dataUrl) {
          // Best-effort: a missing external-data file just means a
          // single-file model, so failures here are logged, not thrown.
          try {
            const isDataCached = await cache.has(dataUrl);
            if (isDataCached) {
              logger8.debug("Loading external data from cache", { dataUrl });
              externalDataBuffer = await cache.get(dataUrl);
              if (!externalDataBuffer) {
                logger8.warn("Cache corruption for external data, retrying", { dataUrl });
                await cache.delete(dataUrl);
                externalDataBuffer = await fetchWithCache(dataUrl);
              }
            } else {
              logger8.info("Fetching external model data", {
                dataUrl,
                note: "This may be a large download (400MB+)"
              });
              externalDataBuffer = await fetchWithCache(dataUrl);
            }
            logger8.info("External data loaded", {
              size: formatBytes(externalDataBuffer.byteLength)
            });
          } catch (err) {
            logger8.debug("No external data file found (single-file model)", {
              dataUrl,
              error: err.message
            });
          }
        }
        logger8.debug("Creating ONNX session", {
          graphSize: formatBytes(modelBuffer.byteLength),
          externalDataSize: externalDataBuffer ? formatBytes(externalDataBuffer.byteLength) : "none",
          backend: this._backend
        });
        if (externalDataBuffer) {
          const dataFilename = dataUrl.split("/").pop();
          sessionOptions.externalData = [{
            path: dataFilename,
            data: new Uint8Array(externalDataBuffer)
          }];
        }
        const modelData = new Uint8Array(modelBuffer);
        this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
      }
      const loadTimeMs = performance.now() - startTime;
      logger8.info("Model loaded successfully", {
        backend: this._backend,
        loadTimeMs: Math.round(loadTimeMs),
        inputs: this.session.inputNames,
        outputs: this.session.outputNames
      });
      span?.setAttributes({
        "model.backend": this._backend,
        "model.load_time_ms": loadTimeMs,
        "model.cached": !isIOS()
      });
      span?.end();
      telemetry?.recordHistogram("omote.model.load_time", loadTimeMs, {
        model: "wav2arkit_cpu",
        backend: this._backend
      });
      // Warmup: one inference on a second of silence so the first real call
      // does not pay JIT/allocation costs.
      logger8.debug("Running warmup inference");
      const warmupStart = performance.now();
      const silentAudio = new Float32Array(16e3);
      await this.infer(silentAudio);
      const warmupTimeMs = performance.now() - warmupStart;
      logger8.info("Warmup inference complete", {
        warmupTimeMs: Math.round(warmupTimeMs),
        backend: this._backend
      });
      telemetry?.recordHistogram("omote.model.warmup_time", warmupTimeMs, {
        model: "wav2arkit_cpu",
        backend: this._backend
      });
      return {
        backend: this._backend,
        loadTimeMs,
        inputNames: [...this.session.inputNames],
        outputNames: [...this.session.outputNames]
      };
    } catch (error) {
      span?.endWithError(error instanceof Error ? error : new Error(String(error)));
      telemetry?.incrementCounter("omote.errors.total", 1, {
        model: "wav2arkit_cpu",
        error_type: "load_failed"
      });
      throw error;
    } finally {
      this.isLoading = false;
    }
  }
  /**
   * Run inference on raw audio
   *
   * Accepts variable-length audio (not fixed to 16000 samples).
   * Output frames = ceil(30 * numSamples / 16000).
   *
   * @param audioSamples - Float32Array of raw audio at 16kHz
   * @param _identityIndex - Ignored (identity 11 is baked into the model)
   */
  async infer(audioSamples, _identityIndex) {
    if (!this.session) {
      throw new Error("Model not loaded. Call load() first.");
    }
    // A previous run timed out; the session may still be busy in WASM, so
    // refuse all further work rather than risk concurrent access.
    if (this.poisoned) {
      throw new Error("Wav2ArkitCpu session timed out \u2014 inference unavailable until page reload");
    }
    const audioCopy = new Float32Array(audioSamples);
    const feeds = {
      "audio_waveform": new this.ort.Tensor("float32", audioCopy, [1, audioCopy.length])
    };
    return this.queueInference(feeds, audioCopy.length);
  }
  /**
   * Queue inference to serialize ONNX session calls.
   *
   * Races session.run() against a watchdog timer; on timeout the instance is
   * poisoned (see constructor note). Raw numeric throws from ORT WASM are
   * translated into a descriptive OOM error.
   */
  queueInference(feeds, inputSamples) {
    return new Promise((resolve, reject) => {
      this.inferenceQueue = this.inferenceQueue.then(async () => {
        const telemetry = getTelemetry();
        const span = telemetry?.startSpan("Wav2ArkitCpu.infer", {
          "inference.backend": this._backend,
          "inference.input_samples": inputSamples
        });
        try {
          const startTime = performance.now();
          let timeoutId;
          const results = await Promise.race([
            this.session.run(feeds).then((r) => {
              // Cancel the watchdog as soon as the run completes.
              clearTimeout(timeoutId);
              return r;
            }),
            new Promise((_, rej) => {
              timeoutId = setTimeout(
                () => rej(new Error(`Wav2ArkitCpu inference timed out after ${_Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS}ms`)),
                _Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS
              );
            })
          ]);
          const inferenceTimeMs = performance.now() - startTime;
          const blendshapeOutput = results["blendshapes"];
          if (!blendshapeOutput) {
            throw new Error("Missing blendshapes output from model");
          }
          // Output tensor dims: [batch, frames, blendshapes]; each frame is
          // symmetrized (left/right pairs averaged) before being returned.
          const blendshapeData = blendshapeOutput.data;
          const numFrames = blendshapeOutput.dims[1];
          const numBlendshapes = blendshapeOutput.dims[2];
          const blendshapes = [];
          for (let f = 0; f < numFrames; f++) {
            const rawFrame = blendshapeData.slice(f * numBlendshapes, (f + 1) * numBlendshapes);
            const symmetrized = symmetrizeBlendshapes(rawFrame);
            blendshapes.push(symmetrized);
          }
          logger8.trace("Inference completed", {
            inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
            numFrames,
            inputSamples
          });
          span?.setAttributes({
            "inference.duration_ms": inferenceTimeMs,
            "inference.frames": numFrames
          });
          span?.end();
          telemetry?.recordHistogram("omote.inference.latency", inferenceTimeMs, {
            model: "wav2arkit_cpu",
            backend: this._backend
          });
          telemetry?.incrementCounter("omote.inference.total", 1, {
            model: "wav2arkit_cpu",
            backend: this._backend,
            status: "success"
          });
          resolve({
            blendshapes,
            numFrames,
            inferenceTimeMs
          });
        } catch (err) {
          const errMsg = err instanceof Error ? err.message : String(err);
          if (errMsg.includes("timed out")) {
            // Watchdog fired: mark the whole instance unusable (see infer()).
            this.poisoned = true;
            logger8.error("CRITICAL: Inference session timed out \u2014 Wav2ArkitCpu is dead. Page reload required.", {
              backend: this._backend,
              timeoutMs: _Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS
            });
          } else if (typeof err === "number") {
            // ORT WASM can throw bare numbers (raw C++ exception pointers);
            // wrap them in a readable Error before rejecting.
            const oomError = new Error(
              `Wav2ArkitCpu inference failed with raw C++ exception pointer (0x${err.toString(16)}). This is likely an OOM crash in WASM. Try reloading the page.`
            );
            logger8.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
              pointer: `0x${err.toString(16)}`,
              backend: this._backend
            });
            span?.endWithError(oomError);
            telemetry?.incrementCounter("omote.inference.total", 1, {
              model: "wav2arkit_cpu",
              backend: this._backend,
              status: "error"
            });
            reject(oomError);
            return;
          } else {
            logger8.error("Inference failed", { error: errMsg, backend: this._backend });
          }
          span?.endWithError(err instanceof Error ? err : new Error(String(err)));
          telemetry?.incrementCounter("omote.inference.total", 1, {
            model: "wav2arkit_cpu",
            backend: this._backend,
            status: "error"
          });
          reject(err);
        }
      });
    });
  }
  /**
   * Dispose of the model and free resources
   */
  async dispose() {
    if (this.session) {
      await this.session.release();
      this.session = null;
    }
  }
};
// Watchdog budget for a single session.run() call.
_Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS = 5e3;
var Wav2ArkitCpuInference = _Wav2ArkitCpuInference;
6788
+
6789
+ // src/inference/Wav2ArkitCpuWorker.ts
6790
var logger9 = createLogger("Wav2ArkitCpuWorker");
// CDN base for onnxruntime-web; the inline worker loads ort.wasm.min.js and
// the .wasm binaries from here (version pinned).
var WASM_CDN_PATH4 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
// Model load may include a very large external-data download: allow 60 s.
var LOAD_TIMEOUT_MS2 = 6e4;
// Watchdog budget for a single worker inference round-trip.
var INFERENCE_TIMEOUT_MS2 = 5e3;
6794
/**
 * Resolve a possibly-relative URL to an absolute one.
 *
 * Absolute http(s) and blob: URLs pass through unchanged. Relative URLs are
 * resolved against the page origin when available, falling back to
 * "https://localhost" (e.g. in a worker/test context without `location`).
 * Unparseable input is returned as-is.
 */
function resolveUrl3(url) {
  const isAbsolute = /^https?:\/\//i.test(url) || /^blob:/i.test(url);
  if (isAbsolute) {
    return url;
  }
  try {
    const base = globalThis.location?.origin ?? "https://localhost";
    return new URL(url, base).href;
  } catch {
    return url;
  }
}
6802
// Inline source of the Wav2ArkitCpu worker. The whole string is turned into
// a Blob URL and handed to `new Worker(...)`, so this package ships no
// separate worker file. NOTE: the template-literal content below is runtime
// data — the worker's program text — and must not be edited casually.
var WORKER_SCRIPT3 = `
// Wav2ArkitCpu Worker Script
// Loaded via Blob URL - no separate file needed

var ort = null;
var session = null;

// Precomputed symmetric index pairs from LAM_BLENDSHAPES alphabetical ordering
// Used to average left/right blendshape pairs for symmetrized output
const SYMMETRIC_INDEX_PAIRS = [
  [23, 25], // jawLeft, jawRight
  [32, 38], // mouthLeft, mouthRight
  [43, 44], // mouthSmileLeft, mouthSmileRight
  [29, 30], // mouthFrownLeft, mouthFrownRight
  [27, 28], // mouthDimpleLeft, mouthDimpleRight
  [45, 46], // mouthStretchLeft, mouthStretchRight
  [35, 36], // mouthPressLeft, mouthPressRight
  [47, 48], // mouthUpperUpLeft, mouthUpperUpRight
  [33, 34], // mouthLowerDownLeft, mouthLowerDownRight
  [49, 50], // noseSneerLeft, noseSneerRight
  [6, 7], // cheekSquintLeft, cheekSquintRight
  [0, 1], // browDownLeft, browDownRight
  [3, 4], // browOuterUpLeft, browOuterUpRight
  [8, 9], // eyeBlinkLeft, eyeBlinkRight
  [16, 17], // eyeLookUpLeft, eyeLookUpRight
  [10, 11], // eyeLookDownLeft, eyeLookDownRight
  [12, 13], // eyeLookInLeft, eyeLookInRight
  [14, 15], // eyeLookOutLeft, eyeLookOutRight
  [18, 19], // eyeSquintLeft, eyeSquintRight
  [20, 21], // eyeWideLeft, eyeWideRight
];

/**
 * Symmetrize blendshapes by averaging left/right pairs
 * Inlined from blendshapeUtils.ts for worker context
 */
function symmetrizeBlendshapes(frame) {
  const result = new Float32Array(frame);
  for (const [lIdx, rIdx] of SYMMETRIC_INDEX_PAIRS) {
    const avg = (frame[lIdx] + frame[rIdx]) / 2;
    result[lIdx] = avg;
    result[rIdx] = avg;
  }
  return result;
}

/**
 * Load ONNX Runtime from CDN
 */
async function loadOrt(wasmPaths) {
  if (ort) return;

  // Import ONNX Runtime from CDN
  const ortUrl = wasmPaths + 'ort.wasm.min.js';

  // Load the script by fetching and executing it
  const response = await fetch(ortUrl);
  const scriptText = await response.text();

  // Create a blob URL for the script
  const blob = new Blob([scriptText], { type: 'application/javascript' });
  const blobUrl = URL.createObjectURL(blob);

  // Import the module
  importScripts(blobUrl);
  URL.revokeObjectURL(blobUrl);

  // ort is now available as global
  ort = self.ort;

  // Configure WASM settings
  ort.env.wasm.wasmPaths = wasmPaths;
  ort.env.wasm.numThreads = 1; // Single thread in worker
  ort.env.wasm.simd = true;
  ort.env.wasm.proxy = false; // No proxy in worker
}

/**
 * Load the wav2arkit_cpu model
 */
async function loadModel(modelUrl, externalDataUrl, isIOS) {
  const sessionOptions = {
    executionProviders: ['wasm'],
    graphOptimizationLevel: 'all',
  };

  const dataFilename = externalDataUrl ? externalDataUrl.split('/').pop() : null;

  if (isIOS) {
    // iOS: Pass URLs directly to ORT to avoid loading 402MB into JS heap.
    // ORT fetches externally into WASM memory, cutting peak JS memory from
    // ~800MB to ~2MB (just the graph).
    if (externalDataUrl && dataFilename) {
      sessionOptions.externalData = [{ path: dataFilename, data: externalDataUrl }];
    }
    session = await ort.InferenceSession.create(modelUrl, sessionOptions);
  } else {
    // Desktop: fetch model graph as ArrayBuffer
    const graphResponse = await fetch(modelUrl);
    if (!graphResponse.ok) {
      throw new Error('Failed to fetch model graph: ' + graphResponse.status + ' ' + graphResponse.statusText);
    }
    const graphBuffer = await graphResponse.arrayBuffer();

    // Fetch external data file if present
    if (externalDataUrl && dataFilename) {
      const dataResponse = await fetch(externalDataUrl);
      if (!dataResponse.ok) {
        throw new Error('Failed to fetch external data: ' + dataResponse.status + ' ' + dataResponse.statusText);
      }
      const dataBuffer = await dataResponse.arrayBuffer();
      sessionOptions.externalData = [{ path: dataFilename, data: new Uint8Array(dataBuffer) }];
    }

    session = await ort.InferenceSession.create(new Uint8Array(graphBuffer), sessionOptions);
  }

  // Warmup inference with 16000 silent samples
  const warmupAudio = new Float32Array(16000);
  const warmupTensor = new ort.Tensor('float32', warmupAudio, [1, warmupAudio.length]);
  await session.run({ audio_waveform: warmupTensor });

  return {
    inputNames: session.inputNames.slice(),
    outputNames: session.outputNames.slice(),
  };
}

/**
 * Run lip sync inference
 */
async function runInference(audio) {
  const tensor = new ort.Tensor('float32', audio, [1, audio.length]);
  const results = await session.run({ audio_waveform: tensor });

  const blendshapeOutput = results['blendshapes'];
  if (!blendshapeOutput) {
    throw new Error('Missing blendshapes output from model');
  }

  const blendshapeData = blendshapeOutput.data;
  const numFrames = blendshapeOutput.dims[1];
  const numBlendshapes = blendshapeOutput.dims[2];

  // Symmetrize each frame and flatten into a single Float32Array for transfer
  const flatBuffer = new Float32Array(numFrames * numBlendshapes);
  for (let f = 0; f < numFrames; f++) {
    const offset = f * numBlendshapes;
    const rawFrame = blendshapeData.slice(offset, offset + numBlendshapes);
    const symmetrized = symmetrizeBlendshapes(rawFrame);
    flatBuffer.set(symmetrized, offset);
  }

  return { flatBuffer, numFrames, numBlendshapes };
}

// Message handler
self.onmessage = async function(e) {
  const msg = e.data;

  try {
    switch (msg.type) {
      case 'load': {
        const startTime = performance.now();
        await loadOrt(msg.wasmPaths);
        const { inputNames, outputNames } = await loadModel(msg.modelUrl, msg.externalDataUrl, msg.isIOS);
        const loadTimeMs = performance.now() - startTime;

        self.postMessage({
          type: 'loaded',
          inputNames,
          outputNames,
          loadTimeMs,
        });
        break;
      }

      case 'infer': {
        const startTime = performance.now();
        const { flatBuffer, numFrames, numBlendshapes } = await runInference(msg.audio);
        const inferenceTimeMs = performance.now() - startTime;

        self.postMessage({
          type: 'result',
          blendshapes: flatBuffer,
          numFrames,
          numBlendshapes,
          inferenceTimeMs,
        }, [flatBuffer.buffer]);
        break;
      }

      case 'dispose': {
        if (session) {
          await session.release();
          session = null;
        }
        ort = null;
        self.postMessage({ type: 'disposed' });
        break;
      }

      default:
        self.postMessage({
          type: 'error',
          error: 'Unknown message type: ' + msg.type,
        });
    }
  } catch (err) {
    let errorMessage;
    if (typeof err === 'number') {
      // ORT WASM throws raw C++ exception pointers as bare numbers
      errorMessage = 'ORT WASM C++ exception pointer (0x' + err.toString(16) + ') \u2014 likely OOM';
    } else {
      errorMessage = err.message || String(err);
    }
    self.postMessage({
      type: 'error',
      error: errorMessage,
    });
  }
};

// Error handler
self.onerror = function(err) {
  self.postMessage({
    type: 'error',
    error: 'Worker error: ' + (err.message || String(err)),
  });
};
`;
7033
+ var Wav2ArkitCpuWorker = class {
7034
+ constructor(config) {
7035
+ this.modelId = "wav2arkit_cpu";
7036
+ this.worker = null;
7037
+ this.isLoading = false;
7038
+ this._isLoaded = false;
7039
+ // Inference queue for serialization
7040
+ this.inferenceQueue = Promise.resolve();
7041
+ // Session health: set to true if worker inference times out.
7042
+ // A timed-out worker may have a zombie WASM dispatch still running,
7043
+ // so all future infer() calls reject immediately to prevent concurrent access.
7044
+ this.poisoned = false;
7045
+ // Pending message handlers
7046
+ this.pendingResolvers = /* @__PURE__ */ new Map();
7047
+ this.config = config;
7048
+ }
7049
+ get isLoaded() {
7050
+ return this._isLoaded;
7051
+ }
7052
+ /**
7053
+ * Backend type (always 'wasm' for Worker, WebGPU not supported in Workers)
7054
+ */
7055
+ get backend() {
7056
+ return this._isLoaded ? "wasm" : null;
4321
7057
  }
4322
7058
  /**
4323
- * Queue inference to serialize ONNX session calls
7059
+ * Create the worker from inline script
4324
7060
  */
4325
- queueInference(feeds, inputSamples) {
7061
+ createWorker() {
7062
+ const blob = new Blob([WORKER_SCRIPT3], { type: "application/javascript" });
7063
+ const blobUrl = URL.createObjectURL(blob);
7064
+ const worker = new Worker(blobUrl);
7065
+ URL.revokeObjectURL(blobUrl);
7066
+ worker.onmessage = (event) => {
7067
+ this.handleWorkerMessage(event.data);
7068
+ };
7069
+ worker.onerror = (error) => {
7070
+ logger9.error("Worker error", { error: error.message });
7071
+ for (const [, resolver] of this.pendingResolvers) {
7072
+ resolver.reject(new Error(`Worker error: ${error.message}`));
7073
+ }
7074
+ this.pendingResolvers.clear();
7075
+ };
7076
+ return worker;
7077
+ }
7078
+ /**
7079
+ * Handle messages from worker
7080
+ */
7081
+ handleWorkerMessage(result) {
7082
+ const resolver = this.pendingResolvers.get(result.type);
7083
+ if (resolver) {
7084
+ this.pendingResolvers.delete(result.type);
7085
+ if (result.type === "error") {
7086
+ resolver.reject(new Error(result.error));
7087
+ } else {
7088
+ resolver.resolve(result);
7089
+ }
7090
+ }
7091
+ }
7092
+ /**
7093
+ * Send message to worker and wait for response
7094
+ */
7095
+ sendMessage(message, expectedType, timeoutMs) {
7096
+ return new Promise((resolve, reject) => {
7097
+ if (!this.worker) {
7098
+ reject(new Error("Worker not initialized"));
7099
+ return;
7100
+ }
7101
+ const timeoutId = setTimeout(() => {
7102
+ this.pendingResolvers.delete(expectedType);
7103
+ reject(new Error(`Worker operation timed out after ${timeoutMs}ms`));
7104
+ }, timeoutMs);
7105
+ this.pendingResolvers.set(expectedType, {
7106
+ resolve: (value) => {
7107
+ clearTimeout(timeoutId);
7108
+ resolve(value);
7109
+ },
7110
+ reject: (error) => {
7111
+ clearTimeout(timeoutId);
7112
+ reject(error);
7113
+ }
7114
+ });
7115
+ this.pendingResolvers.set("error", {
7116
+ resolve: () => {
7117
+ },
7118
+ // Never called for errors
7119
+ reject: (error) => {
7120
+ clearTimeout(timeoutId);
7121
+ this.pendingResolvers.delete(expectedType);
7122
+ reject(error);
7123
+ }
7124
+ });
7125
+ this.worker.postMessage(message);
7126
+ });
7127
+ }
7128
+ /**
7129
+ * Load the ONNX model in the worker
7130
+ */
7131
+ async load() {
7132
+ if (this.isLoading) {
7133
+ throw new Error("Model is already loading");
7134
+ }
7135
+ if (this._isLoaded) {
7136
+ throw new Error("Model already loaded. Call dispose() first.");
7137
+ }
7138
+ this.isLoading = true;
7139
+ const startTime = performance.now();
7140
+ const telemetry = getTelemetry();
7141
+ const span = telemetry?.startSpan("Wav2ArkitCpuWorker.load", {
7142
+ "model.url": this.config.modelUrl,
7143
+ "model.backend_requested": "wasm"
7144
+ });
7145
+ try {
7146
+ logger9.info("Creating wav2arkit_cpu worker...");
7147
+ this.worker = this.createWorker();
7148
+ const externalDataUrl = this.config.externalDataUrl !== false ? this.config.externalDataUrl || `${this.config.modelUrl}.data` : null;
7149
+ logger9.info("Loading model in worker...", {
7150
+ modelUrl: this.config.modelUrl,
7151
+ externalDataUrl,
7152
+ isIOS: isIOS()
7153
+ });
7154
+ const result = await this.sendMessage(
7155
+ {
7156
+ type: "load",
7157
+ modelUrl: resolveUrl3(this.config.modelUrl),
7158
+ externalDataUrl: externalDataUrl ? resolveUrl3(externalDataUrl) : null,
7159
+ wasmPaths: WASM_CDN_PATH4,
7160
+ isIOS: isIOS()
7161
+ },
7162
+ "loaded",
7163
+ LOAD_TIMEOUT_MS2
7164
+ );
7165
+ this._isLoaded = true;
7166
+ const loadTimeMs = performance.now() - startTime;
7167
+ logger9.info("Wav2ArkitCpu worker loaded successfully", {
7168
+ backend: "wasm",
7169
+ loadTimeMs: Math.round(loadTimeMs),
7170
+ workerLoadTimeMs: Math.round(result.loadTimeMs),
7171
+ inputs: result.inputNames,
7172
+ outputs: result.outputNames
7173
+ });
7174
+ span?.setAttributes({
7175
+ "model.backend": "wasm",
7176
+ "model.load_time_ms": loadTimeMs,
7177
+ "model.worker_load_time_ms": result.loadTimeMs
7178
+ });
7179
+ span?.end();
7180
+ telemetry?.recordHistogram("omote.model.load_time", loadTimeMs, {
7181
+ model: "wav2arkit_cpu-worker",
7182
+ backend: "wasm"
7183
+ });
7184
+ return {
7185
+ backend: "wasm",
7186
+ loadTimeMs,
7187
+ inputNames: result.inputNames,
7188
+ outputNames: result.outputNames
7189
+ };
7190
+ } catch (error) {
7191
+ span?.endWithError(error instanceof Error ? error : new Error(String(error)));
7192
+ telemetry?.incrementCounter("omote.errors.total", 1, {
7193
+ model: "wav2arkit_cpu-worker",
7194
+ error_type: "load_failed"
7195
+ });
7196
+ if (this.worker) {
7197
+ this.worker.terminate();
7198
+ this.worker = null;
7199
+ }
7200
+ throw error;
7201
+ } finally {
7202
+ this.isLoading = false;
7203
+ }
7204
+ }
7205
+ /**
7206
+ * Run inference on raw audio
7207
+ *
7208
+ * Accepts variable-length audio (not fixed to 16000 samples).
7209
+ * Output frames = ceil(30 * numSamples / 16000).
7210
+ *
7211
+ * @param audioSamples - Float32Array of raw audio at 16kHz
7212
+ * @param _identityIndex - Ignored (identity 11 is baked into the model)
7213
+ */
7214
+ async infer(audioSamples, _identityIndex) {
7215
+ if (!this._isLoaded || !this.worker) {
7216
+ throw new Error("Model not loaded. Call load() first.");
7217
+ }
7218
+ if (this.poisoned) {
7219
+ throw new Error("Wav2ArkitCpu worker session timed out \u2014 inference unavailable until page reload");
7220
+ }
7221
+ const audioCopy = new Float32Array(audioSamples);
7222
+ return this.queueInference(audioCopy);
7223
+ }
7224
+ /**
7225
+ * Queue inference to serialize worker calls
7226
+ */
7227
+ queueInference(audioSamples) {
4326
7228
  return new Promise((resolve, reject) => {
4327
7229
  this.inferenceQueue = this.inferenceQueue.then(async () => {
4328
7230
  const telemetry = getTelemetry();
4329
- const span = telemetry?.startSpan("Wav2ArkitCpu.infer", {
4330
- "inference.backend": this._backend,
4331
- "inference.input_samples": inputSamples
7231
+ const span = telemetry?.startSpan("Wav2ArkitCpuWorker.infer", {
7232
+ "inference.backend": "wasm",
7233
+ "inference.input_samples": audioSamples.length
4332
7234
  });
4333
7235
  try {
4334
7236
  const startTime = performance.now();
4335
- const results = await this.session.run(feeds);
7237
+ const result = await this.sendMessage(
7238
+ {
7239
+ type: "infer",
7240
+ audio: audioSamples
7241
+ },
7242
+ "result",
7243
+ INFERENCE_TIMEOUT_MS2
7244
+ );
4336
7245
  const inferenceTimeMs = performance.now() - startTime;
4337
- const blendshapeOutput = results["blendshapes"];
4338
- if (!blendshapeOutput) {
4339
- throw new Error("Missing blendshapes output from model");
4340
- }
4341
- const blendshapeData = blendshapeOutput.data;
4342
- const numFrames = blendshapeOutput.dims[1];
4343
- const numBlendshapes = blendshapeOutput.dims[2];
7246
+ const flatBuffer = result.blendshapes;
7247
+ const { numFrames, numBlendshapes } = result;
4344
7248
  const blendshapes = [];
4345
7249
  for (let f = 0; f < numFrames; f++) {
4346
- const rawFrame = blendshapeData.slice(f * numBlendshapes, (f + 1) * numBlendshapes);
4347
- const symmetrized = symmetrizeBlendshapes(rawFrame);
4348
- blendshapes.push(symmetrized);
7250
+ blendshapes.push(flatBuffer.slice(f * numBlendshapes, (f + 1) * numBlendshapes));
4349
7251
  }
4350
- logger5.trace("Inference completed", {
7252
+ logger9.trace("Worker inference completed", {
4351
7253
  inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
7254
+ workerTimeMs: Math.round(result.inferenceTimeMs * 100) / 100,
4352
7255
  numFrames,
4353
- inputSamples
7256
+ inputSamples: audioSamples.length
4354
7257
  });
4355
7258
  span?.setAttributes({
4356
7259
  "inference.duration_ms": inferenceTimeMs,
7260
+ "inference.worker_duration_ms": result.inferenceTimeMs,
4357
7261
  "inference.frames": numFrames
4358
7262
  });
4359
7263
  span?.end();
4360
7264
  telemetry?.recordHistogram("omote.inference.latency", inferenceTimeMs, {
4361
- model: "wav2arkit_cpu",
4362
- backend: this._backend
7265
+ model: "wav2arkit_cpu-worker",
7266
+ backend: "wasm"
4363
7267
  });
4364
7268
  telemetry?.incrementCounter("omote.inference.total", 1, {
4365
- model: "wav2arkit_cpu",
4366
- backend: this._backend,
7269
+ model: "wav2arkit_cpu-worker",
7270
+ backend: "wasm",
4367
7271
  status: "success"
4368
7272
  });
4369
7273
  resolve({
@@ -4372,10 +7276,20 @@ var Wav2ArkitCpuInference = class {
4372
7276
  inferenceTimeMs
4373
7277
  });
4374
7278
  } catch (err) {
7279
+ const errMsg = err instanceof Error ? err.message : String(err);
7280
+ if (errMsg.includes("timed out")) {
7281
+ this.poisoned = true;
7282
+ logger9.error("CRITICAL: Worker inference timed out \u2014 Wav2ArkitCpu worker is dead. Page reload required.", {
7283
+ backend: "wasm",
7284
+ timeoutMs: INFERENCE_TIMEOUT_MS2
7285
+ });
7286
+ } else {
7287
+ logger9.error("Worker inference failed", { error: errMsg, backend: "wasm" });
7288
+ }
4375
7289
  span?.endWithError(err instanceof Error ? err : new Error(String(err)));
4376
7290
  telemetry?.incrementCounter("omote.inference.total", 1, {
4377
- model: "wav2arkit_cpu",
4378
- backend: this._backend,
7291
+ model: "wav2arkit_cpu-worker",
7292
+ backend: "wasm",
4379
7293
  status: "error"
4380
7294
  });
4381
7295
  reject(err);
@@ -4384,37 +7298,62 @@ var Wav2ArkitCpuInference = class {
4384
7298
  });
4385
7299
  }
4386
7300
  /**
4387
- * Dispose of the model and free resources
7301
+ * Dispose of the worker and free resources
4388
7302
  */
4389
7303
  async dispose() {
4390
- if (this.session) {
4391
- await this.session.release();
4392
- this.session = null;
7304
+ if (this.worker) {
7305
+ try {
7306
+ await this.sendMessage({ type: "dispose" }, "disposed", INFERENCE_TIMEOUT_MS2);
7307
+ } catch {
7308
+ }
7309
+ this.worker.terminate();
7310
+ this.worker = null;
4393
7311
  }
7312
+ this._isLoaded = false;
7313
+ this.poisoned = false;
7314
+ this.pendingResolvers.clear();
7315
+ }
7316
+ /**
7317
+ * Check if Web Workers are supported
7318
+ */
7319
+ static isSupported() {
7320
+ return typeof Worker !== "undefined";
4394
7321
  }
4395
7322
  };
4396
7323
 
4397
7324
  // src/inference/createLipSync.ts
4398
- var logger6 = createLogger("createLipSync");
7325
+ var logger10 = createLogger("createLipSync");
4399
7326
  function createLipSync(config) {
4400
7327
  const mode = config.mode ?? "auto";
4401
7328
  const fallbackOnError = config.fallbackOnError ?? true;
4402
7329
  let useCpu;
4403
7330
  if (mode === "cpu") {
4404
7331
  useCpu = true;
4405
- logger6.info("Forcing CPU lip sync model (wav2arkit_cpu)");
7332
+ logger10.info("Forcing CPU lip sync model (wav2arkit_cpu)");
4406
7333
  } else if (mode === "gpu") {
4407
7334
  useCpu = false;
4408
- logger6.info("Forcing GPU lip sync model (Wav2Vec2)");
7335
+ logger10.info("Forcing GPU lip sync model (Wav2Vec2)");
4409
7336
  } else {
4410
7337
  useCpu = shouldUseCpuLipSync();
4411
- logger6.info("Auto-detected lip sync model", {
7338
+ logger10.info("Auto-detected lip sync model", {
4412
7339
  useCpu,
4413
7340
  isSafari: isSafari()
4414
7341
  });
4415
7342
  }
4416
7343
  if (useCpu) {
4417
- logger6.info("Creating Wav2ArkitCpuInference (404MB, WASM)");
7344
+ if (config.unifiedWorker) {
7345
+ logger10.info("Creating Wav2ArkitCpuUnifiedAdapter (404MB, WASM, shared unified worker)");
7346
+ return new Wav2ArkitCpuUnifiedAdapter(config.unifiedWorker, {
7347
+ modelUrl: config.cpuModelUrl
7348
+ });
7349
+ }
7350
+ if (config.useWorker && Wav2ArkitCpuWorker.isSupported() && !isIOS()) {
7351
+ logger10.info("Creating Wav2ArkitCpuWorker (404MB, WASM, off-main-thread)");
7352
+ return new Wav2ArkitCpuWorker({
7353
+ modelUrl: config.cpuModelUrl
7354
+ });
7355
+ }
7356
+ logger10.info("Creating Wav2ArkitCpuInference (404MB, WASM)");
4418
7357
  return new Wav2ArkitCpuInference({
4419
7358
  modelUrl: config.cpuModelUrl
4420
7359
  });
@@ -4426,10 +7365,10 @@ function createLipSync(config) {
4426
7365
  numIdentityClasses: config.numIdentityClasses
4427
7366
  });
4428
7367
  if (fallbackOnError) {
4429
- logger6.info("Creating Wav2Vec2Inference with CPU fallback");
7368
+ logger10.info("Creating Wav2Vec2Inference with CPU fallback");
4430
7369
  return new LipSyncWithFallback(gpuInstance, config);
4431
7370
  }
4432
- logger6.info("Creating Wav2Vec2Inference (no fallback)");
7371
+ logger10.info("Creating Wav2Vec2Inference (no fallback)");
4433
7372
  return gpuInstance;
4434
7373
  }
4435
7374
  var LipSyncWithFallback = class {
@@ -4455,16 +7394,28 @@ var LipSyncWithFallback = class {
4455
7394
  }
4456
7395
  }
4457
7396
  async fallbackToCpu(reason) {
4458
- logger6.warn("GPU model load failed, falling back to CPU model", { reason });
7397
+ logger10.warn("GPU model load failed, falling back to CPU model", { reason });
4459
7398
  try {
4460
7399
  await this.implementation.dispose();
4461
7400
  } catch {
4462
7401
  }
4463
- this.implementation = new Wav2ArkitCpuInference({
4464
- modelUrl: this.config.cpuModelUrl
4465
- });
7402
+ if (this.config.unifiedWorker) {
7403
+ this.implementation = new Wav2ArkitCpuUnifiedAdapter(this.config.unifiedWorker, {
7404
+ modelUrl: this.config.cpuModelUrl
7405
+ });
7406
+ logger10.info("Fallback to Wav2ArkitCpuUnifiedAdapter successful");
7407
+ } else if (this.config.useWorker && Wav2ArkitCpuWorker.isSupported() && !isIOS()) {
7408
+ this.implementation = new Wav2ArkitCpuWorker({
7409
+ modelUrl: this.config.cpuModelUrl
7410
+ });
7411
+ logger10.info("Fallback to Wav2ArkitCpuWorker successful");
7412
+ } else {
7413
+ this.implementation = new Wav2ArkitCpuInference({
7414
+ modelUrl: this.config.cpuModelUrl
7415
+ });
7416
+ logger10.info("Fallback to Wav2ArkitCpuInference successful");
7417
+ }
4466
7418
  this.hasFallenBack = true;
4467
- logger6.info("Fallback to Wav2ArkitCpuInference successful");
4468
7419
  return await this.implementation.load();
4469
7420
  }
4470
7421
  async infer(audioSamples, identityIndex) {
@@ -4476,7 +7427,7 @@ var LipSyncWithFallback = class {
4476
7427
  };
4477
7428
 
4478
7429
  // src/inference/SileroVADInference.ts
4479
- var logger7 = createLogger("SileroVAD");
7430
+ var logger11 = createLogger("SileroVAD");
4480
7431
  var SileroVADInference = class {
4481
7432
  constructor(config) {
4482
7433
  this.session = null;
@@ -4550,23 +7501,23 @@ var SileroVADInference = class {
4550
7501
  "model.sample_rate": this.config.sampleRate
4551
7502
  });
4552
7503
  try {
4553
- logger7.info("Loading ONNX Runtime...", { preference: this.config.backend });
7504
+ logger11.info("Loading ONNX Runtime...", { preference: this.config.backend });
4554
7505
  const { ort, backend } = await getOnnxRuntimeForPreference(this.config.backend);
4555
7506
  this.ort = ort;
4556
7507
  this._backend = backend;
4557
- logger7.info("ONNX Runtime loaded", { backend: this._backend });
7508
+ logger11.info("ONNX Runtime loaded", { backend: this._backend });
4558
7509
  const cache = getModelCache();
4559
7510
  const modelUrl = this.config.modelUrl;
4560
7511
  const isCached = await cache.has(modelUrl);
4561
7512
  let modelBuffer;
4562
7513
  if (isCached) {
4563
- logger7.debug("Loading model from cache", { modelUrl });
7514
+ logger11.debug("Loading model from cache", { modelUrl });
4564
7515
  modelBuffer = await cache.get(modelUrl);
4565
7516
  } else {
4566
- logger7.debug("Fetching and caching model", { modelUrl });
7517
+ logger11.debug("Fetching and caching model", { modelUrl });
4567
7518
  modelBuffer = await fetchWithCache(modelUrl);
4568
7519
  }
4569
- logger7.debug("Creating ONNX session", {
7520
+ logger11.debug("Creating ONNX session", {
4570
7521
  size: formatBytes(modelBuffer.byteLength),
4571
7522
  backend: this._backend
4572
7523
  });
@@ -4575,7 +7526,7 @@ var SileroVADInference = class {
4575
7526
  this.session = await ort.InferenceSession.create(modelData, sessionOptions);
4576
7527
  this.reset();
4577
7528
  const loadTimeMs = performance.now() - startTime;
4578
- logger7.info("Model loaded successfully", {
7529
+ logger11.info("Model loaded successfully", {
4579
7530
  backend: this._backend,
4580
7531
  loadTimeMs: Math.round(loadTimeMs),
4581
7532
  sampleRate: this.config.sampleRate,
@@ -4630,7 +7581,7 @@ var SileroVADInference = class {
4630
7581
  []
4631
7582
  );
4632
7583
  } catch (e) {
4633
- logger7.warn("BigInt64Array not available, using bigint array fallback", {
7584
+ logger11.warn("BigInt64Array not available, using bigint array fallback", {
4634
7585
  error: e instanceof Error ? e.message : String(e)
4635
7586
  });
4636
7587
  this.srTensor = new this.ort.Tensor(
@@ -4722,23 +7673,13 @@ var SileroVADInference = class {
4722
7673
  }
4723
7674
  return segments;
4724
7675
  }
4725
- /**
4726
- * Calculate RMS energy of audio chunk
4727
- */
4728
- calculateRMS(samples) {
4729
- let sum = 0;
4730
- for (let i = 0; i < samples.length; i++) {
4731
- sum += samples[i] * samples[i];
4732
- }
4733
- return Math.sqrt(sum / samples.length);
4734
- }
4735
7676
  /**
4736
7677
  * Queue inference to serialize ONNX session calls
4737
7678
  */
4738
7679
  queueInference(audioChunk) {
4739
7680
  const audioChunkCopy = new Float32Array(audioChunk);
4740
7681
  const MIN_ENERGY_THRESHOLD = 1e-3;
4741
- const rms = this.calculateRMS(audioChunkCopy);
7682
+ const rms = calculateRMS(audioChunkCopy);
4742
7683
  if (rms < MIN_ENERGY_THRESHOLD) {
4743
7684
  if (!this.wasSpeaking) {
4744
7685
  this.preSpeechBuffer.push(new Float32Array(audioChunkCopy));
@@ -4746,7 +7687,7 @@ var SileroVADInference = class {
4746
7687
  this.preSpeechBuffer.shift();
4747
7688
  }
4748
7689
  }
4749
- logger7.trace("Skipping VAD inference - audio too quiet", {
7690
+ logger11.trace("Skipping VAD inference - audio too quiet", {
4750
7691
  rms: Math.round(rms * 1e4) / 1e4,
4751
7692
  threshold: MIN_ENERGY_THRESHOLD
4752
7693
  });
@@ -4793,19 +7734,19 @@ var SileroVADInference = class {
4793
7734
  [2, 1, 128]
4794
7735
  );
4795
7736
  }
4796
- this.context = audioChunk.slice(-this.contextSize);
7737
+ this.context = audioChunkCopy.slice(-this.contextSize);
4797
7738
  const inferenceTimeMs = performance.now() - startTime;
4798
7739
  const isSpeech = probability > this.config.threshold;
4799
7740
  let preSpeechChunks;
4800
7741
  if (isSpeech && !this.wasSpeaking) {
4801
7742
  preSpeechChunks = [...this.preSpeechBuffer];
4802
7743
  this.preSpeechBuffer = [];
4803
- logger7.debug("Speech started with pre-speech buffer", {
7744
+ logger11.debug("Speech started with pre-speech buffer", {
4804
7745
  preSpeechChunks: preSpeechChunks.length,
4805
7746
  durationMs: Math.round(preSpeechChunks.length * this.getChunkDurationMs())
4806
7747
  });
4807
7748
  } else if (!isSpeech && !this.wasSpeaking) {
4808
- this.preSpeechBuffer.push(new Float32Array(audioChunk));
7749
+ this.preSpeechBuffer.push(new Float32Array(audioChunkCopy));
4809
7750
  if (this.preSpeechBuffer.length > this.config.preSpeechBufferChunks) {
4810
7751
  this.preSpeechBuffer.shift();
4811
7752
  }
@@ -4813,7 +7754,7 @@ var SileroVADInference = class {
4813
7754
  this.preSpeechBuffer = [];
4814
7755
  }
4815
7756
  this.wasSpeaking = isSpeech;
4816
- logger7.trace("VAD inference completed", {
7757
+ logger11.trace("VAD inference completed", {
4817
7758
  probability: Math.round(probability * 1e3) / 1e3,
4818
7759
  isSpeech,
4819
7760
  inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100
@@ -4840,13 +7781,30 @@ var SileroVADInference = class {
4840
7781
  preSpeechChunks
4841
7782
  });
4842
7783
  } catch (err) {
4843
- span?.endWithError(err instanceof Error ? err : new Error(String(err)));
4844
- telemetry?.incrementCounter("omote.inference.total", 1, {
4845
- model: "silero-vad",
4846
- backend: this._backend,
4847
- status: "error"
4848
- });
4849
- reject(err);
7784
+ if (typeof err === "number") {
7785
+ const oomError = new Error(
7786
+ `SileroVAD inference failed with raw C++ exception pointer (0x${err.toString(16)}). This is likely an OOM crash in WASM. Try reducing concurrent model sessions or reloading the page.`
7787
+ );
7788
+ logger11.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
7789
+ pointer: `0x${err.toString(16)}`,
7790
+ backend: this._backend
7791
+ });
7792
+ span?.endWithError(oomError);
7793
+ telemetry?.incrementCounter("omote.inference.total", 1, {
7794
+ model: "silero-vad",
7795
+ backend: this._backend,
7796
+ status: "error"
7797
+ });
7798
+ reject(oomError);
7799
+ } else {
7800
+ span?.endWithError(err instanceof Error ? err : new Error(String(err)));
7801
+ telemetry?.incrementCounter("omote.inference.total", 1, {
7802
+ model: "silero-vad",
7803
+ backend: this._backend,
7804
+ status: "error"
7805
+ });
7806
+ reject(err);
7807
+ }
4850
7808
  }
4851
7809
  });
4852
7810
  });
@@ -4870,19 +7828,27 @@ var SileroVADInference = class {
4870
7828
  SileroVADInference.isWebGPUAvailable = isWebGPUAvailable;
4871
7829
 
4872
7830
  // src/inference/SileroVADWorker.ts
4873
- var logger8 = createLogger("SileroVADWorker");
4874
- var WASM_CDN_PATH2 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
4875
- var LOAD_TIMEOUT_MS = 1e4;
4876
- var INFERENCE_TIMEOUT_MS = 1e3;
4877
- var WORKER_SCRIPT = `
7831
+ var logger12 = createLogger("SileroVADWorker");
7832
+ var WASM_CDN_PATH5 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
7833
+ var LOAD_TIMEOUT_MS3 = 1e4;
7834
+ var INFERENCE_TIMEOUT_MS3 = 1e3;
7835
+ function resolveUrl4(url) {
7836
+ if (/^https?:\/\//i.test(url) || /^blob:/i.test(url)) return url;
7837
+ try {
7838
+ return new URL(url, globalThis.location?.origin ?? "https://localhost").href;
7839
+ } catch {
7840
+ return url;
7841
+ }
7842
+ }
7843
+ var WORKER_SCRIPT4 = `
4878
7844
  // Silero VAD Worker Script
4879
7845
  // Loaded via Blob URL - no separate file needed
4880
7846
 
4881
- let ort = null;
4882
- let session = null;
4883
- let sampleRate = 16000;
4884
- let chunkSize = 512;
4885
- let contextSize = 64;
7847
+ var ort = null;
7848
+ var session = null;
7849
+ var sampleRate = 16000;
7850
+ var chunkSize = 512;
7851
+ var contextSize = 64;
4886
7852
 
4887
7853
  /**
4888
7854
  * Load ONNX Runtime from CDN
@@ -5132,7 +8098,7 @@ var SileroVADWorker = class {
5132
8098
  * Create the worker from inline script
5133
8099
  */
5134
8100
  createWorker() {
5135
- const blob = new Blob([WORKER_SCRIPT], { type: "application/javascript" });
8101
+ const blob = new Blob([WORKER_SCRIPT4], { type: "application/javascript" });
5136
8102
  const blobUrl = URL.createObjectURL(blob);
5137
8103
  const worker = new Worker(blobUrl);
5138
8104
  URL.revokeObjectURL(blobUrl);
@@ -5140,7 +8106,7 @@ var SileroVADWorker = class {
5140
8106
  this.handleWorkerMessage(event.data);
5141
8107
  };
5142
8108
  worker.onerror = (error) => {
5143
- logger8.error("Worker error", { error: error.message });
8109
+ logger12.error("Worker error", { error: error.message });
5144
8110
  for (const [, resolver] of this.pendingResolvers) {
5145
8111
  resolver.reject(new Error(`Worker error: ${error.message}`));
5146
8112
  }
@@ -5216,25 +8182,25 @@ var SileroVADWorker = class {
5216
8182
  "model.sample_rate": this.config.sampleRate
5217
8183
  });
5218
8184
  try {
5219
- logger8.info("Creating VAD worker...");
8185
+ logger12.info("Creating VAD worker...");
5220
8186
  this.worker = this.createWorker();
5221
- logger8.info("Loading model in worker...", {
8187
+ logger12.info("Loading model in worker...", {
5222
8188
  modelUrl: this.config.modelUrl,
5223
8189
  sampleRate: this.config.sampleRate
5224
8190
  });
5225
8191
  const result = await this.sendMessage(
5226
8192
  {
5227
8193
  type: "load",
5228
- modelUrl: this.config.modelUrl,
8194
+ modelUrl: resolveUrl4(this.config.modelUrl),
5229
8195
  sampleRate: this.config.sampleRate,
5230
- wasmPaths: WASM_CDN_PATH2
8196
+ wasmPaths: WASM_CDN_PATH5
5231
8197
  },
5232
8198
  "loaded",
5233
- LOAD_TIMEOUT_MS
8199
+ LOAD_TIMEOUT_MS3
5234
8200
  );
5235
8201
  this._isLoaded = true;
5236
8202
  const loadTimeMs = performance.now() - startTime;
5237
- logger8.info("VAD worker loaded successfully", {
8203
+ logger12.info("VAD worker loaded successfully", {
5238
8204
  backend: "wasm",
5239
8205
  loadTimeMs: Math.round(loadTimeMs),
5240
8206
  workerLoadTimeMs: Math.round(result.loadTimeMs),
@@ -5285,7 +8251,7 @@ var SileroVADWorker = class {
5285
8251
  const result = await this.sendMessage(
5286
8252
  { type: "reset" },
5287
8253
  "reset",
5288
- INFERENCE_TIMEOUT_MS
8254
+ INFERENCE_TIMEOUT_MS3
5289
8255
  );
5290
8256
  this.state = result.state;
5291
8257
  this.context = new Float32Array(this.contextSize);
@@ -5331,7 +8297,7 @@ var SileroVADWorker = class {
5331
8297
  context: this.context
5332
8298
  },
5333
8299
  "result",
5334
- INFERENCE_TIMEOUT_MS
8300
+ INFERENCE_TIMEOUT_MS3
5335
8301
  );
5336
8302
  this.state = result.state;
5337
8303
  this.context = audioChunkCopy.slice(-this.contextSize);
@@ -5341,7 +8307,7 @@ var SileroVADWorker = class {
5341
8307
  if (isSpeech && !this.wasSpeaking) {
5342
8308
  preSpeechChunks = [...this.preSpeechBuffer];
5343
8309
  this.preSpeechBuffer = [];
5344
- logger8.debug("Speech started with pre-speech buffer", {
8310
+ logger12.debug("Speech started with pre-speech buffer", {
5345
8311
  preSpeechChunks: preSpeechChunks.length,
5346
8312
  durationMs: Math.round(preSpeechChunks.length * this.getChunkDurationMs())
5347
8313
  });
@@ -5354,7 +8320,7 @@ var SileroVADWorker = class {
5354
8320
  this.preSpeechBuffer = [];
5355
8321
  }
5356
8322
  this.wasSpeaking = isSpeech;
5357
- logger8.trace("VAD worker inference completed", {
8323
+ logger12.trace("VAD worker inference completed", {
5358
8324
  probability: Math.round(result.probability * 1e3) / 1e3,
5359
8325
  isSpeech,
5360
8326
  inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
@@ -5400,7 +8366,7 @@ var SileroVADWorker = class {
5400
8366
  async dispose() {
5401
8367
  if (this.worker) {
5402
8368
  try {
5403
- await this.sendMessage({ type: "dispose" }, "disposed", INFERENCE_TIMEOUT_MS);
8369
+ await this.sendMessage({ type: "dispose" }, "disposed", INFERENCE_TIMEOUT_MS3);
5404
8370
  } catch {
5405
8371
  }
5406
8372
  this.worker.terminate();
@@ -5422,40 +8388,44 @@ var SileroVADWorker = class {
5422
8388
  };
5423
8389
 
5424
8390
  // src/inference/createSileroVAD.ts
5425
- var logger9 = createLogger("createSileroVAD");
8391
+ var logger13 = createLogger("createSileroVAD");
5426
8392
  function supportsVADWorker() {
5427
8393
  if (typeof Worker === "undefined") {
5428
- logger9.debug("Worker not supported: Worker constructor undefined");
8394
+ logger13.debug("Worker not supported: Worker constructor undefined");
5429
8395
  return false;
5430
8396
  }
5431
8397
  if (typeof URL === "undefined" || typeof URL.createObjectURL === "undefined") {
5432
- logger9.debug("Worker not supported: URL.createObjectURL unavailable");
8398
+ logger13.debug("Worker not supported: URL.createObjectURL unavailable");
5433
8399
  return false;
5434
8400
  }
5435
8401
  if (typeof Blob === "undefined") {
5436
- logger9.debug("Worker not supported: Blob constructor unavailable");
8402
+ logger13.debug("Worker not supported: Blob constructor unavailable");
5437
8403
  return false;
5438
8404
  }
5439
8405
  return true;
5440
8406
  }
5441
8407
  function createSileroVAD(config) {
8408
+ if (config.unifiedWorker) {
8409
+ logger13.info("Creating SileroVADUnifiedAdapter (shared unified worker)");
8410
+ return new SileroVADUnifiedAdapter(config.unifiedWorker, config);
8411
+ }
5442
8412
  const fallbackOnError = config.fallbackOnError ?? true;
5443
8413
  let useWorker;
5444
8414
  if (config.useWorker !== void 0) {
5445
8415
  useWorker = config.useWorker;
5446
- logger9.debug("Worker preference explicitly set", { useWorker });
8416
+ logger13.debug("Worker preference explicitly set", { useWorker });
5447
8417
  } else {
5448
8418
  const workerSupported = supportsVADWorker();
5449
8419
  const onMobile = isMobile();
5450
8420
  useWorker = workerSupported && !onMobile;
5451
- logger9.debug("Auto-detected Worker preference", {
8421
+ logger13.debug("Auto-detected Worker preference", {
5452
8422
  useWorker,
5453
8423
  workerSupported,
5454
8424
  onMobile
5455
8425
  });
5456
8426
  }
5457
8427
  if (useWorker) {
5458
- logger9.info("Creating SileroVADWorker (off-main-thread)");
8428
+ logger13.info("Creating SileroVADWorker (off-main-thread)");
5459
8429
  const worker = new SileroVADWorker({
5460
8430
  modelUrl: config.modelUrl,
5461
8431
  sampleRate: config.sampleRate,
@@ -5467,7 +8437,7 @@ function createSileroVAD(config) {
5467
8437
  }
5468
8438
  return worker;
5469
8439
  }
5470
- logger9.info("Creating SileroVADInference (main thread)");
8440
+ logger13.info("Creating SileroVADInference (main thread)");
5471
8441
  return new SileroVADInference(config);
5472
8442
  }
5473
8443
  var VADWorkerWithFallback = class {
@@ -5493,7 +8463,7 @@ var VADWorkerWithFallback = class {
5493
8463
  try {
5494
8464
  return await this.implementation.load();
5495
8465
  } catch (error) {
5496
- logger9.warn("Worker load failed, falling back to main thread", {
8466
+ logger13.warn("Worker load failed, falling back to main thread", {
5497
8467
  error: error instanceof Error ? error.message : String(error)
5498
8468
  });
5499
8469
  try {
@@ -5502,7 +8472,7 @@ var VADWorkerWithFallback = class {
5502
8472
  }
5503
8473
  this.implementation = new SileroVADInference(this.config);
5504
8474
  this.hasFallenBack = true;
5505
- logger9.info("Fallback to SileroVADInference successful");
8475
+ logger13.info("Fallback to SileroVADInference successful");
5506
8476
  return await this.implementation.load();
5507
8477
  }
5508
8478
  }
@@ -5524,7 +8494,7 @@ var VADWorkerWithFallback = class {
5524
8494
  };
5525
8495
 
5526
8496
  // src/inference/SafariSpeechRecognition.ts
5527
- var logger10 = createLogger("SafariSpeech");
8497
+ var logger14 = createLogger("SafariSpeech");
5528
8498
  var SafariSpeechRecognition = class _SafariSpeechRecognition {
5529
8499
  constructor(config = {}) {
5530
8500
  this.recognition = null;
@@ -5543,7 +8513,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
5543
8513
  interimResults: config.interimResults ?? true,
5544
8514
  maxAlternatives: config.maxAlternatives ?? 1
5545
8515
  };
5546
- logger10.debug("SafariSpeechRecognition created", {
8516
+ logger14.debug("SafariSpeechRecognition created", {
5547
8517
  language: this.config.language,
5548
8518
  continuous: this.config.continuous
5549
8519
  });
@@ -5604,7 +8574,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
5604
8574
  */
5605
8575
  async start() {
5606
8576
  if (this.isListening) {
5607
- logger10.warn("Already listening");
8577
+ logger14.warn("Already listening");
5608
8578
  return;
5609
8579
  }
5610
8580
  if (!_SafariSpeechRecognition.isAvailable()) {
@@ -5634,7 +8604,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
5634
8604
  this.isListening = true;
5635
8605
  this.startTime = performance.now();
5636
8606
  this.accumulatedText = "";
5637
- logger10.info("Speech recognition started", {
8607
+ logger14.info("Speech recognition started", {
5638
8608
  language: this.config.language
5639
8609
  });
5640
8610
  span?.end();
@@ -5649,7 +8619,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
5649
8619
  */
5650
8620
  async stop() {
5651
8621
  if (!this.isListening || !this.recognition) {
5652
- logger10.warn("Not currently listening");
8622
+ logger14.warn("Not currently listening");
5653
8623
  return {
5654
8624
  text: this.accumulatedText,
5655
8625
  language: this.config.language,
@@ -5678,7 +8648,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
5678
8648
  if (this.recognition && this.isListening) {
5679
8649
  this.recognition.abort();
5680
8650
  this.isListening = false;
5681
- logger10.info("Speech recognition aborted");
8651
+ logger14.info("Speech recognition aborted");
5682
8652
  }
5683
8653
  }
5684
8654
  /**
@@ -5709,7 +8679,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
5709
8679
  this.isListening = false;
5710
8680
  this.resultCallbacks = [];
5711
8681
  this.errorCallbacks = [];
5712
- logger10.debug("SafariSpeechRecognition disposed");
8682
+ logger14.debug("SafariSpeechRecognition disposed");
5713
8683
  }
5714
8684
  /**
5715
8685
  * Set up event handlers for the recognition instance
@@ -5737,7 +8707,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
5737
8707
  confidence: alternative.confidence
5738
8708
  };
5739
8709
  this.emitResult(speechResult);
5740
- logger10.trace("Speech result", {
8710
+ logger14.trace("Speech result", {
5741
8711
  text: text.substring(0, 50),
5742
8712
  isFinal,
5743
8713
  confidence: alternative.confidence
@@ -5747,12 +8717,12 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
5747
8717
  span?.end();
5748
8718
  } catch (error) {
5749
8719
  span?.endWithError(error instanceof Error ? error : new Error(String(error)));
5750
- logger10.error("Error processing speech result", { error });
8720
+ logger14.error("Error processing speech result", { error });
5751
8721
  }
5752
8722
  };
5753
8723
  this.recognition.onerror = (event) => {
5754
8724
  const error = new Error(`Speech recognition error: ${event.error} - ${event.message}`);
5755
- logger10.error("Speech recognition error", { error: event.error, message: event.message });
8725
+ logger14.error("Speech recognition error", { error: event.error, message: event.message });
5756
8726
  this.emitError(error);
5757
8727
  if (this.stopRejecter) {
5758
8728
  this.stopRejecter(error);
@@ -5762,7 +8732,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
5762
8732
  };
5763
8733
  this.recognition.onend = () => {
5764
8734
  this.isListening = false;
5765
- logger10.info("Speech recognition ended", {
8735
+ logger14.info("Speech recognition ended", {
5766
8736
  totalText: this.accumulatedText.length,
5767
8737
  durationMs: performance.now() - this.startTime
5768
8738
  });
@@ -5779,13 +8749,13 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
5779
8749
  }
5780
8750
  };
5781
8751
  this.recognition.onstart = () => {
5782
- logger10.debug("Speech recognition started by browser");
8752
+ logger14.debug("Speech recognition started by browser");
5783
8753
  };
5784
8754
  this.recognition.onspeechstart = () => {
5785
- logger10.debug("Speech detected");
8755
+ logger14.debug("Speech detected");
5786
8756
  };
5787
8757
  this.recognition.onspeechend = () => {
5788
- logger10.debug("Speech ended");
8758
+ logger14.debug("Speech ended");
5789
8759
  };
5790
8760
  }
5791
8761
  /**
@@ -5796,7 +8766,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
5796
8766
  try {
5797
8767
  callback(result);
5798
8768
  } catch (error) {
5799
- logger10.error("Error in result callback", { error });
8769
+ logger14.error("Error in result callback", { error });
5800
8770
  }
5801
8771
  }
5802
8772
  }
@@ -5808,7 +8778,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
5808
8778
  try {
5809
8779
  callback(error);
5810
8780
  } catch (callbackError) {
5811
- logger10.error("Error in error callback", { error: callbackError });
8781
+ logger14.error("Error in error callback", { error: callbackError });
5812
8782
  }
5813
8783
  }
5814
8784
  }
@@ -6073,7 +9043,7 @@ var AgentCoreAdapter = class extends EventEmitter {
6073
9043
  console.error("[AgentCore] VAD error during interruption detection:", error);
6074
9044
  });
6075
9045
  }
6076
- const float32 = audio instanceof Float32Array ? audio : this.int16ToFloat32(audio);
9046
+ const float32 = audio instanceof Float32Array ? audio : int16ToFloat32(audio);
6077
9047
  this.audioBuffer.push(float32);
6078
9048
  this.scheduleTranscription();
6079
9049
  }
@@ -6405,7 +9375,7 @@ var AgentCoreAdapter = class extends EventEmitter {
6405
9375
  * Falls back to simple RMS if VAD not available
6406
9376
  */
6407
9377
  async detectVoiceActivity(audio) {
6408
- const float32 = audio instanceof Float32Array ? audio : this.int16ToFloat32(audio);
9378
+ const float32 = audio instanceof Float32Array ? audio : int16ToFloat32(audio);
6409
9379
  if (this.vad) {
6410
9380
  const chunkSize = this.vad.getChunkSize();
6411
9381
  for (let i = 0; i + chunkSize <= float32.length; i += chunkSize) {
@@ -6424,13 +9394,6 @@ var AgentCoreAdapter = class extends EventEmitter {
6424
9394
  const rms = Math.sqrt(sum / float32.length);
6425
9395
  return rms > 0.02;
6426
9396
  }
6427
- int16ToFloat32(int16) {
6428
- const float32 = new Float32Array(int16.length);
6429
- for (let i = 0; i < int16.length; i++) {
6430
- float32[i] = int16[i] / 32768;
6431
- }
6432
- return float32;
6433
- }
6434
9397
  base64ToArrayBuffer(base64) {
6435
9398
  const binaryString = atob(base64);
6436
9399
  const bytes = new Uint8Array(binaryString.length);
@@ -8277,13 +11240,19 @@ export {
8277
11240
  RingBuffer,
8278
11241
  SafariSpeechRecognition,
8279
11242
  SenseVoiceInference,
11243
+ SenseVoiceUnifiedAdapter,
11244
+ SenseVoiceWorker,
8280
11245
  SileroVADInference,
11246
+ SileroVADUnifiedAdapter,
8281
11247
  SileroVADWorker,
8282
11248
  SyncedAudioPipeline,
8283
11249
  TenantManager,
8284
11250
  UPPER_FACE_BLENDSHAPES,
11251
+ UnifiedInferenceWorker,
8285
11252
  WAV2ARKIT_BLENDSHAPES,
8286
11253
  Wav2ArkitCpuInference,
11254
+ Wav2ArkitCpuUnifiedAdapter,
11255
+ Wav2ArkitCpuWorker,
8287
11256
  Wav2Vec2Inference,
8288
11257
  applyCMVN,
8289
11258
  applyLFR,
@@ -8297,6 +11266,7 @@ export {
8297
11266
  createEmotionVector,
8298
11267
  createLipSync,
8299
11268
  createLogger,
11269
+ createSenseVoice,
8300
11270
  createSessionWithFallback,
8301
11271
  createSileroVAD,
8302
11272
  ctcGreedyDecode,