@omote/core 0.4.5 → 0.4.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -67,13 +67,19 @@ __export(index_exports, {
67
67
  RingBuffer: () => RingBuffer,
68
68
  SafariSpeechRecognition: () => SafariSpeechRecognition,
69
69
  SenseVoiceInference: () => SenseVoiceInference,
70
+ SenseVoiceUnifiedAdapter: () => SenseVoiceUnifiedAdapter,
71
+ SenseVoiceWorker: () => SenseVoiceWorker,
70
72
  SileroVADInference: () => SileroVADInference,
73
+ SileroVADUnifiedAdapter: () => SileroVADUnifiedAdapter,
71
74
  SileroVADWorker: () => SileroVADWorker,
72
75
  SyncedAudioPipeline: () => SyncedAudioPipeline,
73
76
  TenantManager: () => TenantManager,
74
77
  UPPER_FACE_BLENDSHAPES: () => UPPER_FACE_BLENDSHAPES,
78
+ UnifiedInferenceWorker: () => UnifiedInferenceWorker,
75
79
  WAV2ARKIT_BLENDSHAPES: () => WAV2ARKIT_BLENDSHAPES,
76
80
  Wav2ArkitCpuInference: () => Wav2ArkitCpuInference,
81
+ Wav2ArkitCpuUnifiedAdapter: () => Wav2ArkitCpuUnifiedAdapter,
82
+ Wav2ArkitCpuWorker: () => Wav2ArkitCpuWorker,
77
83
  Wav2Vec2Inference: () => Wav2Vec2Inference,
78
84
  applyCMVN: () => applyCMVN,
79
85
  applyLFR: () => applyLFR,
@@ -87,6 +93,7 @@ __export(index_exports, {
87
93
  createEmotionVector: () => createEmotionVector,
88
94
  createLipSync: () => createLipSync,
89
95
  createLogger: () => createLogger,
96
+ createSenseVoice: () => createSenseVoice,
90
97
  createSessionWithFallback: () => createSessionWithFallback,
91
98
  createSileroVAD: () => createSileroVAD,
92
99
  ctcGreedyDecode: () => ctcGreedyDecode,
@@ -2789,12 +2796,12 @@ var Logger = class _Logger {
2789
2796
  };
2790
2797
  var loggerCache = /* @__PURE__ */ new Map();
2791
2798
  function createLogger(module2) {
2792
- let logger11 = loggerCache.get(module2);
2793
- if (!logger11) {
2794
- logger11 = new Logger(module2);
2795
- loggerCache.set(module2, logger11);
2799
+ let logger15 = loggerCache.get(module2);
2800
+ if (!logger15) {
2801
+ logger15 = new Logger(module2);
2802
+ loggerCache.set(module2, logger15);
2796
2803
  }
2797
- return logger11;
2804
+ return logger15;
2798
2805
  }
2799
2806
  var noopLogger = {
2800
2807
  module: "noop",
@@ -2822,7 +2829,7 @@ function isIOSSafari() {
2822
2829
  function isIOS() {
2823
2830
  if (typeof navigator === "undefined") return false;
2824
2831
  const ua = navigator.userAgent.toLowerCase();
2825
- return /iphone|ipad|ipod/.test(ua);
2832
+ return /iphone|ipad|ipod/.test(ua) || /macintosh/.test(ua) && navigator.maxTouchPoints > 1;
2826
2833
  }
2827
2834
  function isAndroid() {
2828
2835
  if (typeof navigator === "undefined") return false;
@@ -3443,10 +3450,16 @@ var _Wav2Vec2Inference = class _Wav2Vec2Inference {
3443
3450
  });
3444
3451
  logger2.debug("Running warmup inference to initialize GPU context");
3445
3452
  const warmupStart = performance.now();
3446
- const silentAudio = new Float32Array(16e3);
3453
+ const warmupAudio = new Float32Array(16e3);
3454
+ const warmupIdentity = new Float32Array(this.numIdentityClasses);
3455
+ warmupIdentity[0] = 1;
3456
+ const warmupFeeds = {
3457
+ "audio": new this.ort.Tensor("float32", warmupAudio, [1, 16e3]),
3458
+ "identity": new this.ort.Tensor("float32", warmupIdentity, [1, this.numIdentityClasses])
3459
+ };
3447
3460
  const WARMUP_TIMEOUT_MS = 15e3;
3448
3461
  const warmupResult = await Promise.race([
3449
- this.infer(silentAudio, 0).then(() => "ok"),
3462
+ this.session.run(warmupFeeds).then(() => "ok"),
3450
3463
  new Promise((r) => setTimeout(() => r("timeout"), WARMUP_TIMEOUT_MS))
3451
3464
  ]);
3452
3465
  const warmupTimeMs = performance.now() - warmupStart;
@@ -4610,215 +4623,2495 @@ var _SenseVoiceInference = class _SenseVoiceInference {
4610
4623
  _SenseVoiceInference.INFERENCE_TIMEOUT_MS = 1e4;
4611
4624
  var SenseVoiceInference = _SenseVoiceInference;
4612
4625
 
4613
- // src/inference/Wav2ArkitCpuInference.ts
4614
- var logger5 = createLogger("Wav2ArkitCpu");
4615
- var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
4616
- constructor(config) {
4617
- this.modelId = "wav2arkit_cpu";
4618
- this.session = null;
4619
- this.ort = null;
4620
- this._backend = "wasm";
4621
- this.isLoading = false;
4622
- // Inference queue for handling concurrent calls
4623
- this.inferenceQueue = Promise.resolve();
4624
- // Session health: set to true if session.run() times out.
4625
- // A timed-out session may have a zombie WASM dispatch still running,
4626
- // so all future infer() calls reject immediately to prevent concurrent access.
4627
- this.poisoned = false;
4628
- this.config = config;
4629
- }
4630
- get backend() {
4631
- return this.session ? this._backend : null;
4632
- }
4633
- get isLoaded() {
4634
- return this.session !== null;
4626
+ // src/inference/SenseVoiceWorker.ts
4627
+ var logger5 = createLogger("SenseVoiceWorker");
4628
+ var WASM_CDN_PATH2 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
4629
+ var LOAD_TIMEOUT_MS = 3e4;
4630
+ var INFERENCE_TIMEOUT_MS = 1e4;
4631
+ function resolveUrl(url) {
4632
+ if (/^https?:\/\//i.test(url) || /^blob:/i.test(url)) return url;
4633
+ try {
4634
+ return new URL(url, globalThis.location?.origin ?? "https://localhost").href;
4635
+ } catch {
4636
+ return url;
4635
4637
  }
4636
- /**
4637
- * Load the ONNX model
4638
- */
4639
- async load() {
4640
- if (this.isLoading) {
4641
- throw new Error("Model is already loading");
4638
+ }
4639
+ var WORKER_SCRIPT = `
4640
+ // SenseVoice ASR Worker Script
4641
+ // Loaded via Blob URL - no separate file needed
4642
+
4643
+ var ort = null;
4644
+ var session = null;
4645
+ var tokenMap = null;
4646
+ var negMean = null;
4647
+ var invStddev = null;
4648
+ var languageId = 0;
4649
+ var textNormId = 14;
4650
+ var vocabSize = 0;
4651
+
4652
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
4653
+ // kaldiFbank.ts \u2014 inlined as plain JavaScript
4654
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
4655
+
4656
+ /**
4657
+ * In-place Radix-2 Cooley-Tukey FFT
4658
+ */
4659
+ function fft(re, im) {
4660
+ var n = re.length;
4661
+
4662
+ // Bit-reversal permutation
4663
+ for (var i = 1, j = 0; i < n; i++) {
4664
+ var bit = n >> 1;
4665
+ while (j & bit) {
4666
+ j ^= bit;
4667
+ bit >>= 1;
4642
4668
  }
4643
- if (this.session) {
4644
- throw new Error("Model already loaded. Call dispose() first.");
4669
+ j ^= bit;
4670
+ if (i < j) {
4671
+ var tmp = re[i]; re[i] = re[j]; re[j] = tmp;
4672
+ tmp = im[i]; im[i] = im[j]; im[j] = tmp;
4645
4673
  }
4646
- this.isLoading = true;
4647
- const startTime = performance.now();
4648
- const telemetry = getTelemetry();
4649
- const span = telemetry?.startSpan("Wav2ArkitCpu.load", {
4650
- "model.url": this.config.modelUrl,
4651
- "model.backend_requested": this.config.backend || "wasm"
4652
- });
4653
- try {
4654
- const preference = this.config.backend || "wasm";
4655
- logger5.info("Loading ONNX Runtime...", { preference });
4656
- const { ort, backend } = await getOnnxRuntimeForPreference(preference);
4657
- this.ort = ort;
4658
- this._backend = backend;
4659
- logger5.info("ONNX Runtime loaded", { backend: this._backend });
4660
- const modelUrl = this.config.modelUrl;
4661
- const dataUrl = this.config.externalDataUrl !== false ? this.config.externalDataUrl || `${modelUrl}.data` : null;
4662
- const sessionOptions = getSessionOptions(this._backend);
4663
- if (isIOS()) {
4664
- logger5.info("iOS: passing model URLs directly to ORT (low-memory path)", {
4665
- modelUrl,
4666
- dataUrl
4667
- });
4668
- if (dataUrl) {
4669
- const dataFilename = dataUrl.split("/").pop();
4670
- sessionOptions.externalData = [{
4671
- path: dataFilename,
4672
- data: dataUrl
4673
- // URL string — ORT fetches directly into WASM
4674
- }];
4675
- }
4676
- this.session = await this.ort.InferenceSession.create(modelUrl, sessionOptions);
4677
- } else {
4678
- const cache = getModelCache();
4679
- const isCached = await cache.has(modelUrl);
4680
- let modelBuffer;
4681
- if (isCached) {
4682
- logger5.debug("Loading model from cache", { modelUrl });
4683
- modelBuffer = await cache.get(modelUrl);
4684
- if (!modelBuffer) {
4685
- logger5.warn("Cache corruption detected, clearing and retrying", { modelUrl });
4686
- await cache.delete(modelUrl);
4687
- modelBuffer = await fetchWithCache(modelUrl);
4688
- }
4689
- } else {
4690
- logger5.debug("Fetching and caching model graph", { modelUrl });
4691
- modelBuffer = await fetchWithCache(modelUrl);
4692
- }
4693
- if (!modelBuffer) {
4694
- throw new Error(`Failed to load model: ${modelUrl}`);
4695
- }
4696
- let externalDataBuffer = null;
4697
- if (dataUrl) {
4698
- try {
4699
- const isDataCached = await cache.has(dataUrl);
4700
- if (isDataCached) {
4701
- logger5.debug("Loading external data from cache", { dataUrl });
4702
- externalDataBuffer = await cache.get(dataUrl);
4703
- if (!externalDataBuffer) {
4704
- logger5.warn("Cache corruption for external data, retrying", { dataUrl });
4705
- await cache.delete(dataUrl);
4706
- externalDataBuffer = await fetchWithCache(dataUrl);
4707
- }
4708
- } else {
4709
- logger5.info("Fetching external model data", {
4710
- dataUrl,
4711
- note: "This may be a large download (400MB+)"
4712
- });
4713
- externalDataBuffer = await fetchWithCache(dataUrl);
4714
- }
4715
- logger5.info("External data loaded", {
4716
- size: formatBytes(externalDataBuffer.byteLength)
4717
- });
4718
- } catch (err) {
4719
- logger5.debug("No external data file found (single-file model)", {
4720
- dataUrl,
4721
- error: err.message
4722
- });
4723
- }
4724
- }
4725
- logger5.debug("Creating ONNX session", {
4726
- graphSize: formatBytes(modelBuffer.byteLength),
4727
- externalDataSize: externalDataBuffer ? formatBytes(externalDataBuffer.byteLength) : "none",
4728
- backend: this._backend
4729
- });
4730
- if (externalDataBuffer) {
4731
- const dataFilename = dataUrl.split("/").pop();
4732
- sessionOptions.externalData = [{
4733
- path: dataFilename,
4734
- data: new Uint8Array(externalDataBuffer)
4735
- }];
4736
- }
4737
- const modelData = new Uint8Array(modelBuffer);
4738
- this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
4674
+ }
4675
+
4676
+ // Butterfly passes
4677
+ for (var len = 2; len <= n; len *= 2) {
4678
+ var halfLen = len / 2;
4679
+ var angle = -2 * Math.PI / len;
4680
+ var wRe = Math.cos(angle);
4681
+ var wIm = Math.sin(angle);
4682
+
4683
+ for (var i = 0; i < n; i += len) {
4684
+ var curRe = 1;
4685
+ var curIm = 0;
4686
+ for (var j = 0; j < halfLen; j++) {
4687
+ var a = i + j;
4688
+ var b = a + halfLen;
4689
+ var tRe = curRe * re[b] - curIm * im[b];
4690
+ var tIm = curRe * im[b] + curIm * re[b];
4691
+ re[b] = re[a] - tRe;
4692
+ im[b] = im[a] - tIm;
4693
+ re[a] += tRe;
4694
+ im[a] += tIm;
4695
+ var nextRe = curRe * wRe - curIm * wIm;
4696
+ curIm = curRe * wIm + curIm * wRe;
4697
+ curRe = nextRe;
4739
4698
  }
4740
- const loadTimeMs = performance.now() - startTime;
4741
- logger5.info("Model loaded successfully", {
4742
- backend: this._backend,
4743
- loadTimeMs: Math.round(loadTimeMs),
4744
- inputs: this.session.inputNames,
4745
- outputs: this.session.outputNames
4746
- });
4747
- span?.setAttributes({
4748
- "model.backend": this._backend,
4749
- "model.load_time_ms": loadTimeMs,
4750
- "model.cached": !isIOS()
4751
- });
4752
- span?.end();
4753
- telemetry?.recordHistogram("omote.model.load_time", loadTimeMs, {
4754
- model: "wav2arkit_cpu",
4755
- backend: this._backend
4756
- });
4757
- logger5.debug("Running warmup inference");
4758
- const warmupStart = performance.now();
4759
- const silentAudio = new Float32Array(16e3);
4760
- await this.infer(silentAudio);
4761
- const warmupTimeMs = performance.now() - warmupStart;
4762
- logger5.info("Warmup inference complete", {
4763
- warmupTimeMs: Math.round(warmupTimeMs),
4764
- backend: this._backend
4765
- });
4766
- telemetry?.recordHistogram("omote.model.warmup_time", warmupTimeMs, {
4767
- model: "wav2arkit_cpu",
4768
- backend: this._backend
4769
- });
4770
- return {
4771
- backend: this._backend,
4772
- loadTimeMs,
4773
- inputNames: [...this.session.inputNames],
4774
- outputNames: [...this.session.outputNames]
4775
- };
4776
- } catch (error) {
4777
- span?.endWithError(error instanceof Error ? error : new Error(String(error)));
4778
- telemetry?.incrementCounter("omote.errors.total", 1, {
4779
- model: "wav2arkit_cpu",
4780
- error_type: "load_failed"
4781
- });
4782
- throw error;
4783
- } finally {
4784
- this.isLoading = false;
4785
4699
  }
4786
4700
  }
4787
- /**
4788
- * Run inference on raw audio
4789
- *
4790
- * Accepts variable-length audio (not fixed to 16000 samples).
4791
- * Output frames = ceil(30 * numSamples / 16000).
4792
- *
4793
- * @param audioSamples - Float32Array of raw audio at 16kHz
4794
- * @param _identityIndex - Ignored (identity 11 is baked into the model)
4795
- */
4796
- async infer(audioSamples, _identityIndex) {
4797
- if (!this.session) {
4798
- throw new Error("Model not loaded. Call load() first.");
4701
+ }
4702
+
4703
+ /** HTK mel scale */
4704
+ function htkMel(freq) {
4705
+ return 1127.0 * Math.log(1.0 + freq / 700.0);
4706
+ }
4707
+
4708
+ function htkMelInverse(mel) {
4709
+ return 700.0 * (Math.exp(mel / 1127.0) - 1.0);
4710
+ }
4711
+
4712
+ /**
4713
+ * Build triangular mel filterbank matrix
4714
+ */
4715
+ function buildMelFilterbank(numBins, fftSize, sampleRate, lowFreq, highFreq) {
4716
+ var numFftBins = fftSize / 2 + 1;
4717
+ var lowMel = htkMel(lowFreq);
4718
+ var highMel = htkMel(highFreq);
4719
+
4720
+ // numBins + 2 equally spaced points in mel space
4721
+ var melPoints = new Float64Array(numBins + 2);
4722
+ for (var i = 0; i < numBins + 2; i++) {
4723
+ melPoints[i] = lowMel + (highMel - lowMel) * i / (numBins + 1);
4724
+ }
4725
+
4726
+ // Convert mel points to FFT bin indices (float, not rounded)
4727
+ var binFreqs = new Float64Array(numBins + 2);
4728
+ for (var i = 0; i < numBins + 2; i++) {
4729
+ binFreqs[i] = htkMelInverse(melPoints[i]) * fftSize / sampleRate;
4730
+ }
4731
+
4732
+ var filters = [];
4733
+
4734
+ for (var m = 0; m < numBins; m++) {
4735
+ var left = binFreqs[m];
4736
+ var center = binFreqs[m + 1];
4737
+ var right = binFreqs[m + 2];
4738
+
4739
+ var startBin = Math.max(0, Math.ceil(left));
4740
+ var endBin = Math.min(numFftBins - 1, Math.floor(right));
4741
+
4742
+ var weights = new Float32Array(endBin - startBin + 1);
4743
+ for (var k = startBin; k <= endBin; k++) {
4744
+ if (k <= center) {
4745
+ weights[k - startBin] = (center - left) > 0 ? (k - left) / (center - left) : 0;
4746
+ } else {
4747
+ weights[k - startBin] = (right - center) > 0 ? (right - k) / (right - center) : 0;
4748
+ }
4799
4749
  }
4800
- if (this.poisoned) {
4801
- throw new Error("Wav2ArkitCpu session timed out \u2014 inference unavailable until page reload");
4750
+
4751
+ filters.push({ startBin: startBin, weights: weights });
4752
+ }
4753
+
4754
+ return filters;
4755
+ }
4756
+
4757
+ /** Create Hamming window */
4758
+ function createHammingWindow(length) {
4759
+ var w = new Float32Array(length);
4760
+ for (var i = 0; i < length; i++) {
4761
+ w[i] = 0.54 - 0.46 * Math.cos(2 * Math.PI * i / (length - 1));
4762
+ }
4763
+ return w;
4764
+ }
4765
+
4766
+ /**
4767
+ * Compute Kaldi-compatible log mel filterbank features
4768
+ */
4769
+ function computeKaldiFbank(audio, sampleRate, numMelBins, opts) {
4770
+ var frameLengthMs = (opts && opts.frameLengthMs !== undefined) ? opts.frameLengthMs : 25;
4771
+ var frameShiftMs = (opts && opts.frameShiftMs !== undefined) ? opts.frameShiftMs : 10;
4772
+ var lowFreq = (opts && opts.lowFreq !== undefined) ? opts.lowFreq : 20;
4773
+ var highFreq = (opts && opts.highFreq !== undefined) ? opts.highFreq : (sampleRate / 2);
4774
+ var dither = (opts && opts.dither !== undefined) ? opts.dither : 0;
4775
+ var preemphasis = (opts && opts.preemphasis !== undefined) ? opts.preemphasis : 0.97;
4776
+
4777
+ var frameLengthSamples = Math.round(sampleRate * frameLengthMs / 1000);
4778
+ var frameShiftSamples = Math.round(sampleRate * frameShiftMs / 1000);
4779
+
4780
+ // Kaldi signal scaling: float [-1,1] -> int16 range
4781
+ var scaled = new Float32Array(audio.length);
4782
+ for (var i = 0; i < audio.length; i++) {
4783
+ scaled[i] = audio[i] * 32768;
4784
+ }
4785
+
4786
+ // Optional dithering
4787
+ if (dither > 0) {
4788
+ for (var i = 0; i < scaled.length; i++) {
4789
+ var u1 = Math.random();
4790
+ var u2 = Math.random();
4791
+ scaled[i] += dither * Math.sqrt(-2 * Math.log(u1 + 1e-10)) * Math.cos(2 * Math.PI * u2);
4802
4792
  }
4803
- const audioCopy = new Float32Array(audioSamples);
4804
- const feeds = {
4805
- "audio_waveform": new this.ort.Tensor("float32", audioCopy, [1, audioCopy.length])
4806
- };
4807
- return this.queueInference(feeds, audioCopy.length);
4808
4793
  }
4809
- /**
4810
- * Queue inference to serialize ONNX session calls
4811
- */
4812
- queueInference(feeds, inputSamples) {
4813
- return new Promise((resolve, reject) => {
4814
- this.inferenceQueue = this.inferenceQueue.then(async () => {
4815
- const telemetry = getTelemetry();
4816
- const span = telemetry?.startSpan("Wav2ArkitCpu.infer", {
4817
- "inference.backend": this._backend,
4818
- "inference.input_samples": inputSamples
4819
- });
4820
- try {
4821
- const startTime = performance.now();
4794
+
4795
+ // Number of frames (snip_edges=true: only complete frames)
4796
+ var numFrames = Math.max(0, Math.floor((scaled.length - frameLengthSamples) / frameShiftSamples) + 1);
4797
+ if (numFrames === 0) {
4798
+ return new Float32Array(0);
4799
+ }
4800
+
4801
+ // FFT size: next power of 2
4802
+ var fftSize = 1;
4803
+ while (fftSize < frameLengthSamples) fftSize *= 2;
4804
+
4805
+ var numFftBins = fftSize / 2 + 1;
4806
+
4807
+ // Pre-compute window and filterbank
4808
+ var window = createHammingWindow(frameLengthSamples);
4809
+ var filters = buildMelFilterbank(numMelBins, fftSize, sampleRate, lowFreq, highFreq);
4810
+
4811
+ // Allocate output
4812
+ var output = new Float32Array(numFrames * numMelBins);
4813
+
4814
+ // FFT buffers (reused per frame)
4815
+ var fftRe = new Float64Array(fftSize);
4816
+ var fftIm = new Float64Array(fftSize);
4817
+
4818
+ for (var f = 0; f < numFrames; f++) {
4819
+ var offset = f * frameShiftSamples;
4820
+
4821
+ // Clear FFT buffers
4822
+ fftRe.fill(0);
4823
+ fftIm.fill(0);
4824
+
4825
+ // Extract frame with preemphasis and windowing
4826
+ for (var i = 0; i < frameLengthSamples; i++) {
4827
+ var sample = scaled[offset + i];
4828
+ // Preemphasis: y[n] = x[n] - coeff * x[n-1]
4829
+ if (preemphasis > 0 && i > 0) {
4830
+ sample -= preemphasis * scaled[offset + i - 1];
4831
+ } else if (preemphasis > 0 && i === 0 && offset > 0) {
4832
+ sample -= preemphasis * scaled[offset - 1];
4833
+ }
4834
+ // Apply window
4835
+ fftRe[i] = sample * window[i];
4836
+ }
4837
+
4838
+ // FFT
4839
+ fft(fftRe, fftIm);
4840
+
4841
+ // Power spectrum -> mel filterbank -> log
4842
+ var outOffset = f * numMelBins;
4843
+ for (var m = 0; m < numMelBins; m++) {
4844
+ var filter = filters[m];
4845
+ var energy = 0;
4846
+ for (var k = 0; k < filter.weights.length; k++) {
4847
+ var bin = filter.startBin + k;
4848
+ if (bin < numFftBins) {
4849
+ var powerSpec = fftRe[bin] * fftRe[bin] + fftIm[bin] * fftIm[bin];
4850
+ energy += filter.weights[k] * powerSpec;
4851
+ }
4852
+ }
4853
+ output[outOffset + m] = Math.log(Math.max(energy, 1e-10));
4854
+ }
4855
+ }
4856
+
4857
+ return output;
4858
+ }
4859
+
4860
+ /**
4861
+ * Apply Low Frame Rate stacking for SenseVoice
4862
+ */
4863
+ function applyLFR(features, featureDim, lfrM, lfrN) {
4864
+ var numFrames = features.length / featureDim;
4865
+ if (numFrames === 0) return new Float32Array(0);
4866
+
4867
+ var leftPad = Math.floor((lfrM - 1) / 2); // 3 for lfrM=7
4868
+ var paddedLen = numFrames + leftPad;
4869
+ var numOutputFrames = Math.ceil(paddedLen / lfrN);
4870
+ var outputDim = featureDim * lfrM;
4871
+
4872
+ var output = new Float32Array(numOutputFrames * outputDim);
4873
+
4874
+ for (var i = 0; i < numOutputFrames; i++) {
4875
+ var startFrame = i * lfrN - leftPad;
4876
+
4877
+ for (var j = 0; j < lfrM; j++) {
4878
+ var srcFrame = startFrame + j;
4879
+ // Clamp to valid range
4880
+ if (srcFrame < 0) srcFrame = 0;
4881
+ if (srcFrame >= numFrames) srcFrame = numFrames - 1;
4882
+
4883
+ var srcOffset = srcFrame * featureDim;
4884
+ var dstOffset = i * outputDim + j * featureDim;
4885
+ for (var k = 0; k < featureDim; k++) {
4886
+ output[dstOffset + k] = features[srcOffset + k];
4887
+ }
4888
+ }
4889
+ }
4890
+
4891
+ return output;
4892
+ }
4893
+
4894
+ /**
4895
+ * Apply CMVN normalization in-place
4896
+ */
4897
+ function applyCMVN(features, dim, negMeanVec, invStddevVec) {
4898
+ for (var i = 0; i < features.length; i++) {
4899
+ var d = i % dim;
4900
+ features[i] = (features[i] + negMeanVec[d]) * invStddevVec[d];
4901
+ }
4902
+ return features;
4903
+ }
4904
+
4905
+ /**
4906
+ * Parse CMVN vectors from comma-separated strings (stored in ONNX metadata)
4907
+ */
4908
+ function parseCMVNFromMetadata(negMeanStr, invStddevStr) {
4909
+ var negMeanArr = new Float32Array(
4910
+ negMeanStr.split(',').map(function(s) { return parseFloat(s.trim()); })
4911
+ );
4912
+ var invStddevArr = new Float32Array(
4913
+ invStddevStr.split(',').map(function(s) { return parseFloat(s.trim()); })
4914
+ );
4915
+ return { negMean: negMeanArr, invStddev: invStddevArr };
4916
+ }
4917
+
4918
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
4919
+ // ctcDecoder.ts \u2014 inlined as plain JavaScript
4920
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
4921
+
4922
+ /** SenseVoice language ID -> string mapping */
4923
+ var LANGUAGE_IDS = {
4924
+ 0: 'auto',
4925
+ 3: 'zh',
4926
+ 4: 'en',
4927
+ 7: 'yue',
4928
+ 11: 'ja',
4929
+ 12: 'ko',
4930
+ 13: 'nospeech'
4931
+ };
4932
+
4933
+ /** SenseVoice text normalization ID -> string mapping */
4934
+ var TEXT_NORM_IDS = {
4935
+ 14: 'with_itn',
4936
+ 15: 'without_itn'
4937
+ };
4938
+
4939
+ /** Resolve language string to SenseVoice language ID */
4940
+ function resolveLanguageId(language) {
4941
+ var map = {
4942
+ auto: 0,
4943
+ zh: 3,
4944
+ en: 4,
4945
+ yue: 7,
4946
+ ja: 11,
4947
+ ko: 12
4948
+ };
4949
+ return map[language] !== undefined ? map[language] : 0;
4950
+ }
4951
+
4952
+ /** Resolve text norm string to SenseVoice text norm ID */
4953
+ function resolveTextNormId(textNorm) {
4954
+ return textNorm === 'without_itn' ? 15 : 14;
4955
+ }
4956
+
4957
+ /**
4958
+ * Parse tokens.txt into a token ID -> string map
4959
+ */
4960
+ function parseTokensFile(content) {
4961
+ var map = new Map();
4962
+ var lines = content.split('\\n');
4963
+ for (var idx = 0; idx < lines.length; idx++) {
4964
+ var trimmed = lines[idx].trim();
4965
+ if (!trimmed) continue;
4966
+ // Find the last space - token string may contain spaces
4967
+ var lastSpace = trimmed.lastIndexOf(' ');
4968
+ if (lastSpace === -1) continue;
4969
+ var token = trimmed.substring(0, lastSpace);
4970
+ var id = parseInt(trimmed.substring(lastSpace + 1), 10);
4971
+ if (!isNaN(id)) {
4972
+ map.set(id, token);
4973
+ }
4974
+ }
4975
+ return map;
4976
+ }
4977
+
4978
+ /**
4979
+ * SenseVoice structured token pattern matching
4980
+ */
4981
+ function parseStructuredToken(token) {
4982
+ var match = token.match(/^<\\|(.+)\\|>$/);
4983
+ if (!match) return null;
4984
+
4985
+ var value = match[1];
4986
+
4987
+ // Language tokens
4988
+ if (value === 'zh' || value === 'en' || value === 'ja' || value === 'ko' || value === 'yue' || value === 'nospeech') {
4989
+ return { type: 'language', value: value };
4990
+ }
4991
+
4992
+ // Emotion tokens
4993
+ var emotions = ['HAPPY', 'SAD', 'ANGRY', 'NEUTRAL', 'FEARFUL', 'DISGUSTED', 'SURPRISED', 'EMO_UNKNOWN'];
4994
+ if (emotions.indexOf(value) !== -1) {
4995
+ return { type: 'emotion', value: value };
4996
+ }
4997
+
4998
+ // Audio event tokens
4999
+ var events = ['Speech', 'BGM', 'Applause', 'Laughter', 'Crying', 'Coughing', 'Sneezing', 'EVENT_UNKNOWN'];
5000
+ if (events.indexOf(value) !== -1) {
5001
+ return { type: 'event', value: value };
5002
+ }
5003
+
5004
+ // ITN tokens
5005
+ if (value === 'withitn' || value === 'woitn' || value === 'with_itn' || value === 'without_itn') {
5006
+ return { type: 'textnorm', value: value };
5007
+ }
5008
+
5009
+ return null;
5010
+ }
5011
+
5012
+ /**
5013
+ * CTC greedy decode
5014
+ */
5015
+ function ctcGreedyDecode(logits, seqLen, vocabSz, tokenMapLocal) {
5016
+ // Step 1: Argmax per time step
5017
+ var tokenIds = [];
5018
+ for (var t = 0; t < seqLen; t++) {
5019
+ var offset = t * vocabSz;
5020
+ var maxIdx = 0;
5021
+ var maxVal = logits[offset];
5022
+ for (var v = 1; v < vocabSz; v++) {
5023
+ if (logits[offset + v] > maxVal) {
5024
+ maxVal = logits[offset + v];
5025
+ maxIdx = v;
5026
+ }
5027
+ }
5028
+ tokenIds.push(maxIdx);
5029
+ }
5030
+
5031
+ // Step 2: Collapse consecutive duplicates
5032
+ var collapsed = [];
5033
+ var prev = -1;
5034
+ for (var idx = 0; idx < tokenIds.length; idx++) {
5035
+ var id = tokenIds[idx];
5036
+ if (id !== prev) {
5037
+ collapsed.push(id);
5038
+ prev = id;
5039
+ }
5040
+ }
5041
+
5042
+ // Step 3: Remove blank tokens (ID 0) and special tokens (<s>=1, </s>=2)
5043
+ var filtered = collapsed.filter(function(id) { return id !== 0 && id !== 1 && id !== 2; });
5044
+
5045
+ // Step 4: Convert to token strings and parse structured tokens
5046
+ var language = undefined;
5047
+ var emotion = undefined;
5048
+ var event = undefined;
5049
+ var textTokens = [];
5050
+
5051
+ for (var idx = 0; idx < filtered.length; idx++) {
5052
+ var id = filtered[idx];
5053
+ var token = tokenMapLocal.get(id);
5054
+ if (!token) continue;
5055
+
5056
+ var structured = parseStructuredToken(token);
5057
+ if (structured) {
5058
+ if (structured.type === 'language') language = structured.value;
5059
+ else if (structured.type === 'emotion') emotion = structured.value;
5060
+ else if (structured.type === 'event') event = structured.value;
5061
+ // Skip textnorm tokens
5062
+ } else {
5063
+ textTokens.push(token);
5064
+ }
5065
+ }
5066
+
5067
+ // Step 5: Join tokens, handle SentencePiece boundary marker
5068
+ var text = textTokens.join('');
5069
+ // Replace SentencePiece word boundary (U+2581) with space
5070
+ text = text.replace(/\\u2581/g, ' ').trim();
5071
+
5072
+ return { text: text, language: language, emotion: emotion, event: event };
5073
+ }
5074
+
5075
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
5076
+ // Worker globals and message handler
5077
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
5078
+
5079
+ /**
5080
+ * Load ONNX Runtime from CDN
5081
+ */
5082
+ async function loadOrt(wasmPaths) {
5083
+ if (ort) return;
5084
+
5085
+ // Import ONNX Runtime from CDN
5086
+ var ortUrl = wasmPaths + 'ort.wasm.min.js';
5087
+
5088
+ // Load the script by fetching and executing it
5089
+ var response = await fetch(ortUrl);
5090
+ var scriptText = await response.text();
5091
+
5092
+ // Create a blob URL for the script
5093
+ var blob = new Blob([scriptText], { type: 'application/javascript' });
5094
+ var blobUrl = URL.createObjectURL(blob);
5095
+
5096
+ // Import the module
5097
+ importScripts(blobUrl);
5098
+ URL.revokeObjectURL(blobUrl);
5099
+
5100
+ // ort is now available as global
5101
+ ort = self.ort;
5102
+
5103
+ // Configure WASM settings
5104
+ ort.env.wasm.wasmPaths = wasmPaths;
5105
+ ort.env.wasm.numThreads = 1; // Single thread in worker
5106
+ ort.env.wasm.simd = true;
5107
+ ort.env.wasm.proxy = false; // No proxy in worker
5108
+ }
5109
+
5110
+ /**
5111
+ * Load the SenseVoice model and tokens
5112
+ */
5113
+ async function loadModel(modelUrl, tokensUrl, isIOSDevice, lang, textNorm) {
5114
+ // 1. Fetch and parse tokens.txt
5115
+ var tokensResponse = await fetch(tokensUrl);
5116
+ if (!tokensResponse.ok) {
5117
+ throw new Error('Failed to fetch tokens.txt: ' + tokensResponse.status + ' ' + tokensResponse.statusText);
5118
+ }
5119
+ var tokensText = await tokensResponse.text();
5120
+ tokenMap = parseTokensFile(tokensText);
5121
+
5122
+ // 2. Store language/textNorm IDs
5123
+ languageId = lang;
5124
+ textNormId = textNorm;
5125
+
5126
+ // 3. Create inference session
5127
+ var sessionOptions = {
5128
+ executionProviders: ['wasm'],
5129
+ graphOptimizationLevel: 'all',
5130
+ };
5131
+
5132
+ if (isIOSDevice) {
5133
+ // iOS: pass URL string directly to ORT to avoid 239MB JS heap allocation
5134
+ // ORT fetches into WASM memory, keeping JS heap at ~2MB
5135
+ session = await ort.InferenceSession.create(modelUrl, sessionOptions);
5136
+ } else {
5137
+ // Desktop: fetch ArrayBuffer for potential caching
5138
+ var modelResponse = await fetch(modelUrl);
5139
+ if (!modelResponse.ok) {
5140
+ throw new Error('Failed to fetch model: ' + modelResponse.status + ' ' + modelResponse.statusText);
5141
+ }
5142
+ var modelBuffer = await modelResponse.arrayBuffer();
5143
+ var modelData = new Uint8Array(modelBuffer);
5144
+ session = await ort.InferenceSession.create(modelData, sessionOptions);
5145
+ }
5146
+
5147
+ // 4. Try to read CMVN from model metadata
5148
+ try {
5149
+ var metadata = session.handler && session.handler.metadata;
5150
+ if (metadata && metadata.neg_mean && metadata.inv_stddev) {
5151
+ var cmvn = parseCMVNFromMetadata(metadata.neg_mean, metadata.inv_stddev);
5152
+ negMean = cmvn.negMean;
5153
+ invStddev = cmvn.invStddev;
5154
+ }
5155
+ } catch (cmvnErr) {
5156
+ // CMVN not available \u2014 features will not be normalized
5157
+ }
5158
+
5159
+ // 5. Determine vocab size from tokenMap
5160
+ vocabSize = 0;
5161
+ tokenMap.forEach(function(val, key) {
5162
+ if (key >= vocabSize) vocabSize = key + 1;
5163
+ });
5164
+
5165
+ return {
5166
+ vocabSize: vocabSize,
5167
+ inputNames: session.inputNames.slice(),
5168
+ outputNames: session.outputNames.slice(),
5169
+ };
5170
+ }
5171
+
5172
+ /**
5173
+ * Run transcription on audio samples
5174
+ */
5175
+ async function runTranscription(audio) {
5176
+ var preprocessStart = performance.now();
5177
+
5178
+ // 1. Compute Kaldi fbank features [T, 80]
5179
+ var fbank = computeKaldiFbank(audio, 16000, 80);
5180
+ var numFrames = fbank.length / 80;
5181
+
5182
+ if (numFrames === 0) {
5183
+ return {
5184
+ text: '',
5185
+ language: undefined,
5186
+ emotion: undefined,
5187
+ event: undefined,
5188
+ inferenceTimeMs: performance.now() - preprocessStart,
5189
+ preprocessTimeMs: performance.now() - preprocessStart,
5190
+ };
5191
+ }
5192
+
5193
+ // 2. Apply LFR stacking [T_reduced, 560]
5194
+ var lfrFeatures = applyLFR(fbank, 80, 7, 6);
5195
+ var numLfrFrames = lfrFeatures.length / 560;
5196
+
5197
+ // 3. Apply CMVN normalization (in-place)
5198
+ if (negMean && invStddev) {
5199
+ applyCMVN(lfrFeatures, 560, negMean, invStddev);
5200
+ }
5201
+
5202
+ var preprocessTimeMs = performance.now() - preprocessStart;
5203
+
5204
+ // 4. Build ORT tensors
5205
+ var feeds = {
5206
+ x: new ort.Tensor('float32', lfrFeatures, [1, numLfrFrames, 560]),
5207
+ x_length: new ort.Tensor('int32', new Int32Array([numLfrFrames]), [1]),
5208
+ language: new ort.Tensor('int32', new Int32Array([languageId]), [1]),
5209
+ text_norm: new ort.Tensor('int32', new Int32Array([textNormId]), [1]),
5210
+ };
5211
+
5212
+ // 5. Run inference
5213
+ var results = await session.run(feeds);
5214
+
5215
+ var logitsOutput = results['logits'];
5216
+ if (!logitsOutput) {
5217
+ throw new Error('Model output missing "logits" tensor');
5218
+ }
5219
+
5220
+ var logitsData = logitsOutput.data;
5221
+ var logitsDims = logitsOutput.dims;
5222
+ var seqLen = logitsDims[1];
5223
+ var modelVocabSize = logitsDims[2];
5224
+
5225
+ // 6. CTC decode
5226
+ var decoded = ctcGreedyDecode(logitsData, seqLen, modelVocabSize, tokenMap);
5227
+
5228
+ var totalTimeMs = performance.now() - preprocessStart;
5229
+
5230
+ return {
5231
+ text: decoded.text,
5232
+ language: decoded.language,
5233
+ emotion: decoded.emotion,
5234
+ event: decoded.event,
5235
+ inferenceTimeMs: totalTimeMs,
5236
+ preprocessTimeMs: preprocessTimeMs,
5237
+ };
5238
+ }
5239
+
5240
+ // Message handler
5241
+ self.onmessage = async function(e) {
5242
+ var msg = e.data;
5243
+
5244
+ try {
5245
+ switch (msg.type) {
5246
+ case 'load': {
5247
+ var startTime = performance.now();
5248
+ await loadOrt(msg.wasmPaths);
5249
+ var info = await loadModel(msg.modelUrl, msg.tokensUrl, msg.isIOS, msg.language, msg.textNorm);
5250
+ var loadTimeMs = performance.now() - startTime;
5251
+
5252
+ self.postMessage({
5253
+ type: 'loaded',
5254
+ vocabSize: info.vocabSize,
5255
+ inputNames: info.inputNames,
5256
+ outputNames: info.outputNames,
5257
+ loadTimeMs: loadTimeMs,
5258
+ });
5259
+ break;
5260
+ }
5261
+
5262
+ case 'transcribe': {
5263
+ var result = await runTranscription(msg.audio);
5264
+
5265
+ self.postMessage({
5266
+ type: 'result',
5267
+ text: result.text,
5268
+ language: result.language,
5269
+ emotion: result.emotion,
5270
+ event: result.event,
5271
+ inferenceTimeMs: result.inferenceTimeMs,
5272
+ preprocessTimeMs: result.preprocessTimeMs,
5273
+ });
5274
+ break;
5275
+ }
5276
+
5277
+ case 'dispose': {
5278
+ if (session) {
5279
+ await session.release();
5280
+ session = null;
5281
+ }
5282
+ ort = null;
5283
+ tokenMap = null;
5284
+ negMean = null;
5285
+ invStddev = null;
5286
+ self.postMessage({ type: 'disposed' });
5287
+ break;
5288
+ }
5289
+
5290
+ default:
5291
+ self.postMessage({
5292
+ type: 'error',
5293
+ error: 'Unknown message type: ' + msg.type,
5294
+ });
5295
+ }
5296
+ } catch (err) {
5297
+ var errorMsg = err.message || String(err);
5298
+ // Handle raw C++ exception pointers from ORT WASM
5299
+ if (typeof err === 'number') {
5300
+ errorMsg = 'Raw C++ exception pointer (0x' + err.toString(16) + '). Likely OOM in WASM.';
5301
+ }
5302
+ self.postMessage({
5303
+ type: 'error',
5304
+ error: errorMsg,
5305
+ });
5306
+ }
5307
+ };
5308
+
5309
+ // Error handler
5310
+ self.onerror = function(err) {
5311
+ self.postMessage({
5312
+ type: 'error',
5313
+ error: 'Worker error: ' + (err.message || String(err)),
5314
+ });
5315
+ };
5316
+ `;
5317
/**
 * SenseVoice speech-to-text host that runs ONNX inference inside a dedicated
 * Web Worker (WASM execution provider only), keeping feature extraction and
 * model execution off the main thread.
 *
 * Protocol: sendMessage() registers a resolver keyed by the expected response
 * type; handleWorkerMessage() settles it when the worker posts back. Each
 * request carries a timeout — on expiry the instance is marked `poisoned` and
 * refuses further transcriptions. Transcribe calls are serialized through
 * `inferenceQueue` so only one request is in flight at a time.
 */
var SenseVoiceWorker = class {
  /**
   * @param config - { modelUrl, tokensUrl?, language?, textNorm? }.
   *   When tokensUrl is omitted, `tokens.txt` is assumed to sit in the same
   *   directory as the model file.
   */
  constructor(config) {
    this.worker = null;
    this.isLoading = false;
    this._isLoaded = false;
    // Inference queue for serialization
    this.inferenceQueue = Promise.resolve();
    // Session health: set to true if worker operation times out
    this.poisoned = false;
    // Pending message handlers
    this.pendingResolvers = /* @__PURE__ */ new Map();
    // Derive default tokens.txt location from the model URL's directory.
    const modelDir = config.modelUrl.substring(0, config.modelUrl.lastIndexOf("/"));
    const tokensUrl = config.tokensUrl ?? `${modelDir}/tokens.txt`;
    this.config = {
      modelUrl: config.modelUrl,
      tokensUrl,
      language: config.language ?? "auto",
      textNorm: config.textNorm ?? "with_itn"
    };
    // Numeric IDs sent to the worker; helpers are defined elsewhere in this
    // module (resolveLanguageId / resolveTextNormId).
    this.languageId = resolveLanguageId(this.config.language);
    this.textNormId = resolveTextNormId(this.config.textNorm);
  }
  /** True once load() has completed and until dispose() is called. */
  get isLoaded() {
    return this._isLoaded;
  }
  /**
   * Backend type (always 'wasm' for Worker, WebGPU not supported in Workers)
   */
  get backend() {
    return this._isLoaded ? "wasm" : null;
  }
  /**
   * Create the worker from inline script
   */
  createWorker() {
    // The worker script is shipped as an inline string (WORKER_SCRIPT,
    // defined earlier in this module) and loaded via a temporary Blob URL.
    const blob = new Blob([WORKER_SCRIPT], { type: "application/javascript" });
    const blobUrl = URL.createObjectURL(blob);
    const worker = new Worker(blobUrl);
    // Revoke the temporary object URL once the Worker has been constructed.
    URL.revokeObjectURL(blobUrl);
    worker.onmessage = (event) => {
      this.handleWorkerMessage(event.data);
    };
    worker.onerror = (error) => {
      // A fatal worker error fails every in-flight request.
      logger5.error("Worker error", { error: error.message });
      for (const [, resolver] of this.pendingResolvers) {
        resolver.reject(new Error(`Worker error: ${error.message}`));
      }
      this.pendingResolvers.clear();
    };
    return worker;
  }
  /**
   * Handle messages from worker
   */
  handleWorkerMessage(result) {
    // Messages are matched to pending requests purely by their `type` field;
    // the serialization queue guarantees at most one request per type.
    const resolver = this.pendingResolvers.get(result.type);
    if (resolver) {
      this.pendingResolvers.delete(result.type);
      if (result.type === "error") {
        resolver.reject(new Error(result.error));
      } else {
        resolver.resolve(result);
      }
    }
  }
  /**
   * Send message to worker and wait for response
   */
  sendMessage(message, expectedType, timeoutMs) {
    return new Promise((resolve, reject) => {
      if (!this.worker) {
        reject(new Error("Worker not initialized"));
        return;
      }
      // Timeout marks the instance poisoned: a hung WASM worker cannot be
      // recovered without a page reload.
      const timeoutId = setTimeout(() => {
        this.pendingResolvers.delete(expectedType);
        this.poisoned = true;
        reject(new Error(`Worker operation timed out after ${timeoutMs}ms`));
      }, timeoutMs);
      this.pendingResolvers.set(expectedType, {
        resolve: (value) => {
          clearTimeout(timeoutId);
          resolve(value);
        },
        reject: (error) => {
          clearTimeout(timeoutId);
          reject(error);
        }
      });
      // Register an "error" resolver so a worker-side error message rejects
      // this request too. Each sendMessage call overwrites the previous
      // entry; requests are serialized, so only one can be pending.
      this.pendingResolvers.set("error", {
        resolve: () => {
        },
        // Never called for errors
        reject: (error) => {
          clearTimeout(timeoutId);
          this.pendingResolvers.delete(expectedType);
          reject(error);
        }
      });
      this.worker.postMessage(message);
    });
  }
  /**
   * Load the ONNX model in the worker
   *
   * @param onProgress - Optional progress callback. Fires once at 100% when load completes
   * (the worker downloads and loads the model internally, so granular progress is not available).
   */
  async load(onProgress) {
    if (this.isLoading) {
      throw new Error("Model is already loading");
    }
    if (this._isLoaded) {
      throw new Error("Model already loaded. Call dispose() first.");
    }
    this.isLoading = true;
    const startTime = performance.now();
    const telemetry = getTelemetry();
    const span = telemetry?.startSpan("SenseVoiceWorker.load", {
      "model.url": this.config.modelUrl,
      "model.language": this.config.language
    });
    try {
      logger5.info("Creating SenseVoice worker...");
      this.worker = this.createWorker();
      logger5.info("Loading model in worker...", {
        modelUrl: this.config.modelUrl,
        tokensUrl: this.config.tokensUrl,
        language: this.config.language,
        textNorm: this.config.textNorm
      });
      // URLs must be absolute: the worker runs from a blob: origin and
      // cannot resolve relative paths itself.
      const result = await this.sendMessage(
        {
          type: "load",
          modelUrl: resolveUrl(this.config.modelUrl),
          tokensUrl: resolveUrl(this.config.tokensUrl),
          wasmPaths: WASM_CDN_PATH2,
          isIOS: isIOS(),
          language: this.languageId,
          textNorm: this.textNormId
        },
        "loaded",
        LOAD_TIMEOUT_MS
      );
      this._isLoaded = true;
      const loadTimeMs = performance.now() - startTime;
      // Single 100% tick — the worker gives no granular download progress.
      onProgress?.(1, 1);
      logger5.info("SenseVoice worker loaded successfully", {
        backend: "wasm",
        loadTimeMs: Math.round(loadTimeMs),
        workerLoadTimeMs: Math.round(result.loadTimeMs),
        vocabSize: result.vocabSize,
        language: this.config.language,
        textNorm: this.config.textNorm
      });
      span?.setAttributes({
        "model.backend": "wasm",
        "model.load_time_ms": loadTimeMs,
        "model.worker_load_time_ms": result.loadTimeMs,
        "model.vocab_size": result.vocabSize
      });
      span?.end();
      telemetry?.recordHistogram("omote.model.load_time", loadTimeMs, {
        model: "sensevoice-worker",
        backend: "wasm"
      });
      return {
        backend: "wasm",
        loadTimeMs,
        inputNames: result.inputNames,
        outputNames: result.outputNames,
        vocabSize: result.vocabSize
      };
    } catch (error) {
      span?.endWithError(error instanceof Error ? error : new Error(String(error)));
      telemetry?.incrementCounter("omote.errors.total", 1, {
        model: "sensevoice-worker",
        error_type: "load_failed"
      });
      // Failed load leaves no worker behind — terminate and reset.
      if (this.worker) {
        this.worker.terminate();
        this.worker = null;
      }
      throw error;
    } finally {
      this.isLoading = false;
    }
  }
  /**
   * Transcribe audio samples to text
   *
   * @param audioSamples Float32Array of audio samples at 16kHz, [-1, 1] range
   * @returns Transcription result with text, emotion, language, and event
   */
  async transcribe(audioSamples) {
    if (!this._isLoaded || !this.worker) {
      throw new Error("Worker not loaded. Call load() first.");
    }
    if (this.poisoned) {
      throw new Error("SenseVoice worker timed out \u2014 inference unavailable until page reload");
    }
    // Copy the samples so the caller's buffer is never shared with the
    // queued (possibly delayed) worker message.
    const audio = new Float32Array(audioSamples);
    return this.queueInference(audio);
  }
  /**
   * Queue inference to serialize worker calls
   */
  queueInference(audio) {
    return new Promise((resolve, reject) => {
      // Chain onto the tail of the queue; errors are caught inside the
      // chained task so the queue itself never rejects.
      this.inferenceQueue = this.inferenceQueue.then(async () => {
        const telemetry = getTelemetry();
        const span = telemetry?.startSpan("SenseVoiceWorker.transcribe", {
          "inference.backend": "wasm",
          "inference.input_samples": audio.length
        });
        try {
          const startTime = performance.now();
          const result = await this.sendMessage(
            {
              type: "transcribe",
              audio
            },
            "result",
            INFERENCE_TIMEOUT_MS
          );
          const totalTimeMs = performance.now() - startTime;
          logger5.trace("Worker transcription complete", {
            text: result.text.substring(0, 50),
            language: result.language,
            emotion: result.emotion,
            event: result.event,
            preprocessTimeMs: Math.round(result.preprocessTimeMs * 100) / 100,
            inferenceTimeMs: Math.round(result.inferenceTimeMs * 100) / 100,
            roundTripMs: Math.round(totalTimeMs * 100) / 100
          });
          span?.setAttributes({
            "inference.duration_ms": totalTimeMs,
            "inference.worker_duration_ms": result.inferenceTimeMs,
            "inference.preprocess_ms": result.preprocessTimeMs,
            "inference.text_length": result.text.length
          });
          span?.end();
          telemetry?.recordHistogram("omote.inference.latency", totalTimeMs, {
            model: "sensevoice-worker",
            backend: "wasm"
          });
          telemetry?.incrementCounter("omote.inference.total", 1, {
            model: "sensevoice-worker",
            backend: "wasm",
            status: "success"
          });
          resolve({
            text: result.text,
            language: result.language,
            emotion: result.emotion,
            event: result.event,
            inferenceTimeMs: result.inferenceTimeMs,
            preprocessTimeMs: result.preprocessTimeMs
          });
        } catch (err) {
          const errMsg = err instanceof Error ? err.message : String(err);
          if (errMsg.includes("timed out")) {
            logger5.error("CRITICAL: Worker inference timed out \u2014 SenseVoice worker is dead. Page reload required.", {
              timeoutMs: INFERENCE_TIMEOUT_MS
            });
          } else {
            logger5.error("Worker inference failed", { error: errMsg });
          }
          span?.endWithError(err instanceof Error ? err : new Error(String(err)));
          telemetry?.incrementCounter("omote.inference.total", 1, {
            model: "sensevoice-worker",
            backend: "wasm",
            status: "error"
          });
          reject(err);
        }
      });
    });
  }
  /**
   * Dispose of the worker and free resources
   */
  async dispose() {
    if (this.worker) {
      try {
        // Best-effort graceful release of the ORT session inside the worker;
        // terminate() below reclaims everything regardless.
        await this.sendMessage({ type: "dispose" }, "disposed", INFERENCE_TIMEOUT_MS);
      } catch {
      }
      this.worker.terminate();
      this.worker = null;
    }
    this._isLoaded = false;
    this.poisoned = false;
    this.pendingResolvers.clear();
  }
  /**
   * Check if Web Workers are supported
   */
  static isSupported() {
    return typeof Worker !== "undefined";
  }
};
+
5620
// src/inference/UnifiedInferenceWorker.ts
var logger6 = createLogger("UnifiedInferenceWorker");
// CDN directory the unified worker fetches the onnxruntime-web runtime and
// its WASM binaries from (see loadOrt in the worker script below).
var WASM_CDN_PATH3 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
// Per-operation timeouts (milliseconds) for messages sent to the unified
// worker, which hosts SenseVoice (ASR), Wav2ArkitCpu (blendshapes) and
// Silero VAD in a single ORT instance.
var INIT_TIMEOUT_MS = 15e3;
// SenseVoice model load / single transcription.
var SV_LOAD_TIMEOUT_MS = 3e4;
var SV_INFER_TIMEOUT_MS = 1e4;
// Wav2ArkitCpu model load / single inference.
var CPU_LOAD_TIMEOUT_MS = 6e4;
var CPU_INFER_TIMEOUT_MS = 5e3;
// Silero VAD model load / single chunk inference.
var VAD_LOAD_TIMEOUT_MS = 1e4;
var VAD_INFER_TIMEOUT_MS = 1e3;
// Worker acknowledgement of a dispose request.
var DISPOSE_TIMEOUT_MS = 5e3;
/**
 * Resolve a possibly-relative URL to an absolute one.
 *
 * Absolute http(s) and blob: URLs are returned untouched. Anything else is
 * resolved against the page origin (workers created from blob: URLs cannot
 * resolve relative paths themselves). In environments without a `location`,
 * a placeholder origin is used; if URL construction fails, the input is
 * returned unchanged.
 *
 * @param {string} url - URL or path to resolve.
 * @returns {string} Absolute URL, or the original string on failure.
 */
function resolveUrl2(url) {
  const alreadyAbsolute = /^https?:\/\//i.test(url) || /^blob:/i.test(url);
  if (alreadyAbsolute) {
    return url;
  }
  try {
    const base = globalThis.location?.origin ?? "https://localhost";
    const resolved = new URL(url, base);
    return resolved.href;
  } catch {
    return url;
  }
}
// Module-level monotonic counter backing nextRequestId().
var requestCounter = 0;
/**
 * Produce a unique request id of the form `req_<counter>_<timestamp>`.
 * The counter makes ids unique within a session even when two calls share
 * the same millisecond timestamp.
 *
 * @returns {string} Fresh request identifier.
 */
function nextRequestId() {
  requestCounter += 1;
  const issuedAt = Date.now();
  return `req_${requestCounter}_${issuedAt}`;
}
+ var WORKER_SCRIPT2 = `
5644
+ // Unified Inference Worker Script
5645
+ // Hosts SenseVoice + Wav2ArkitCpu + Silero VAD in a single ORT instance
5646
+
5647
+ var ort = null;
5648
+
5649
+ // SenseVoice state
5650
+ var svSession = null;
5651
+ var svTokenMap = null;
5652
+ var svNegMean = null;
5653
+ var svInvStddev = null;
5654
+ var svLanguageId = 0;
5655
+ var svTextNormId = 14;
5656
+ var svVocabSize = 0;
5657
+
5658
+ // Wav2ArkitCpu state
5659
+ var cpuSession = null;
5660
+
5661
+ // Silero VAD state
5662
+ var vadSession = null;
5663
+ var vadSampleRate = 16000;
5664
+ var vadChunkSize = 512;
5665
+ var vadContextSize = 64;
5666
+
5667
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
5668
+ // kaldiFbank.ts \u2014 inlined as plain JavaScript
5669
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
5670
+
5671
+ function fft(re, im) {
5672
+ var n = re.length;
5673
+ for (var i = 1, j = 0; i < n; i++) {
5674
+ var bit = n >> 1;
5675
+ while (j & bit) { j ^= bit; bit >>= 1; }
5676
+ j ^= bit;
5677
+ if (i < j) {
5678
+ var tmp = re[i]; re[i] = re[j]; re[j] = tmp;
5679
+ tmp = im[i]; im[i] = im[j]; im[j] = tmp;
5680
+ }
5681
+ }
5682
+ for (var len = 2; len <= n; len *= 2) {
5683
+ var halfLen = len / 2;
5684
+ var angle = -2 * Math.PI / len;
5685
+ var wRe = Math.cos(angle);
5686
+ var wIm = Math.sin(angle);
5687
+ for (var i = 0; i < n; i += len) {
5688
+ var curRe = 1, curIm = 0;
5689
+ for (var j = 0; j < halfLen; j++) {
5690
+ var a = i + j, b = a + halfLen;
5691
+ var tRe = curRe * re[b] - curIm * im[b];
5692
+ var tIm = curRe * im[b] + curIm * re[b];
5693
+ re[b] = re[a] - tRe; im[b] = im[a] - tIm;
5694
+ re[a] += tRe; im[a] += tIm;
5695
+ var nextRe = curRe * wRe - curIm * wIm;
5696
+ curIm = curRe * wIm + curIm * wRe;
5697
+ curRe = nextRe;
5698
+ }
5699
+ }
5700
+ }
5701
+ }
5702
+
5703
+ function htkMel(freq) { return 1127.0 * Math.log(1.0 + freq / 700.0); }
5704
+ function htkMelInverse(mel) { return 700.0 * (Math.exp(mel / 1127.0) - 1.0); }
5705
+
5706
+ function buildMelFilterbank(numBins, fftSize, sampleRate, lowFreq, highFreq) {
5707
+ var numFftBins = fftSize / 2 + 1;
5708
+ var lowMel = htkMel(lowFreq);
5709
+ var highMel = htkMel(highFreq);
5710
+ var melPoints = new Float64Array(numBins + 2);
5711
+ for (var i = 0; i < numBins + 2; i++) {
5712
+ melPoints[i] = lowMel + (highMel - lowMel) * i / (numBins + 1);
5713
+ }
5714
+ var binFreqs = new Float64Array(numBins + 2);
5715
+ for (var i = 0; i < numBins + 2; i++) {
5716
+ binFreqs[i] = htkMelInverse(melPoints[i]) * fftSize / sampleRate;
5717
+ }
5718
+ var filters = [];
5719
+ for (var m = 0; m < numBins; m++) {
5720
+ var left = binFreqs[m], center = binFreqs[m + 1], right = binFreqs[m + 2];
5721
+ var startBin = Math.max(0, Math.ceil(left));
5722
+ var endBin = Math.min(numFftBins - 1, Math.floor(right));
5723
+ var weights = new Float32Array(endBin - startBin + 1);
5724
+ for (var k = startBin; k <= endBin; k++) {
5725
+ if (k <= center) {
5726
+ weights[k - startBin] = (center - left) > 0 ? (k - left) / (center - left) : 0;
5727
+ } else {
5728
+ weights[k - startBin] = (right - center) > 0 ? (right - k) / (right - center) : 0;
5729
+ }
5730
+ }
5731
+ filters.push({ startBin: startBin, weights: weights });
5732
+ }
5733
+ return filters;
5734
+ }
5735
+
5736
+ function createHammingWindow(length) {
5737
+ var w = new Float32Array(length);
5738
+ for (var i = 0; i < length; i++) {
5739
+ w[i] = 0.54 - 0.46 * Math.cos(2 * Math.PI * i / (length - 1));
5740
+ }
5741
+ return w;
5742
+ }
5743
+
5744
+ function computeKaldiFbank(audio, sampleRate, numMelBins, opts) {
5745
+ var frameLengthMs = (opts && opts.frameLengthMs !== undefined) ? opts.frameLengthMs : 25;
5746
+ var frameShiftMs = (opts && opts.frameShiftMs !== undefined) ? opts.frameShiftMs : 10;
5747
+ var lowFreq = (opts && opts.lowFreq !== undefined) ? opts.lowFreq : 20;
5748
+ var highFreq = (opts && opts.highFreq !== undefined) ? opts.highFreq : (sampleRate / 2);
5749
+ var dither = (opts && opts.dither !== undefined) ? opts.dither : 0;
5750
+ var preemphasis = (opts && opts.preemphasis !== undefined) ? opts.preemphasis : 0.97;
5751
+
5752
+ var frameLengthSamples = Math.round(sampleRate * frameLengthMs / 1000);
5753
+ var frameShiftSamples = Math.round(sampleRate * frameShiftMs / 1000);
5754
+
5755
+ var scaled = new Float32Array(audio.length);
5756
+ for (var i = 0; i < audio.length; i++) { scaled[i] = audio[i] * 32768; }
5757
+
5758
+ if (dither > 0) {
5759
+ for (var i = 0; i < scaled.length; i++) {
5760
+ var u1 = Math.random(), u2 = Math.random();
5761
+ scaled[i] += dither * Math.sqrt(-2 * Math.log(u1 + 1e-10)) * Math.cos(2 * Math.PI * u2);
5762
+ }
5763
+ }
5764
+
5765
+ var numFrames = Math.max(0, Math.floor((scaled.length - frameLengthSamples) / frameShiftSamples) + 1);
5766
+ if (numFrames === 0) return new Float32Array(0);
5767
+
5768
+ var fftSize = 1;
5769
+ while (fftSize < frameLengthSamples) fftSize *= 2;
5770
+ var numFftBins = fftSize / 2 + 1;
5771
+
5772
+ var window = createHammingWindow(frameLengthSamples);
5773
+ var filters = buildMelFilterbank(numMelBins, fftSize, sampleRate, lowFreq, highFreq);
5774
+ var output = new Float32Array(numFrames * numMelBins);
5775
+ var fftRe = new Float64Array(fftSize);
5776
+ var fftIm = new Float64Array(fftSize);
5777
+
5778
+ for (var f = 0; f < numFrames; f++) {
5779
+ var offset = f * frameShiftSamples;
5780
+ fftRe.fill(0); fftIm.fill(0);
5781
+ for (var i = 0; i < frameLengthSamples; i++) {
5782
+ var sample = scaled[offset + i];
5783
+ if (preemphasis > 0 && i > 0) {
5784
+ sample -= preemphasis * scaled[offset + i - 1];
5785
+ } else if (preemphasis > 0 && i === 0 && offset > 0) {
5786
+ sample -= preemphasis * scaled[offset - 1];
5787
+ }
5788
+ fftRe[i] = sample * window[i];
5789
+ }
5790
+ fft(fftRe, fftIm);
5791
+ var outOffset = f * numMelBins;
5792
+ for (var m = 0; m < numMelBins; m++) {
5793
+ var filter = filters[m];
5794
+ var energy = 0;
5795
+ for (var k = 0; k < filter.weights.length; k++) {
5796
+ var bin = filter.startBin + k;
5797
+ if (bin < numFftBins) {
5798
+ var powerSpec = fftRe[bin] * fftRe[bin] + fftIm[bin] * fftIm[bin];
5799
+ energy += filter.weights[k] * powerSpec;
5800
+ }
5801
+ }
5802
+ output[outOffset + m] = Math.log(Math.max(energy, 1e-10));
5803
+ }
5804
+ }
5805
+ return output;
5806
+ }
5807
+
5808
+ function applyLFR(features, featureDim, lfrM, lfrN) {
5809
+ var numFrames = features.length / featureDim;
5810
+ if (numFrames === 0) return new Float32Array(0);
5811
+ var leftPad = Math.floor((lfrM - 1) / 2);
5812
+ var paddedLen = numFrames + leftPad;
5813
+ var numOutputFrames = Math.ceil(paddedLen / lfrN);
5814
+ var outputDim = featureDim * lfrM;
5815
+ var output = new Float32Array(numOutputFrames * outputDim);
5816
+ for (var i = 0; i < numOutputFrames; i++) {
5817
+ var startFrame = i * lfrN - leftPad;
5818
+ for (var j = 0; j < lfrM; j++) {
5819
+ var srcFrame = startFrame + j;
5820
+ if (srcFrame < 0) srcFrame = 0;
5821
+ if (srcFrame >= numFrames) srcFrame = numFrames - 1;
5822
+ var srcOffset = srcFrame * featureDim;
5823
+ var dstOffset = i * outputDim + j * featureDim;
5824
+ for (var k = 0; k < featureDim; k++) {
5825
+ output[dstOffset + k] = features[srcOffset + k];
5826
+ }
5827
+ }
5828
+ }
5829
+ return output;
5830
+ }
5831
+
5832
+ function applyCMVN(features, dim, negMeanVec, invStddevVec) {
5833
+ for (var i = 0; i < features.length; i++) {
5834
+ var d = i % dim;
5835
+ features[i] = (features[i] + negMeanVec[d]) * invStddevVec[d];
5836
+ }
5837
+ return features;
5838
+ }
5839
+
5840
+ function parseCMVNFromMetadata(negMeanStr, invStddevStr) {
5841
+ var negMeanArr = new Float32Array(
5842
+ negMeanStr.split(',').map(function(s) { return parseFloat(s.trim()); })
5843
+ );
5844
+ var invStddevArr = new Float32Array(
5845
+ invStddevStr.split(',').map(function(s) { return parseFloat(s.trim()); })
5846
+ );
5847
+ return { negMean: negMeanArr, invStddev: invStddevArr };
5848
+ }
5849
+
5850
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
5851
+ // ctcDecoder.ts \u2014 inlined as plain JavaScript
5852
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
5853
+
5854
+ var LANGUAGE_IDS = { 0: 'auto', 3: 'zh', 4: 'en', 7: 'yue', 11: 'ja', 12: 'ko', 13: 'nospeech' };
5855
+ var TEXT_NORM_IDS = { 14: 'with_itn', 15: 'without_itn' };
5856
+
5857
+ function resolveLanguageIdW(language) {
5858
+ var map = { auto: 0, zh: 3, en: 4, yue: 7, ja: 11, ko: 12 };
5859
+ return map[language] !== undefined ? map[language] : 0;
5860
+ }
5861
+
5862
+ function resolveTextNormIdW(textNorm) {
5863
+ return textNorm === 'without_itn' ? 15 : 14;
5864
+ }
5865
+
5866
+ function parseTokensFile(content) {
5867
+ var map = new Map();
5868
+ var lines = content.split('\\n');
5869
+ for (var idx = 0; idx < lines.length; idx++) {
5870
+ var trimmed = lines[idx].trim();
5871
+ if (!trimmed) continue;
5872
+ var lastSpace = trimmed.lastIndexOf(' ');
5873
+ if (lastSpace === -1) continue;
5874
+ var token = trimmed.substring(0, lastSpace);
5875
+ var id = parseInt(trimmed.substring(lastSpace + 1), 10);
5876
+ if (!isNaN(id)) map.set(id, token);
5877
+ }
5878
+ return map;
5879
+ }
5880
+
5881
+ function parseStructuredToken(token) {
5882
+ var match = token.match(/^<\\|(.+)\\|>$/);
5883
+ if (!match) return null;
5884
+ var value = match[1];
5885
+ if (value === 'zh' || value === 'en' || value === 'ja' || value === 'ko' || value === 'yue' || value === 'nospeech') {
5886
+ return { type: 'language', value: value };
5887
+ }
5888
+ var emotions = ['HAPPY', 'SAD', 'ANGRY', 'NEUTRAL', 'FEARFUL', 'DISGUSTED', 'SURPRISED', 'EMO_UNKNOWN'];
5889
+ if (emotions.indexOf(value) !== -1) return { type: 'emotion', value: value };
5890
+ var events = ['Speech', 'BGM', 'Applause', 'Laughter', 'Crying', 'Coughing', 'Sneezing', 'EVENT_UNKNOWN'];
5891
+ if (events.indexOf(value) !== -1) return { type: 'event', value: value };
5892
+ if (value === 'withitn' || value === 'woitn' || value === 'with_itn' || value === 'without_itn') {
5893
+ return { type: 'textnorm', value: value };
5894
+ }
5895
+ return null;
5896
+ }
5897
+
5898
+ function ctcGreedyDecode(logits, seqLen, vocabSz, tokenMapLocal) {
5899
+ var tokenIds = [];
5900
+ for (var t = 0; t < seqLen; t++) {
5901
+ var offset = t * vocabSz;
5902
+ var maxIdx = 0, maxVal = logits[offset];
5903
+ for (var v = 1; v < vocabSz; v++) {
5904
+ if (logits[offset + v] > maxVal) { maxVal = logits[offset + v]; maxIdx = v; }
5905
+ }
5906
+ tokenIds.push(maxIdx);
5907
+ }
5908
+ var collapsed = [], prev = -1;
5909
+ for (var idx = 0; idx < tokenIds.length; idx++) {
5910
+ var id = tokenIds[idx];
5911
+ if (id !== prev) { collapsed.push(id); prev = id; }
5912
+ }
5913
+ var filtered = collapsed.filter(function(id) { return id !== 0 && id !== 1 && id !== 2; });
5914
+ var language = undefined, emotion = undefined, event = undefined;
5915
+ var textTokens = [];
5916
+ for (var idx = 0; idx < filtered.length; idx++) {
5917
+ var id = filtered[idx];
5918
+ var token = tokenMapLocal.get(id);
5919
+ if (!token) continue;
5920
+ var structured = parseStructuredToken(token);
5921
+ if (structured) {
5922
+ if (structured.type === 'language') language = structured.value;
5923
+ else if (structured.type === 'emotion') emotion = structured.value;
5924
+ else if (structured.type === 'event') event = structured.value;
5925
+ } else {
5926
+ textTokens.push(token);
5927
+ }
5928
+ }
5929
+ var text = textTokens.join('');
5930
+ text = text.replace(/\\u2581/g, ' ').trim();
5931
+ return { text: text, language: language, emotion: emotion, event: event };
5932
+ }
5933
+
5934
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
5935
+ // blendshapeUtils.ts \u2014 inlined
5936
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
5937
+
5938
+ var SYMMETRIC_INDEX_PAIRS = [
5939
+ [23, 25], [32, 38], [43, 44], [29, 30], [27, 28], [45, 46],
5940
+ [35, 36], [47, 48], [33, 34], [49, 50], [6, 7], [0, 1],
5941
+ [3, 4], [8, 9], [16, 17], [10, 11], [12, 13], [14, 15],
5942
+ [18, 19], [20, 21],
5943
+ ];
5944
+
5945
+ function symmetrizeBlendshapes(frame) {
5946
+ var result = new Float32Array(frame);
5947
+ for (var p = 0; p < SYMMETRIC_INDEX_PAIRS.length; p++) {
5948
+ var lIdx = SYMMETRIC_INDEX_PAIRS[p][0], rIdx = SYMMETRIC_INDEX_PAIRS[p][1];
5949
+ var avg = (frame[lIdx] + frame[rIdx]) / 2;
5950
+ result[lIdx] = avg;
5951
+ result[rIdx] = avg;
5952
+ }
5953
+ return result;
5954
+ }
5955
+
5956
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
5957
+ // Shared ORT loader
5958
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
5959
+
5960
+ async function loadOrt(wasmPaths, isIOSDevice) {
5961
+ if (ort) return;
5962
+ var ortUrl = wasmPaths + 'ort.wasm.min.js';
5963
+ var response = await fetch(ortUrl);
5964
+ var scriptText = await response.text();
5965
+ var blob = new Blob([scriptText], { type: 'application/javascript' });
5966
+ var blobUrl = URL.createObjectURL(blob);
5967
+ importScripts(blobUrl);
5968
+ URL.revokeObjectURL(blobUrl);
5969
+ ort = self.ort;
5970
+ ort.env.wasm.wasmPaths = wasmPaths;
5971
+ ort.env.wasm.numThreads = isIOSDevice ? 1 : 4;
5972
+ ort.env.wasm.simd = true;
5973
+ ort.env.wasm.proxy = false;
5974
+ }
5975
+
5976
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
5977
+ // SenseVoice handlers
5978
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
5979
+
5980
+ async function svLoad(msg) {
5981
+ var tokensResponse = await fetch(msg.tokensUrl);
5982
+ if (!tokensResponse.ok) throw new Error('Failed to fetch tokens.txt: ' + tokensResponse.status);
5983
+ var tokensText = await tokensResponse.text();
5984
+ svTokenMap = parseTokensFile(tokensText);
5985
+ svLanguageId = msg.language;
5986
+ svTextNormId = msg.textNorm;
5987
+
5988
+ var sessionOptions = { executionProviders: ['wasm'], graphOptimizationLevel: 'all' };
5989
+ if (msg.isIOS) {
5990
+ svSession = await ort.InferenceSession.create(msg.modelUrl, sessionOptions);
5991
+ } else {
5992
+ var modelResponse = await fetch(msg.modelUrl);
5993
+ if (!modelResponse.ok) throw new Error('Failed to fetch model: ' + modelResponse.status);
5994
+ var modelBuffer = await modelResponse.arrayBuffer();
5995
+ svSession = await ort.InferenceSession.create(new Uint8Array(modelBuffer), sessionOptions);
5996
+ }
5997
+
5998
+ try {
5999
+ var metadata = svSession.handler && svSession.handler.metadata;
6000
+ if (metadata && metadata.neg_mean && metadata.inv_stddev) {
6001
+ var cmvn = parseCMVNFromMetadata(metadata.neg_mean, metadata.inv_stddev);
6002
+ svNegMean = cmvn.negMean;
6003
+ svInvStddev = cmvn.invStddev;
6004
+ }
6005
+ } catch (e) { /* CMVN not available */ }
6006
+
6007
+ svVocabSize = 0;
6008
+ svTokenMap.forEach(function(val, key) { if (key >= svVocabSize) svVocabSize = key + 1; });
6009
+
6010
+ return {
6011
+ vocabSize: svVocabSize,
6012
+ inputNames: svSession.inputNames.slice(),
6013
+ outputNames: svSession.outputNames.slice(),
6014
+ };
6015
+ }
6016
+
6017
+ async function svTranscribe(audio) {
6018
+ var preprocessStart = performance.now();
6019
+ var fbank = computeKaldiFbank(audio, 16000, 80);
6020
+ var numFrames = fbank.length / 80;
6021
+ if (numFrames === 0) {
6022
+ return { text: '', inferenceTimeMs: performance.now() - preprocessStart, preprocessTimeMs: performance.now() - preprocessStart };
6023
+ }
6024
+ var lfrFeatures = applyLFR(fbank, 80, 7, 6);
6025
+ var numLfrFrames = lfrFeatures.length / 560;
6026
+ if (svNegMean && svInvStddev) applyCMVN(lfrFeatures, 560, svNegMean, svInvStddev);
6027
+ var preprocessTimeMs = performance.now() - preprocessStart;
6028
+
6029
+ var feeds = {
6030
+ x: new ort.Tensor('float32', lfrFeatures, [1, numLfrFrames, 560]),
6031
+ x_length: new ort.Tensor('int32', new Int32Array([numLfrFrames]), [1]),
6032
+ language: new ort.Tensor('int32', new Int32Array([svLanguageId]), [1]),
6033
+ text_norm: new ort.Tensor('int32', new Int32Array([svTextNormId]), [1]),
6034
+ };
6035
+ var results = await svSession.run(feeds);
6036
+ var logitsOutput = results['logits'];
6037
+ if (!logitsOutput) throw new Error('Model output missing "logits" tensor');
6038
+
6039
+ var decoded = ctcGreedyDecode(logitsOutput.data, logitsOutput.dims[1], logitsOutput.dims[2], svTokenMap);
6040
+ var totalTimeMs = performance.now() - preprocessStart;
6041
+
6042
+ return {
6043
+ text: decoded.text, language: decoded.language, emotion: decoded.emotion, event: decoded.event,
6044
+ inferenceTimeMs: totalTimeMs, preprocessTimeMs: preprocessTimeMs,
6045
+ };
6046
+ }
6047
+
6048
+ async function svDispose() {
6049
+ if (svSession) { await svSession.release(); svSession = null; }
6050
+ svTokenMap = null; svNegMean = null; svInvStddev = null;
6051
+ }
6052
+
6053
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
6054
+ // Wav2ArkitCpu handlers
6055
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
6056
+
6057
+ async function cpuLoad(msg) {
6058
+ var sessionOptions = { executionProviders: ['wasm'], graphOptimizationLevel: 'all' };
6059
+ var dataFilename = msg.externalDataUrl ? msg.externalDataUrl.split('/').pop() : null;
6060
+
6061
+ if (msg.isIOS) {
6062
+ if (msg.externalDataUrl && dataFilename) {
6063
+ sessionOptions.externalData = [{ path: dataFilename, data: msg.externalDataUrl }];
6064
+ }
6065
+ cpuSession = await ort.InferenceSession.create(msg.modelUrl, sessionOptions);
6066
+ } else {
6067
+ var graphResponse = await fetch(msg.modelUrl);
6068
+ if (!graphResponse.ok) throw new Error('Failed to fetch model graph: ' + graphResponse.status);
6069
+ var graphBuffer = await graphResponse.arrayBuffer();
6070
+ if (msg.externalDataUrl && dataFilename) {
6071
+ var dataResponse = await fetch(msg.externalDataUrl);
6072
+ if (!dataResponse.ok) throw new Error('Failed to fetch external data: ' + dataResponse.status);
6073
+ var dataBuffer = await dataResponse.arrayBuffer();
6074
+ sessionOptions.externalData = [{ path: dataFilename, data: new Uint8Array(dataBuffer) }];
6075
+ }
6076
+ cpuSession = await ort.InferenceSession.create(new Uint8Array(graphBuffer), sessionOptions);
6077
+ }
6078
+
6079
+ // Warmup
6080
+ var warmupAudio = new Float32Array(16000);
6081
+ var warmupTensor = new ort.Tensor('float32', warmupAudio, [1, warmupAudio.length]);
6082
+ await cpuSession.run({ audio_waveform: warmupTensor });
6083
+
6084
+ return {
6085
+ inputNames: cpuSession.inputNames.slice(),
6086
+ outputNames: cpuSession.outputNames.slice(),
6087
+ };
6088
+ }
6089
+
6090
+ async function cpuInfer(audio) {
6091
+ var tensor = new ort.Tensor('float32', audio, [1, audio.length]);
6092
+ var results = await cpuSession.run({ audio_waveform: tensor });
6093
+ var blendshapeOutput = results['blendshapes'];
6094
+ if (!blendshapeOutput) throw new Error('Missing blendshapes output from model');
6095
+
6096
+ var blendshapeData = blendshapeOutput.data;
6097
+ var numFrames = blendshapeOutput.dims[1];
6098
+ var numBlendshapes = blendshapeOutput.dims[2];
6099
+
6100
+ var flatBuffer = new Float32Array(numFrames * numBlendshapes);
6101
+ for (var f = 0; f < numFrames; f++) {
6102
+ var offset = f * numBlendshapes;
6103
+ var rawFrame = blendshapeData.slice(offset, offset + numBlendshapes);
6104
+ var symmetrized = symmetrizeBlendshapes(rawFrame);
6105
+ flatBuffer.set(symmetrized, offset);
6106
+ }
6107
+ return { flatBuffer: flatBuffer, numFrames: numFrames, numBlendshapes: numBlendshapes };
6108
+ }
6109
+
6110
+ async function cpuDispose() {
6111
+ if (cpuSession) { await cpuSession.release(); cpuSession = null; }
6112
+ }
6113
+
6114
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
6115
+ // Silero VAD handlers
6116
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
6117
+
6118
+ async function vadLoad(msg) {
6119
+ vadSampleRate = msg.sampleRate;
6120
+ vadChunkSize = vadSampleRate === 16000 ? 512 : 256;
6121
+ vadContextSize = vadSampleRate === 16000 ? 64 : 32;
6122
+
6123
+ var response = await fetch(msg.modelUrl);
6124
+ if (!response.ok) throw new Error('Failed to fetch VAD model: ' + response.status);
6125
+ var modelBuffer = await response.arrayBuffer();
6126
+ vadSession = await ort.InferenceSession.create(new Uint8Array(modelBuffer), {
6127
+ executionProviders: ['wasm'],
6128
+ graphOptimizationLevel: 'all',
6129
+ });
6130
+
6131
+ return {
6132
+ inputNames: vadSession.inputNames.slice(),
6133
+ outputNames: vadSession.outputNames.slice(),
6134
+ };
6135
+ }
6136
+
6137
+ async function vadProcess(audio, state, context) {
6138
+ var inputSize = vadContextSize + vadChunkSize;
6139
+ var inputBuffer = new Float32Array(inputSize);
6140
+ inputBuffer.set(context, 0);
6141
+ inputBuffer.set(audio, vadContextSize);
6142
+
6143
+ var inputTensor = new ort.Tensor('float32', new Float32Array(inputBuffer), [1, inputSize]);
6144
+ var stateTensor = new ort.Tensor('float32', new Float32Array(state), [2, 1, 128]);
6145
+ var srTensor;
6146
+ try {
6147
+ srTensor = new ort.Tensor('int64', new BigInt64Array([BigInt(vadSampleRate)]), []);
6148
+ } catch (e) {
6149
+ srTensor = new ort.Tensor('int64', [BigInt(vadSampleRate)], []);
6150
+ }
6151
+
6152
+ var feeds = { 'input': inputTensor, 'state': stateTensor, 'sr': srTensor };
6153
+ var results = await vadSession.run(feeds);
6154
+ var outputTensor = results['output'];
6155
+ var newStateTensor = results['stateN'] || results['state'];
6156
+ if (!outputTensor) throw new Error('Missing output tensor from VAD model');
6157
+
6158
+ return { probability: outputTensor.data[0], newState: new Float32Array(newStateTensor.data) };
6159
+ }
6160
+
6161
+ function vadCreateInitialState() {
6162
+ return new Float32Array(2 * 1 * 128);
6163
+ }
6164
+
6165
+ async function vadDispose() {
6166
+ if (vadSession) { await vadSession.release(); vadSession = null; }
6167
+ }
6168
+
6169
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
6170
+ // Message handler
6171
+ // \u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550
6172
+
6173
+ self.onmessage = async function(e) {
6174
+ var msg = e.data;
6175
+ var requestId = msg.requestId;
6176
+
6177
+ try {
6178
+ switch (msg.type) {
6179
+ case 'init': {
6180
+ var startTime = performance.now();
6181
+ await loadOrt(msg.wasmPaths, msg.isIOS);
6182
+ self.postMessage({ type: 'init:done', requestId: requestId, loadTimeMs: performance.now() - startTime });
6183
+ break;
6184
+ }
6185
+
6186
+ case 'sv:load': {
6187
+ var startTime = performance.now();
6188
+ var info = await svLoad(msg);
6189
+ self.postMessage({
6190
+ type: 'sv:loaded', requestId: requestId, vocabSize: info.vocabSize,
6191
+ inputNames: info.inputNames, outputNames: info.outputNames,
6192
+ loadTimeMs: performance.now() - startTime,
6193
+ });
6194
+ break;
6195
+ }
6196
+
6197
+ case 'sv:transcribe': {
6198
+ var result = await svTranscribe(msg.audio);
6199
+ self.postMessage({
6200
+ type: 'sv:result', requestId: requestId,
6201
+ text: result.text, language: result.language, emotion: result.emotion, event: result.event,
6202
+ inferenceTimeMs: result.inferenceTimeMs, preprocessTimeMs: result.preprocessTimeMs,
6203
+ });
6204
+ break;
6205
+ }
6206
+
6207
+ case 'sv:dispose': {
6208
+ await svDispose();
6209
+ self.postMessage({ type: 'sv:disposed', requestId: requestId });
6210
+ break;
6211
+ }
6212
+
6213
+ case 'cpu:load': {
6214
+ var startTime = performance.now();
6215
+ var info = await cpuLoad(msg);
6216
+ self.postMessage({
6217
+ type: 'cpu:loaded', requestId: requestId,
6218
+ inputNames: info.inputNames, outputNames: info.outputNames,
6219
+ loadTimeMs: performance.now() - startTime,
6220
+ });
6221
+ break;
6222
+ }
6223
+
6224
+ case 'cpu:infer': {
6225
+ var startTime = performance.now();
6226
+ var result = await cpuInfer(msg.audio);
6227
+ var inferenceTimeMs = performance.now() - startTime;
6228
+ self.postMessage({
6229
+ type: 'cpu:result', requestId: requestId,
6230
+ blendshapes: result.flatBuffer, numFrames: result.numFrames,
6231
+ numBlendshapes: result.numBlendshapes, inferenceTimeMs: inferenceTimeMs,
6232
+ }, [result.flatBuffer.buffer]);
6233
+ break;
6234
+ }
6235
+
6236
+ case 'cpu:dispose': {
6237
+ await cpuDispose();
6238
+ self.postMessage({ type: 'cpu:disposed', requestId: requestId });
6239
+ break;
6240
+ }
6241
+
6242
+ case 'vad:load': {
6243
+ var startTime = performance.now();
6244
+ var info = await vadLoad(msg);
6245
+ self.postMessage({
6246
+ type: 'vad:loaded', requestId: requestId,
6247
+ inputNames: info.inputNames, outputNames: info.outputNames,
6248
+ loadTimeMs: performance.now() - startTime,
6249
+ });
6250
+ break;
6251
+ }
6252
+
6253
+ case 'vad:process': {
6254
+ var startTime = performance.now();
6255
+ var result = await vadProcess(msg.audio, msg.state, msg.context);
6256
+ self.postMessage({
6257
+ type: 'vad:result', requestId: requestId,
6258
+ probability: result.probability, state: result.newState,
6259
+ inferenceTimeMs: performance.now() - startTime,
6260
+ });
6261
+ break;
6262
+ }
6263
+
6264
+ case 'vad:reset': {
6265
+ var state = vadCreateInitialState();
6266
+ self.postMessage({ type: 'vad:reset', requestId: requestId, state: state });
6267
+ break;
6268
+ }
6269
+
6270
+ case 'vad:dispose': {
6271
+ await vadDispose();
6272
+ self.postMessage({ type: 'vad:disposed', requestId: requestId });
6273
+ break;
6274
+ }
6275
+
6276
+ case 'dispose-all': {
6277
+ await svDispose();
6278
+ await cpuDispose();
6279
+ await vadDispose();
6280
+ ort = null;
6281
+ self.postMessage({ type: 'dispose-all:done', requestId: requestId });
6282
+ break;
6283
+ }
6284
+
6285
+ default:
6286
+ self.postMessage({ type: 'error', requestId: requestId, error: 'Unknown message type: ' + msg.type });
6287
+ }
6288
+ } catch (err) {
6289
+ var errorMsg = err.message || String(err);
6290
+ if (typeof err === 'number') {
6291
+ errorMsg = 'Raw C++ exception pointer (0x' + err.toString(16) + '). Likely OOM in WASM.';
6292
+ }
6293
+ self.postMessage({ type: 'error', requestId: requestId, error: errorMsg });
6294
+ }
6295
+ };
6296
+
6297
+ self.onerror = function(err) {
6298
+ self.postMessage({ type: 'error', requestId: null, error: 'Worker error: ' + (err.message || String(err)) });
6299
+ };
6300
+ `;
6301
var UnifiedInferenceWorker = class {
  constructor() {
    // Underlying Web Worker; created lazily by init().
    this.worker = null;
    // requestId -> { resolve, reject, timeout } for every in-flight message.
    this.pendingRequests = /* @__PURE__ */ new Map();
    this.initialized = false;
    // Set when a worker round-trip times out: the worker is presumed dead and
    // all further operations fail fast until dispose()/cleanup() resets it.
    this.poisoned = false;
  }
  /**
   * Initialize the worker (load ORT WASM from CDN).
   * No-op when already initialized. On failure the instance is cleaned up
   * (worker terminated, flags reset) and the error is rethrown, so init()
   * may be retried.
   */
  async init() {
    if (this.initialized) return;
    const startTime = performance.now();
    const telemetry = getTelemetry();
    const span = telemetry?.startSpan("UnifiedInferenceWorker.init");
    try {
      logger6.info("Creating unified inference worker...");
      this.worker = this.createWorker();
      await this.sendMessage(
        { type: "init", wasmPaths: WASM_CDN_PATH3, isIOS: isIOS() },
        "init:done",
        INIT_TIMEOUT_MS
      );
      this.initialized = true;
      const loadTimeMs = performance.now() - startTime;
      logger6.info("Unified worker initialized", { loadTimeMs: Math.round(loadTimeMs) });
      span?.setAttributes({ "worker.init_time_ms": loadTimeMs });
      span?.end();
    } catch (error) {
      span?.endWithError(error instanceof Error ? error : new Error(String(error)));
      this.cleanup();
      throw error;
    }
  }
  // ── SenseVoice ────────────────────────────────────────────────────────
  /**
   * Load the SenseVoice ASR model inside the worker.
   * @returns backend name, load time, session input/output names, vocab size.
   */
  async loadSenseVoice(config) {
    this.assertReady();
    const startTime = performance.now();
    const result = await this.sendMessage(
      {
        type: "sv:load",
        modelUrl: resolveUrl2(config.modelUrl),
        tokensUrl: resolveUrl2(config.tokensUrl),
        isIOS: isIOS(),
        language: config.language,
        textNorm: config.textNorm
      },
      "sv:loaded",
      SV_LOAD_TIMEOUT_MS
    );
    const loadTimeMs = performance.now() - startTime;
    return {
      backend: "wasm",
      loadTimeMs,
      inputNames: result.inputNames,
      outputNames: result.outputNames,
      vocabSize: result.vocabSize
    };
  }
  /** Run speech-to-text on a Float32Array of audio samples in the worker. */
  async transcribe(audio) {
    this.assertReady();
    const result = await this.sendMessage(
      { type: "sv:transcribe", audio },
      "sv:result",
      SV_INFER_TIMEOUT_MS
    );
    return {
      text: result.text,
      language: result.language,
      emotion: result.emotion,
      event: result.event,
      inferenceTimeMs: result.inferenceTimeMs,
      preprocessTimeMs: result.preprocessTimeMs
    };
  }
  /** Release the SenseVoice session in the worker (no-op without a worker). */
  async disposeSenseVoice() {
    if (!this.worker) return;
    await this.sendMessage({ type: "sv:dispose" }, "sv:disposed", DISPOSE_TIMEOUT_MS);
  }
  // ── Wav2ArkitCpu (Lip Sync) ──────────────────────────────────────────
  /**
   * Load the lip-sync ONNX model (and optional external-data sidecar)
   * inside the worker.
   */
  async loadLipSync(config) {
    this.assertReady();
    const startTime = performance.now();
    const result = await this.sendMessage(
      {
        type: "cpu:load",
        modelUrl: resolveUrl2(config.modelUrl),
        externalDataUrl: config.externalDataUrl ? resolveUrl2(config.externalDataUrl) : null,
        isIOS: isIOS()
      },
      "cpu:loaded",
      CPU_LOAD_TIMEOUT_MS
    );
    const loadTimeMs = performance.now() - startTime;
    return {
      backend: "wasm",
      loadTimeMs,
      inputNames: result.inputNames,
      outputNames: result.outputNames
    };
  }
  /**
   * Run lip-sync inference; resolves with the worker's flat blendshape
   * buffer plus frame/blendshape counts and timing.
   */
  async inferLipSync(audio) {
    this.assertReady();
    return this.sendMessage(
      { type: "cpu:infer", audio },
      "cpu:result",
      CPU_INFER_TIMEOUT_MS
    );
  }
  /** Release the lip-sync session in the worker (no-op without a worker). */
  async disposeLipSync() {
    if (!this.worker) return;
    await this.sendMessage({ type: "cpu:dispose" }, "cpu:disposed", DISPOSE_TIMEOUT_MS);
  }
  // ── Silero VAD ────────────────────────────────────────────────────────
  /**
   * Load the Silero VAD model inside the worker. The chunk size is derived
   * from the sample rate (512 samples at 16 kHz, else 256).
   */
  async loadVAD(config) {
    this.assertReady();
    const startTime = performance.now();
    const chunkSize = config.sampleRate === 16e3 ? 512 : 256;
    const result = await this.sendMessage(
      {
        type: "vad:load",
        modelUrl: resolveUrl2(config.modelUrl),
        sampleRate: config.sampleRate
      },
      "vad:loaded",
      VAD_LOAD_TIMEOUT_MS
    );
    const loadTimeMs = performance.now() - startTime;
    return {
      backend: "wasm",
      loadTimeMs,
      inputNames: result.inputNames,
      outputNames: result.outputNames,
      sampleRate: config.sampleRate,
      chunkSize
    };
  }
  /** Run one VAD step; resolves with speech probability and the new RNN state. */
  async processVAD(audio, state, context) {
    this.assertReady();
    return this.sendMessage(
      { type: "vad:process", audio, state, context },
      "vad:result",
      VAD_INFER_TIMEOUT_MS
    );
  }
  /** Ask the worker for a fresh zeroed VAD state and return it. */
  async resetVAD() {
    this.assertReady();
    const result = await this.sendMessage(
      { type: "vad:reset" },
      "vad:reset",
      VAD_INFER_TIMEOUT_MS
    );
    return result.state;
  }
  /** Release the VAD session in the worker (no-op without a worker). */
  async disposeVAD() {
    if (!this.worker) return;
    await this.sendMessage({ type: "vad:dispose" }, "vad:disposed", DISPOSE_TIMEOUT_MS);
  }
  // ── Lifecycle ─────────────────────────────────────────────────────────
  /**
   * Tear down the worker, rejecting everything in flight. Resets both the
   * initialized and poisoned flags so a fresh init() is possible afterwards.
   */
  async dispose() {
    if (this.worker) {
      try {
        await this.sendMessage({ type: "dispose-all" }, "dispose-all:done", DISPOSE_TIMEOUT_MS);
      } catch {
        // Best-effort: the worker may already be dead; terminate regardless.
      }
      this.worker.terminate();
      this.worker = null;
    }
    this.initialized = false;
    this.poisoned = false;
    // rejectAllPending also clears the pending-request map.
    this.rejectAllPending("Worker disposed");
  }
  /** Check if the worker is initialized and not poisoned */
  get isReady() {
    return this.initialized && !this.poisoned && this.worker !== null;
  }
  /** Check if Web Workers are supported */
  static isSupported() {
    return typeof Worker !== "undefined";
  }
  // ── Private ───────────────────────────────────────────────────────────
  /** Throw unless the worker is initialized and healthy. */
  assertReady() {
    if (!this.initialized || !this.worker) {
      throw new Error("UnifiedInferenceWorker not initialized. Call init() first.");
    }
    if (this.poisoned) {
      throw new Error("UnifiedInferenceWorker timed out \u2014 unavailable until page reload");
    }
  }
  /** Spawn the worker from the embedded script via a short-lived blob URL. */
  createWorker() {
    const blob = new Blob([WORKER_SCRIPT2], { type: "application/javascript" });
    const blobUrl = URL.createObjectURL(blob);
    const worker = new Worker(blobUrl);
    // Safe to revoke immediately: the Worker constructor has already
    // initiated the script fetch.
    URL.revokeObjectURL(blobUrl);
    worker.onmessage = (event) => {
      this.handleWorkerMessage(event.data);
    };
    worker.onerror = (error) => {
      logger6.error("Unified worker error", { error: error.message });
      this.rejectAllPending(`Worker error: ${error.message}`);
    };
    return worker;
  }
  /**
   * Settle the pending request matching a worker reply. An "error" reply
   * without a matching requestId is a broadcast failure and rejects every
   * in-flight request.
   */
  handleWorkerMessage(data) {
    const requestId = data.requestId;
    const pending = requestId ? this.pendingRequests.get(requestId) : void 0;
    if (data.type === "error") {
      if (pending) {
        clearTimeout(pending.timeout);
        this.pendingRequests.delete(requestId);
        pending.reject(new Error(data.error));
      } else {
        logger6.error("Worker broadcast error", { error: data.error });
        this.rejectAllPending(data.error);
      }
      return;
    }
    if (pending) {
      clearTimeout(pending.timeout);
      this.pendingRequests.delete(requestId);
      pending.resolve(data);
    }
  }
  /**
   * Post a message to the worker and wait for the reply carrying the same
   * requestId. A timeout poisons the instance: a dead WASM worker cannot be
   * trusted for any further operation.
   */
  sendMessage(message, expectedType, timeoutMs) {
    return new Promise((resolve, reject) => {
      if (!this.worker) {
        reject(new Error("Worker not initialized"));
        return;
      }
      const requestId = nextRequestId();
      const timeout = setTimeout(() => {
        this.pendingRequests.delete(requestId);
        this.poisoned = true;
        logger6.error("CRITICAL: Worker operation timed out \u2014 worker is dead", {
          type: message.type,
          timeoutMs
        });
        reject(new Error(`Worker operation '${message.type}' timed out after ${timeoutMs}ms`));
      }, timeoutMs);
      this.pendingRequests.set(requestId, {
        resolve,
        reject,
        timeout
      });
      this.worker.postMessage({ ...message, requestId });
    });
  }
  /** Reject every in-flight request with the given reason and clear the map. */
  rejectAllPending(reason) {
    for (const [, pending] of this.pendingRequests) {
      clearTimeout(pending.timeout);
      pending.reject(new Error(reason));
    }
    this.pendingRequests.clear();
  }
  /**
   * Terminate the worker after a failed init and reset all state.
   * BUGFIX: also resets `poisoned` — previously a timeout during init left
   * the flag set, so a later successful init() still reported not-ready and
   * assertReady() threw forever.
   */
  cleanup() {
    if (this.worker) {
      this.worker.terminate();
      this.worker = null;
    }
    this.initialized = false;
    this.poisoned = false;
    this.rejectAllPending("Worker cleanup");
  }
};
6567
var SenseVoiceUnifiedAdapter = class {
  /**
   * SenseVoice speech-recognition adapter backed by a shared
   * UnifiedInferenceWorker. Fills in config defaults (tokens.txt next to the
   * model, language "auto", text norm "with_itn") and serializes
   * transcribe() calls through an internal promise chain.
   */
  constructor(worker, config) {
    this._isLoaded = false;
    // Promise chain guaranteeing one transcription in flight at a time.
    this.inferenceQueue = Promise.resolve();
    this.worker = worker;
    const baseDir = config.modelUrl.substring(0, config.modelUrl.lastIndexOf("/"));
    this.config = {
      modelUrl: config.modelUrl,
      tokensUrl: config.tokensUrl ?? `${baseDir}/tokens.txt`,
      language: config.language ?? "auto",
      textNorm: config.textNorm ?? "with_itn"
    };
    this.languageId = resolveLanguageId(this.config.language);
    this.textNormId = resolveTextNormId(this.config.textNorm);
  }
  /** Whether load() has completed successfully. */
  get isLoaded() {
    return this._isLoaded;
  }
  /** Active backend name, or null before load(). */
  get backend() {
    return this._isLoaded ? "wasm" : null;
  }
  /**
   * Load the model in the worker, report 1/1 progress on success, and
   * record telemetry. Rethrows any failure after closing the span.
   */
  async load(onProgress) {
    const telemetry = getTelemetry();
    const span = telemetry?.startSpan("SenseVoiceUnifiedAdapter.load", {
      "model.url": this.config.modelUrl
    });
    try {
      const loadRequest = {
        modelUrl: this.config.modelUrl,
        tokensUrl: this.config.tokensUrl,
        language: this.languageId,
        textNorm: this.textNormId
      };
      const result = await this.worker.loadSenseVoice(loadRequest);
      this._isLoaded = true;
      onProgress?.(1, 1);
      logger6.info("SenseVoice loaded via unified worker", {
        backend: "wasm",
        loadTimeMs: Math.round(result.loadTimeMs),
        vocabSize: result.vocabSize
      });
      span?.setAttributes({ "model.backend": "wasm", "model.load_time_ms": result.loadTimeMs });
      span?.end();
      telemetry?.recordHistogram("omote.model.load_time", result.loadTimeMs, {
        model: "sensevoice-unified",
        backend: "wasm"
      });
      return result;
    } catch (error) {
      span?.endWithError(error instanceof Error ? error : new Error(String(error)));
      throw error;
    }
  }
  /**
   * Transcribe a chunk of audio. Copies the input so the caller may reuse
   * its buffer, then queues the request behind any in-flight inference.
   */
  async transcribe(audioSamples) {
    if (!this._isLoaded) throw new Error("Model not loaded. Call load() first.");
    const snapshot = new Float32Array(audioSamples);
    return new Promise((resolve, reject) => {
      this.inferenceQueue = this.inferenceQueue.then(
        () => this.worker.transcribe(snapshot).then(resolve, reject)
      );
    });
  }
  /** Release the model in the worker; safe to call when not loaded. */
  async dispose() {
    if (!this._isLoaded) return;
    await this.worker.disposeSenseVoice();
    this._isLoaded = false;
  }
};
6640
var Wav2ArkitCpuUnifiedAdapter = class {
  /**
   * Lip-sync (audio -> ARKit blendshapes) adapter backed by a shared
   * UnifiedInferenceWorker. Serializes infer() calls through an internal
   * promise chain so only one inference runs at a time.
   */
  constructor(worker, config) {
    this.modelId = "wav2arkit_cpu";
    this._isLoaded = false;
    // Promise chain guaranteeing a single in-flight inference.
    this.inferenceQueue = Promise.resolve();
    this.worker = worker;
    this.config = config;
  }
  /** Whether load() has completed successfully. */
  get isLoaded() {
    return this._isLoaded;
  }
  /** Active backend name, or null before load(). */
  get backend() {
    return this._isLoaded ? "wasm" : null;
  }
  /**
   * Load the ONNX graph in the worker, together with its external-data
   * sidecar (defaults to "<modelUrl>.data"; pass externalDataUrl === false
   * to opt out). Records telemetry and rethrows on failure.
   */
  async load() {
    const telemetry = getTelemetry();
    const span = telemetry?.startSpan("Wav2ArkitCpuUnifiedAdapter.load", {
      "model.url": this.config.modelUrl
    });
    try {
      let sidecarUrl = null;
      if (this.config.externalDataUrl !== false) {
        sidecarUrl = this.config.externalDataUrl || `${this.config.modelUrl}.data`;
      }
      const result = await this.worker.loadLipSync({
        modelUrl: this.config.modelUrl,
        externalDataUrl: sidecarUrl
      });
      this._isLoaded = true;
      logger6.info("Wav2ArkitCpu loaded via unified worker", {
        backend: "wasm",
        loadTimeMs: Math.round(result.loadTimeMs)
      });
      span?.setAttributes({ "model.backend": "wasm", "model.load_time_ms": result.loadTimeMs });
      span?.end();
      telemetry?.recordHistogram("omote.model.load_time", result.loadTimeMs, {
        model: "wav2arkit_cpu-unified",
        backend: "wasm"
      });
      return result;
    } catch (error) {
      span?.endWithError(error instanceof Error ? error : new Error(String(error)));
      throw error;
    }
  }
  /**
   * Run inference on a copy of the given audio samples and split the
   * worker's flat result buffer into one Float32Array per frame.
   * @param _identityIndex unused by the CPU backend; kept for interface parity.
   */
  async infer(audioSamples, _identityIndex) {
    if (!this._isLoaded) throw new Error("Model not loaded. Call load() first.");
    const pcm = new Float32Array(audioSamples);
    return new Promise((resolve, reject) => {
      this.inferenceQueue = this.inferenceQueue.then(async () => {
        const telemetry = getTelemetry();
        const span = telemetry?.startSpan("Wav2ArkitCpuUnifiedAdapter.infer", {
          "inference.input_samples": pcm.length
        });
        try {
          const t0 = performance.now();
          const raw = await this.worker.inferLipSync(pcm);
          const inferenceTimeMs = performance.now() - t0;
          const { numFrames, numBlendshapes } = raw;
          const flat = raw.blendshapes;
          const blendshapes = [];
          for (let offset = 0; offset < numFrames * numBlendshapes; offset += numBlendshapes) {
            blendshapes.push(flat.slice(offset, offset + numBlendshapes));
          }
          span?.setAttributes({
            "inference.duration_ms": inferenceTimeMs,
            "inference.frames": numFrames
          });
          span?.end();
          resolve({ blendshapes, numFrames, inferenceTimeMs });
        } catch (err) {
          span?.endWithError(err instanceof Error ? err : new Error(String(err)));
          reject(err);
        }
      });
    });
  }
  /** Release the model in the worker; safe to call when not loaded. */
  async dispose() {
    if (!this._isLoaded) return;
    await this.worker.disposeLipSync();
    this._isLoaded = false;
  }
};
6721
+ var SileroVADUnifiedAdapter = class {
6722
+ constructor(worker, config) {
6723
+ this._isLoaded = false;
6724
+ // Inference queue
6725
+ this.inferenceQueue = Promise.resolve();
6726
+ // Pre-speech buffer
6727
+ this.preSpeechBuffer = [];
6728
+ this.wasSpeaking = false;
6729
+ this.worker = worker;
6730
+ const sr = config.sampleRate ?? 16e3;
6731
+ this.config = {
6732
+ modelUrl: config.modelUrl,
6733
+ backend: config.backend ?? "wasm",
6734
+ sampleRate: sr,
6735
+ threshold: config.threshold ?? 0.5,
6736
+ preSpeechBufferChunks: config.preSpeechBufferChunks ?? 10
6737
+ };
6738
+ this.chunkSize = sr === 16e3 ? 512 : 256;
6739
+ this.contextSize = sr === 16e3 ? 64 : 32;
6740
+ this.state = new Float32Array(2 * 1 * 128);
6741
+ this.context = new Float32Array(this.contextSize);
6742
+ }
6743
+ get isLoaded() {
6744
+ return this._isLoaded;
6745
+ }
6746
+ get backend() {
6747
+ return this._isLoaded ? "wasm" : null;
6748
+ }
6749
+ get sampleRate() {
6750
+ return this.config.sampleRate;
6751
+ }
6752
+ get threshold() {
6753
+ return this.config.threshold;
6754
+ }
6755
+ getChunkSize() {
6756
+ return this.chunkSize;
6757
+ }
6758
+ getChunkDurationMs() {
6759
+ return this.chunkSize / this.config.sampleRate * 1e3;
6760
+ }
6761
+ async load() {
6762
+ const telemetry = getTelemetry();
6763
+ const span = telemetry?.startSpan("SileroVADUnifiedAdapter.load", {
6764
+ "model.url": this.config.modelUrl
6765
+ });
6766
+ try {
6767
+ const result = await this.worker.loadVAD({
6768
+ modelUrl: this.config.modelUrl,
6769
+ sampleRate: this.config.sampleRate
6770
+ });
6771
+ this._isLoaded = true;
6772
+ logger6.info("SileroVAD loaded via unified worker", {
6773
+ backend: "wasm",
6774
+ loadTimeMs: Math.round(result.loadTimeMs),
6775
+ sampleRate: this.config.sampleRate,
6776
+ chunkSize: this.chunkSize
6777
+ });
6778
+ span?.setAttributes({ "model.backend": "wasm", "model.load_time_ms": result.loadTimeMs });
6779
+ span?.end();
6780
+ telemetry?.recordHistogram("omote.model.load_time", result.loadTimeMs, {
6781
+ model: "silero-vad-unified",
6782
+ backend: "wasm"
6783
+ });
6784
+ return result;
6785
+ } catch (error) {
6786
+ span?.endWithError(error instanceof Error ? error : new Error(String(error)));
6787
+ throw error;
6788
+ }
6789
+ }
6790
+ async process(audioChunk) {
6791
+ if (!this._isLoaded) throw new Error("Model not loaded. Call load() first.");
6792
+ if (audioChunk.length !== this.chunkSize) {
6793
+ throw new Error(
6794
+ `Audio chunk must be exactly ${this.chunkSize} samples (got ${audioChunk.length}). Use getChunkSize() to get required size.`
6795
+ );
6796
+ }
6797
+ const audioChunkCopy = new Float32Array(audioChunk);
6798
+ return new Promise((resolve, reject) => {
6799
+ this.inferenceQueue = this.inferenceQueue.then(async () => {
6800
+ try {
6801
+ const startTime = performance.now();
6802
+ const result = await this.worker.processVAD(audioChunkCopy, this.state, this.context);
6803
+ this.state = result.state;
6804
+ this.context = audioChunkCopy.slice(-this.contextSize);
6805
+ const inferenceTimeMs = performance.now() - startTime;
6806
+ const isSpeech = result.probability > this.config.threshold;
6807
+ let preSpeechChunks;
6808
+ if (isSpeech && !this.wasSpeaking) {
6809
+ preSpeechChunks = [...this.preSpeechBuffer];
6810
+ this.preSpeechBuffer = [];
6811
+ } else if (!isSpeech && !this.wasSpeaking) {
6812
+ this.preSpeechBuffer.push(new Float32Array(audioChunkCopy));
6813
+ if (this.preSpeechBuffer.length > this.config.preSpeechBufferChunks) {
6814
+ this.preSpeechBuffer.shift();
6815
+ }
6816
+ } else if (!isSpeech && this.wasSpeaking) {
6817
+ this.preSpeechBuffer = [];
6818
+ }
6819
+ this.wasSpeaking = isSpeech;
6820
+ resolve({
6821
+ probability: result.probability,
6822
+ isSpeech,
6823
+ inferenceTimeMs,
6824
+ preSpeechChunks
6825
+ });
6826
+ } catch (err) {
6827
+ reject(err);
6828
+ }
6829
+ });
6830
+ });
6831
+ }
6832
+ async reset() {
6833
+ if (!this._isLoaded) throw new Error("Model not loaded. Call load() first.");
6834
+ const newState = await this.worker.resetVAD();
6835
+ this.state = newState;
6836
+ this.context = new Float32Array(this.contextSize);
6837
+ this.preSpeechBuffer = [];
6838
+ this.wasSpeaking = false;
6839
+ }
6840
+ async dispose() {
6841
+ if (this._isLoaded) {
6842
+ await this.worker.disposeVAD();
6843
+ this._isLoaded = false;
6844
+ }
6845
+ this.state = new Float32Array(2 * 1 * 128);
6846
+ this.context = new Float32Array(this.contextSize);
6847
+ this.preSpeechBuffer = [];
6848
+ this.wasSpeaking = false;
6849
+ }
6850
+ };
6851
+
6852
+ // src/inference/createSenseVoice.ts
6853
+ var logger7 = createLogger("createSenseVoice");
6854
+ function createSenseVoice(config) {
6855
+ if (config.unifiedWorker) {
6856
+ logger7.info("Creating SenseVoiceUnifiedAdapter (shared unified worker)");
6857
+ return new SenseVoiceUnifiedAdapter(config.unifiedWorker, {
6858
+ modelUrl: config.modelUrl,
6859
+ tokensUrl: config.tokensUrl,
6860
+ language: config.language,
6861
+ textNorm: config.textNorm
6862
+ });
6863
+ }
6864
+ const useWorker = config.useWorker ?? "auto";
6865
+ if (useWorker === true) {
6866
+ if (!SenseVoiceWorker.isSupported()) {
6867
+ throw new Error("Web Workers are not supported in this environment");
6868
+ }
6869
+ logger7.info("Creating SenseVoiceWorker (off-main-thread)");
6870
+ return new SenseVoiceWorker({
6871
+ modelUrl: config.modelUrl,
6872
+ tokensUrl: config.tokensUrl,
6873
+ language: config.language,
6874
+ textNorm: config.textNorm
6875
+ });
6876
+ }
6877
+ if (useWorker === false) {
6878
+ logger7.info("Creating SenseVoiceInference (main thread)");
6879
+ return new SenseVoiceInference({
6880
+ modelUrl: config.modelUrl,
6881
+ tokensUrl: config.tokensUrl,
6882
+ language: config.language,
6883
+ textNorm: config.textNorm
6884
+ });
6885
+ }
6886
+ if (SenseVoiceWorker.isSupported() && !isIOS()) {
6887
+ logger7.info("Auto-detected: creating SenseVoiceWorker (off-main-thread)");
6888
+ return new SenseVoiceWorker({
6889
+ modelUrl: config.modelUrl,
6890
+ tokensUrl: config.tokensUrl,
6891
+ language: config.language,
6892
+ textNorm: config.textNorm
6893
+ });
6894
+ }
6895
+ logger7.info("Auto-detected: creating SenseVoiceInference (main thread)", {
6896
+ reason: isIOS() ? "iOS (shared ORT instance)" : "Worker unsupported"
6897
+ });
6898
+ return new SenseVoiceInference({
6899
+ modelUrl: config.modelUrl,
6900
+ tokensUrl: config.tokensUrl,
6901
+ language: config.language,
6902
+ textNorm: config.textNorm
6903
+ });
6904
+ }
6905
+
6906
+ // src/inference/Wav2ArkitCpuInference.ts
6907
+ var logger8 = createLogger("Wav2ArkitCpu");
6908
+ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
6909
+ constructor(config) {
6910
+ this.modelId = "wav2arkit_cpu";
6911
+ this.session = null;
6912
+ this.ort = null;
6913
+ this._backend = "wasm";
6914
+ this.isLoading = false;
6915
+ // Inference queue for handling concurrent calls
6916
+ this.inferenceQueue = Promise.resolve();
6917
+ // Session health: set to true if session.run() times out.
6918
+ // A timed-out session may have a zombie WASM dispatch still running,
6919
+ // so all future infer() calls reject immediately to prevent concurrent access.
6920
+ this.poisoned = false;
6921
+ this.config = config;
6922
+ }
6923
+ get backend() {
6924
+ return this.session ? this._backend : null;
6925
+ }
6926
+ get isLoaded() {
6927
+ return this.session !== null;
6928
+ }
6929
+ /**
6930
+ * Load the ONNX model
6931
+ */
6932
+ async load() {
6933
+ if (this.isLoading) {
6934
+ throw new Error("Model is already loading");
6935
+ }
6936
+ if (this.session) {
6937
+ throw new Error("Model already loaded. Call dispose() first.");
6938
+ }
6939
+ this.isLoading = true;
6940
+ const startTime = performance.now();
6941
+ const telemetry = getTelemetry();
6942
+ const span = telemetry?.startSpan("Wav2ArkitCpu.load", {
6943
+ "model.url": this.config.modelUrl,
6944
+ "model.backend_requested": this.config.backend || "wasm"
6945
+ });
6946
+ try {
6947
+ const preference = this.config.backend || "wasm";
6948
+ logger8.info("Loading ONNX Runtime...", { preference });
6949
+ const { ort, backend } = await getOnnxRuntimeForPreference(preference);
6950
+ this.ort = ort;
6951
+ this._backend = backend;
6952
+ logger8.info("ONNX Runtime loaded", { backend: this._backend });
6953
+ const modelUrl = this.config.modelUrl;
6954
+ const dataUrl = this.config.externalDataUrl !== false ? this.config.externalDataUrl || `${modelUrl}.data` : null;
6955
+ const sessionOptions = getSessionOptions(this._backend);
6956
+ if (isIOS()) {
6957
+ logger8.info("iOS: passing model URLs directly to ORT (low-memory path)", {
6958
+ modelUrl,
6959
+ dataUrl
6960
+ });
6961
+ if (dataUrl) {
6962
+ const dataFilename = dataUrl.split("/").pop();
6963
+ sessionOptions.externalData = [{
6964
+ path: dataFilename,
6965
+ data: dataUrl
6966
+ // URL string — ORT fetches directly into WASM
6967
+ }];
6968
+ }
6969
+ this.session = await this.ort.InferenceSession.create(modelUrl, sessionOptions);
6970
+ } else {
6971
+ const cache = getModelCache();
6972
+ const isCached = await cache.has(modelUrl);
6973
+ let modelBuffer;
6974
+ if (isCached) {
6975
+ logger8.debug("Loading model from cache", { modelUrl });
6976
+ modelBuffer = await cache.get(modelUrl);
6977
+ if (!modelBuffer) {
6978
+ logger8.warn("Cache corruption detected, clearing and retrying", { modelUrl });
6979
+ await cache.delete(modelUrl);
6980
+ modelBuffer = await fetchWithCache(modelUrl);
6981
+ }
6982
+ } else {
6983
+ logger8.debug("Fetching and caching model graph", { modelUrl });
6984
+ modelBuffer = await fetchWithCache(modelUrl);
6985
+ }
6986
+ if (!modelBuffer) {
6987
+ throw new Error(`Failed to load model: ${modelUrl}`);
6988
+ }
6989
+ let externalDataBuffer = null;
6990
+ if (dataUrl) {
6991
+ try {
6992
+ const isDataCached = await cache.has(dataUrl);
6993
+ if (isDataCached) {
6994
+ logger8.debug("Loading external data from cache", { dataUrl });
6995
+ externalDataBuffer = await cache.get(dataUrl);
6996
+ if (!externalDataBuffer) {
6997
+ logger8.warn("Cache corruption for external data, retrying", { dataUrl });
6998
+ await cache.delete(dataUrl);
6999
+ externalDataBuffer = await fetchWithCache(dataUrl);
7000
+ }
7001
+ } else {
7002
+ logger8.info("Fetching external model data", {
7003
+ dataUrl,
7004
+ note: "This may be a large download (400MB+)"
7005
+ });
7006
+ externalDataBuffer = await fetchWithCache(dataUrl);
7007
+ }
7008
+ logger8.info("External data loaded", {
7009
+ size: formatBytes(externalDataBuffer.byteLength)
7010
+ });
7011
+ } catch (err) {
7012
+ logger8.debug("No external data file found (single-file model)", {
7013
+ dataUrl,
7014
+ error: err.message
7015
+ });
7016
+ }
7017
+ }
7018
+ logger8.debug("Creating ONNX session", {
7019
+ graphSize: formatBytes(modelBuffer.byteLength),
7020
+ externalDataSize: externalDataBuffer ? formatBytes(externalDataBuffer.byteLength) : "none",
7021
+ backend: this._backend
7022
+ });
7023
+ if (externalDataBuffer) {
7024
+ const dataFilename = dataUrl.split("/").pop();
7025
+ sessionOptions.externalData = [{
7026
+ path: dataFilename,
7027
+ data: new Uint8Array(externalDataBuffer)
7028
+ }];
7029
+ }
7030
+ const modelData = new Uint8Array(modelBuffer);
7031
+ this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
7032
+ }
7033
+ const loadTimeMs = performance.now() - startTime;
7034
+ logger8.info("Model loaded successfully", {
7035
+ backend: this._backend,
7036
+ loadTimeMs: Math.round(loadTimeMs),
7037
+ inputs: this.session.inputNames,
7038
+ outputs: this.session.outputNames
7039
+ });
7040
+ span?.setAttributes({
7041
+ "model.backend": this._backend,
7042
+ "model.load_time_ms": loadTimeMs,
7043
+ "model.cached": !isIOS()
7044
+ });
7045
+ span?.end();
7046
+ telemetry?.recordHistogram("omote.model.load_time", loadTimeMs, {
7047
+ model: "wav2arkit_cpu",
7048
+ backend: this._backend
7049
+ });
7050
+ logger8.debug("Running warmup inference");
7051
+ const warmupStart = performance.now();
7052
+ const silentAudio = new Float32Array(16e3);
7053
+ await this.infer(silentAudio);
7054
+ const warmupTimeMs = performance.now() - warmupStart;
7055
+ logger8.info("Warmup inference complete", {
7056
+ warmupTimeMs: Math.round(warmupTimeMs),
7057
+ backend: this._backend
7058
+ });
7059
+ telemetry?.recordHistogram("omote.model.warmup_time", warmupTimeMs, {
7060
+ model: "wav2arkit_cpu",
7061
+ backend: this._backend
7062
+ });
7063
+ return {
7064
+ backend: this._backend,
7065
+ loadTimeMs,
7066
+ inputNames: [...this.session.inputNames],
7067
+ outputNames: [...this.session.outputNames]
7068
+ };
7069
+ } catch (error) {
7070
+ span?.endWithError(error instanceof Error ? error : new Error(String(error)));
7071
+ telemetry?.incrementCounter("omote.errors.total", 1, {
7072
+ model: "wav2arkit_cpu",
7073
+ error_type: "load_failed"
7074
+ });
7075
+ throw error;
7076
+ } finally {
7077
+ this.isLoading = false;
7078
+ }
7079
+ }
7080
+ /**
7081
+ * Run inference on raw audio
7082
+ *
7083
+ * Accepts variable-length audio (not fixed to 16000 samples).
7084
+ * Output frames = ceil(30 * numSamples / 16000).
7085
+ *
7086
+ * @param audioSamples - Float32Array of raw audio at 16kHz
7087
+ * @param _identityIndex - Ignored (identity 11 is baked into the model)
7088
+ */
7089
+ async infer(audioSamples, _identityIndex) {
7090
+ if (!this.session) {
7091
+ throw new Error("Model not loaded. Call load() first.");
7092
+ }
7093
+ if (this.poisoned) {
7094
+ throw new Error("Wav2ArkitCpu session timed out \u2014 inference unavailable until page reload");
7095
+ }
7096
+ const audioCopy = new Float32Array(audioSamples);
7097
+ const feeds = {
7098
+ "audio_waveform": new this.ort.Tensor("float32", audioCopy, [1, audioCopy.length])
7099
+ };
7100
+ return this.queueInference(feeds, audioCopy.length);
7101
+ }
7102
+ /**
7103
+ * Queue inference to serialize ONNX session calls
7104
+ */
7105
+ queueInference(feeds, inputSamples) {
7106
+ return new Promise((resolve, reject) => {
7107
+ this.inferenceQueue = this.inferenceQueue.then(async () => {
7108
+ const telemetry = getTelemetry();
7109
+ const span = telemetry?.startSpan("Wav2ArkitCpu.infer", {
7110
+ "inference.backend": this._backend,
7111
+ "inference.input_samples": inputSamples
7112
+ });
7113
+ try {
7114
+ const startTime = performance.now();
4822
7115
  let timeoutId;
4823
7116
  const results = await Promise.race([
4824
7117
  this.session.run(feeds).then((r) => {
@@ -4833,36 +7126,576 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
4833
7126
  })
4834
7127
  ]);
4835
7128
  const inferenceTimeMs = performance.now() - startTime;
4836
- const blendshapeOutput = results["blendshapes"];
4837
- if (!blendshapeOutput) {
4838
- throw new Error("Missing blendshapes output from model");
4839
- }
4840
- const blendshapeData = blendshapeOutput.data;
4841
- const numFrames = blendshapeOutput.dims[1];
4842
- const numBlendshapes = blendshapeOutput.dims[2];
7129
+ const blendshapeOutput = results["blendshapes"];
7130
+ if (!blendshapeOutput) {
7131
+ throw new Error("Missing blendshapes output from model");
7132
+ }
7133
+ const blendshapeData = blendshapeOutput.data;
7134
+ const numFrames = blendshapeOutput.dims[1];
7135
+ const numBlendshapes = blendshapeOutput.dims[2];
7136
+ const blendshapes = [];
7137
+ for (let f = 0; f < numFrames; f++) {
7138
+ const rawFrame = blendshapeData.slice(f * numBlendshapes, (f + 1) * numBlendshapes);
7139
+ const symmetrized = symmetrizeBlendshapes(rawFrame);
7140
+ blendshapes.push(symmetrized);
7141
+ }
7142
+ logger8.trace("Inference completed", {
7143
+ inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
7144
+ numFrames,
7145
+ inputSamples
7146
+ });
7147
+ span?.setAttributes({
7148
+ "inference.duration_ms": inferenceTimeMs,
7149
+ "inference.frames": numFrames
7150
+ });
7151
+ span?.end();
7152
+ telemetry?.recordHistogram("omote.inference.latency", inferenceTimeMs, {
7153
+ model: "wav2arkit_cpu",
7154
+ backend: this._backend
7155
+ });
7156
+ telemetry?.incrementCounter("omote.inference.total", 1, {
7157
+ model: "wav2arkit_cpu",
7158
+ backend: this._backend,
7159
+ status: "success"
7160
+ });
7161
+ resolve({
7162
+ blendshapes,
7163
+ numFrames,
7164
+ inferenceTimeMs
7165
+ });
7166
+ } catch (err) {
7167
+ const errMsg = err instanceof Error ? err.message : String(err);
7168
+ if (errMsg.includes("timed out")) {
7169
+ this.poisoned = true;
7170
+ logger8.error("CRITICAL: Inference session timed out \u2014 Wav2ArkitCpu is dead. Page reload required.", {
7171
+ backend: this._backend,
7172
+ timeoutMs: _Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS
7173
+ });
7174
+ } else if (typeof err === "number") {
7175
+ const oomError = new Error(
7176
+ `Wav2ArkitCpu inference failed with raw C++ exception pointer (0x${err.toString(16)}). This is likely an OOM crash in WASM. Try reloading the page.`
7177
+ );
7178
+ logger8.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
7179
+ pointer: `0x${err.toString(16)}`,
7180
+ backend: this._backend
7181
+ });
7182
+ span?.endWithError(oomError);
7183
+ telemetry?.incrementCounter("omote.inference.total", 1, {
7184
+ model: "wav2arkit_cpu",
7185
+ backend: this._backend,
7186
+ status: "error"
7187
+ });
7188
+ reject(oomError);
7189
+ return;
7190
+ } else {
7191
+ logger8.error("Inference failed", { error: errMsg, backend: this._backend });
7192
+ }
7193
+ span?.endWithError(err instanceof Error ? err : new Error(String(err)));
7194
+ telemetry?.incrementCounter("omote.inference.total", 1, {
7195
+ model: "wav2arkit_cpu",
7196
+ backend: this._backend,
7197
+ status: "error"
7198
+ });
7199
+ reject(err);
7200
+ }
7201
+ });
7202
+ });
7203
+ }
7204
+ /**
7205
+ * Dispose of the model and free resources
7206
+ */
7207
+ async dispose() {
7208
+ if (this.session) {
7209
+ await this.session.release();
7210
+ this.session = null;
7211
+ }
7212
+ }
7213
+ };
7214
+ _Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS = 5e3;
7215
+ var Wav2ArkitCpuInference = _Wav2ArkitCpuInference;
7216
+
7217
+ // src/inference/Wav2ArkitCpuWorker.ts
7218
+ var logger9 = createLogger("Wav2ArkitCpuWorker");
7219
+ var WASM_CDN_PATH4 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
7220
+ var LOAD_TIMEOUT_MS2 = 6e4;
7221
+ var INFERENCE_TIMEOUT_MS2 = 5e3;
7222
+ function resolveUrl3(url) {
7223
+ if (/^https?:\/\//i.test(url) || /^blob:/i.test(url)) return url;
7224
+ try {
7225
+ return new URL(url, globalThis.location?.origin ?? "https://localhost").href;
7226
+ } catch {
7227
+ return url;
7228
+ }
7229
+ }
7230
+ var WORKER_SCRIPT3 = `
7231
+ // Wav2ArkitCpu Worker Script
7232
+ // Loaded via Blob URL - no separate file needed
7233
+
7234
+ var ort = null;
7235
+ var session = null;
7236
+
7237
+ // Precomputed symmetric index pairs from LAM_BLENDSHAPES alphabetical ordering
7238
+ // Used to average left/right blendshape pairs for symmetrized output
7239
+ const SYMMETRIC_INDEX_PAIRS = [
7240
+ [23, 25], // jawLeft, jawRight
7241
+ [32, 38], // mouthLeft, mouthRight
7242
+ [43, 44], // mouthSmileLeft, mouthSmileRight
7243
+ [29, 30], // mouthFrownLeft, mouthFrownRight
7244
+ [27, 28], // mouthDimpleLeft, mouthDimpleRight
7245
+ [45, 46], // mouthStretchLeft, mouthStretchRight
7246
+ [35, 36], // mouthPressLeft, mouthPressRight
7247
+ [47, 48], // mouthUpperUpLeft, mouthUpperUpRight
7248
+ [33, 34], // mouthLowerDownLeft, mouthLowerDownRight
7249
+ [49, 50], // noseSneerLeft, noseSneerRight
7250
+ [6, 7], // cheekSquintLeft, cheekSquintRight
7251
+ [0, 1], // browDownLeft, browDownRight
7252
+ [3, 4], // browOuterUpLeft, browOuterUpRight
7253
+ [8, 9], // eyeBlinkLeft, eyeBlinkRight
7254
+ [16, 17], // eyeLookUpLeft, eyeLookUpRight
7255
+ [10, 11], // eyeLookDownLeft, eyeLookDownRight
7256
+ [12, 13], // eyeLookInLeft, eyeLookInRight
7257
+ [14, 15], // eyeLookOutLeft, eyeLookOutRight
7258
+ [18, 19], // eyeSquintLeft, eyeSquintRight
7259
+ [20, 21], // eyeWideLeft, eyeWideRight
7260
+ ];
7261
+
7262
+ /**
7263
+ * Symmetrize blendshapes by averaging left/right pairs
7264
+ * Inlined from blendshapeUtils.ts for worker context
7265
+ */
7266
+ function symmetrizeBlendshapes(frame) {
7267
+ const result = new Float32Array(frame);
7268
+ for (const [lIdx, rIdx] of SYMMETRIC_INDEX_PAIRS) {
7269
+ const avg = (frame[lIdx] + frame[rIdx]) / 2;
7270
+ result[lIdx] = avg;
7271
+ result[rIdx] = avg;
7272
+ }
7273
+ return result;
7274
+ }
7275
+
7276
+ /**
7277
+ * Load ONNX Runtime from CDN
7278
+ */
7279
+ async function loadOrt(wasmPaths) {
7280
+ if (ort) return;
7281
+
7282
+ // Import ONNX Runtime from CDN
7283
+ const ortUrl = wasmPaths + 'ort.wasm.min.js';
7284
+
7285
+ // Load the script by fetching and executing it
7286
+ const response = await fetch(ortUrl);
7287
+ const scriptText = await response.text();
7288
+
7289
+ // Create a blob URL for the script
7290
+ const blob = new Blob([scriptText], { type: 'application/javascript' });
7291
+ const blobUrl = URL.createObjectURL(blob);
7292
+
7293
+ // Import the module
7294
+ importScripts(blobUrl);
7295
+ URL.revokeObjectURL(blobUrl);
7296
+
7297
+ // ort is now available as global
7298
+ ort = self.ort;
7299
+
7300
+ // Configure WASM settings
7301
+ ort.env.wasm.wasmPaths = wasmPaths;
7302
+ ort.env.wasm.numThreads = 1; // Single thread in worker
7303
+ ort.env.wasm.simd = true;
7304
+ ort.env.wasm.proxy = false; // No proxy in worker
7305
+ }
7306
+
7307
+ /**
7308
+ * Load the wav2arkit_cpu model
7309
+ */
7310
+ async function loadModel(modelUrl, externalDataUrl, isIOS) {
7311
+ const sessionOptions = {
7312
+ executionProviders: ['wasm'],
7313
+ graphOptimizationLevel: 'all',
7314
+ };
7315
+
7316
+ const dataFilename = externalDataUrl ? externalDataUrl.split('/').pop() : null;
7317
+
7318
+ if (isIOS) {
7319
+ // iOS: Pass URLs directly to ORT to avoid loading 402MB into JS heap.
7320
+ // ORT fetches externally into WASM memory, cutting peak JS memory from
7321
+ // ~800MB to ~2MB (just the graph).
7322
+ if (externalDataUrl && dataFilename) {
7323
+ sessionOptions.externalData = [{ path: dataFilename, data: externalDataUrl }];
7324
+ }
7325
+ session = await ort.InferenceSession.create(modelUrl, sessionOptions);
7326
+ } else {
7327
+ // Desktop: fetch model graph as ArrayBuffer
7328
+ const graphResponse = await fetch(modelUrl);
7329
+ if (!graphResponse.ok) {
7330
+ throw new Error('Failed to fetch model graph: ' + graphResponse.status + ' ' + graphResponse.statusText);
7331
+ }
7332
+ const graphBuffer = await graphResponse.arrayBuffer();
7333
+
7334
+ // Fetch external data file if present
7335
+ if (externalDataUrl && dataFilename) {
7336
+ const dataResponse = await fetch(externalDataUrl);
7337
+ if (!dataResponse.ok) {
7338
+ throw new Error('Failed to fetch external data: ' + dataResponse.status + ' ' + dataResponse.statusText);
7339
+ }
7340
+ const dataBuffer = await dataResponse.arrayBuffer();
7341
+ sessionOptions.externalData = [{ path: dataFilename, data: new Uint8Array(dataBuffer) }];
7342
+ }
7343
+
7344
+ session = await ort.InferenceSession.create(new Uint8Array(graphBuffer), sessionOptions);
7345
+ }
7346
+
7347
+ // Warmup inference with 16000 silent samples
7348
+ const warmupAudio = new Float32Array(16000);
7349
+ const warmupTensor = new ort.Tensor('float32', warmupAudio, [1, warmupAudio.length]);
7350
+ await session.run({ audio_waveform: warmupTensor });
7351
+
7352
+ return {
7353
+ inputNames: session.inputNames.slice(),
7354
+ outputNames: session.outputNames.slice(),
7355
+ };
7356
+ }
7357
+
7358
+ /**
7359
+ * Run lip sync inference
7360
+ */
7361
+ async function runInference(audio) {
7362
+ const tensor = new ort.Tensor('float32', audio, [1, audio.length]);
7363
+ const results = await session.run({ audio_waveform: tensor });
7364
+
7365
+ const blendshapeOutput = results['blendshapes'];
7366
+ if (!blendshapeOutput) {
7367
+ throw new Error('Missing blendshapes output from model');
7368
+ }
7369
+
7370
+ const blendshapeData = blendshapeOutput.data;
7371
+ const numFrames = blendshapeOutput.dims[1];
7372
+ const numBlendshapes = blendshapeOutput.dims[2];
7373
+
7374
+ // Symmetrize each frame and flatten into a single Float32Array for transfer
7375
+ const flatBuffer = new Float32Array(numFrames * numBlendshapes);
7376
+ for (let f = 0; f < numFrames; f++) {
7377
+ const offset = f * numBlendshapes;
7378
+ const rawFrame = blendshapeData.slice(offset, offset + numBlendshapes);
7379
+ const symmetrized = symmetrizeBlendshapes(rawFrame);
7380
+ flatBuffer.set(symmetrized, offset);
7381
+ }
7382
+
7383
+ return { flatBuffer, numFrames, numBlendshapes };
7384
+ }
7385
+
7386
+ // Message handler
7387
+ self.onmessage = async function(e) {
7388
+ const msg = e.data;
7389
+
7390
+ try {
7391
+ switch (msg.type) {
7392
+ case 'load': {
7393
+ const startTime = performance.now();
7394
+ await loadOrt(msg.wasmPaths);
7395
+ const { inputNames, outputNames } = await loadModel(msg.modelUrl, msg.externalDataUrl, msg.isIOS);
7396
+ const loadTimeMs = performance.now() - startTime;
7397
+
7398
+ self.postMessage({
7399
+ type: 'loaded',
7400
+ inputNames,
7401
+ outputNames,
7402
+ loadTimeMs,
7403
+ });
7404
+ break;
7405
+ }
7406
+
7407
+ case 'infer': {
7408
+ const startTime = performance.now();
7409
+ const { flatBuffer, numFrames, numBlendshapes } = await runInference(msg.audio);
7410
+ const inferenceTimeMs = performance.now() - startTime;
7411
+
7412
+ self.postMessage({
7413
+ type: 'result',
7414
+ blendshapes: flatBuffer,
7415
+ numFrames,
7416
+ numBlendshapes,
7417
+ inferenceTimeMs,
7418
+ }, [flatBuffer.buffer]);
7419
+ break;
7420
+ }
7421
+
7422
+ case 'dispose': {
7423
+ if (session) {
7424
+ await session.release();
7425
+ session = null;
7426
+ }
7427
+ ort = null;
7428
+ self.postMessage({ type: 'disposed' });
7429
+ break;
7430
+ }
7431
+
7432
+ default:
7433
+ self.postMessage({
7434
+ type: 'error',
7435
+ error: 'Unknown message type: ' + msg.type,
7436
+ });
7437
+ }
7438
+ } catch (err) {
7439
+ let errorMessage;
7440
+ if (typeof err === 'number') {
7441
+ // ORT WASM throws raw C++ exception pointers as bare numbers
7442
+ errorMessage = 'ORT WASM C++ exception pointer (0x' + err.toString(16) + ') \u2014 likely OOM';
7443
+ } else {
7444
+ errorMessage = err.message || String(err);
7445
+ }
7446
+ self.postMessage({
7447
+ type: 'error',
7448
+ error: errorMessage,
7449
+ });
7450
+ }
7451
+ };
7452
+
7453
+ // Error handler
7454
+ self.onerror = function(err) {
7455
+ self.postMessage({
7456
+ type: 'error',
7457
+ error: 'Worker error: ' + (err.message || String(err)),
7458
+ });
7459
+ };
7460
+ `;
7461
+ var Wav2ArkitCpuWorker = class {
7462
+ constructor(config) {
7463
+ this.modelId = "wav2arkit_cpu";
7464
+ this.worker = null;
7465
+ this.isLoading = false;
7466
+ this._isLoaded = false;
7467
+ // Inference queue for serialization
7468
+ this.inferenceQueue = Promise.resolve();
7469
+ // Session health: set to true if worker inference times out.
7470
+ // A timed-out worker may have a zombie WASM dispatch still running,
7471
+ // so all future infer() calls reject immediately to prevent concurrent access.
7472
+ this.poisoned = false;
7473
+ // Pending message handlers
7474
+ this.pendingResolvers = /* @__PURE__ */ new Map();
7475
+ this.config = config;
7476
+ }
7477
+ get isLoaded() {
7478
+ return this._isLoaded;
7479
+ }
7480
+ /**
7481
+ * Backend type (always 'wasm' for Worker, WebGPU not supported in Workers)
7482
+ */
7483
+ get backend() {
7484
+ return this._isLoaded ? "wasm" : null;
7485
+ }
7486
+ /**
7487
+ * Create the worker from inline script
7488
+ */
7489
+ createWorker() {
7490
+ const blob = new Blob([WORKER_SCRIPT3], { type: "application/javascript" });
7491
+ const blobUrl = URL.createObjectURL(blob);
7492
+ const worker = new Worker(blobUrl);
7493
+ URL.revokeObjectURL(blobUrl);
7494
+ worker.onmessage = (event) => {
7495
+ this.handleWorkerMessage(event.data);
7496
+ };
7497
+ worker.onerror = (error) => {
7498
+ logger9.error("Worker error", { error: error.message });
7499
+ for (const [, resolver] of this.pendingResolvers) {
7500
+ resolver.reject(new Error(`Worker error: ${error.message}`));
7501
+ }
7502
+ this.pendingResolvers.clear();
7503
+ };
7504
+ return worker;
7505
+ }
7506
+ /**
7507
+ * Handle messages from worker
7508
+ */
7509
+ handleWorkerMessage(result) {
7510
+ const resolver = this.pendingResolvers.get(result.type);
7511
+ if (resolver) {
7512
+ this.pendingResolvers.delete(result.type);
7513
+ if (result.type === "error") {
7514
+ resolver.reject(new Error(result.error));
7515
+ } else {
7516
+ resolver.resolve(result);
7517
+ }
7518
+ }
7519
+ }
7520
+ /**
7521
+ * Send message to worker and wait for response
7522
+ */
7523
+ sendMessage(message, expectedType, timeoutMs) {
7524
+ return new Promise((resolve, reject) => {
7525
+ if (!this.worker) {
7526
+ reject(new Error("Worker not initialized"));
7527
+ return;
7528
+ }
7529
+ const timeoutId = setTimeout(() => {
7530
+ this.pendingResolvers.delete(expectedType);
7531
+ reject(new Error(`Worker operation timed out after ${timeoutMs}ms`));
7532
+ }, timeoutMs);
7533
+ this.pendingResolvers.set(expectedType, {
7534
+ resolve: (value) => {
7535
+ clearTimeout(timeoutId);
7536
+ resolve(value);
7537
+ },
7538
+ reject: (error) => {
7539
+ clearTimeout(timeoutId);
7540
+ reject(error);
7541
+ }
7542
+ });
7543
+ this.pendingResolvers.set("error", {
7544
+ resolve: () => {
7545
+ },
7546
+ // Never called for errors
7547
+ reject: (error) => {
7548
+ clearTimeout(timeoutId);
7549
+ this.pendingResolvers.delete(expectedType);
7550
+ reject(error);
7551
+ }
7552
+ });
7553
+ this.worker.postMessage(message);
7554
+ });
7555
+ }
7556
+ /**
7557
+ * Load the ONNX model in the worker
7558
+ */
7559
+ async load() {
7560
+ if (this.isLoading) {
7561
+ throw new Error("Model is already loading");
7562
+ }
7563
+ if (this._isLoaded) {
7564
+ throw new Error("Model already loaded. Call dispose() first.");
7565
+ }
7566
+ this.isLoading = true;
7567
+ const startTime = performance.now();
7568
+ const telemetry = getTelemetry();
7569
+ const span = telemetry?.startSpan("Wav2ArkitCpuWorker.load", {
7570
+ "model.url": this.config.modelUrl,
7571
+ "model.backend_requested": "wasm"
7572
+ });
7573
+ try {
7574
+ logger9.info("Creating wav2arkit_cpu worker...");
7575
+ this.worker = this.createWorker();
7576
+ const externalDataUrl = this.config.externalDataUrl !== false ? this.config.externalDataUrl || `${this.config.modelUrl}.data` : null;
7577
+ logger9.info("Loading model in worker...", {
7578
+ modelUrl: this.config.modelUrl,
7579
+ externalDataUrl,
7580
+ isIOS: isIOS()
7581
+ });
7582
+ const result = await this.sendMessage(
7583
+ {
7584
+ type: "load",
7585
+ modelUrl: resolveUrl3(this.config.modelUrl),
7586
+ externalDataUrl: externalDataUrl ? resolveUrl3(externalDataUrl) : null,
7587
+ wasmPaths: WASM_CDN_PATH4,
7588
+ isIOS: isIOS()
7589
+ },
7590
+ "loaded",
7591
+ LOAD_TIMEOUT_MS2
7592
+ );
7593
+ this._isLoaded = true;
7594
+ const loadTimeMs = performance.now() - startTime;
7595
+ logger9.info("Wav2ArkitCpu worker loaded successfully", {
7596
+ backend: "wasm",
7597
+ loadTimeMs: Math.round(loadTimeMs),
7598
+ workerLoadTimeMs: Math.round(result.loadTimeMs),
7599
+ inputs: result.inputNames,
7600
+ outputs: result.outputNames
7601
+ });
7602
+ span?.setAttributes({
7603
+ "model.backend": "wasm",
7604
+ "model.load_time_ms": loadTimeMs,
7605
+ "model.worker_load_time_ms": result.loadTimeMs
7606
+ });
7607
+ span?.end();
7608
+ telemetry?.recordHistogram("omote.model.load_time", loadTimeMs, {
7609
+ model: "wav2arkit_cpu-worker",
7610
+ backend: "wasm"
7611
+ });
7612
+ return {
7613
+ backend: "wasm",
7614
+ loadTimeMs,
7615
+ inputNames: result.inputNames,
7616
+ outputNames: result.outputNames
7617
+ };
7618
+ } catch (error) {
7619
+ span?.endWithError(error instanceof Error ? error : new Error(String(error)));
7620
+ telemetry?.incrementCounter("omote.errors.total", 1, {
7621
+ model: "wav2arkit_cpu-worker",
7622
+ error_type: "load_failed"
7623
+ });
7624
+ if (this.worker) {
7625
+ this.worker.terminate();
7626
+ this.worker = null;
7627
+ }
7628
+ throw error;
7629
+ } finally {
7630
+ this.isLoading = false;
7631
+ }
7632
+ }
7633
+ /**
7634
+ * Run inference on raw audio
7635
+ *
7636
+ * Accepts variable-length audio (not fixed to 16000 samples).
7637
+ * Output frames = ceil(30 * numSamples / 16000).
7638
+ *
7639
+ * @param audioSamples - Float32Array of raw audio at 16kHz
7640
+ * @param _identityIndex - Ignored (identity 11 is baked into the model)
7641
+ */
7642
+ async infer(audioSamples, _identityIndex) {
7643
+ if (!this._isLoaded || !this.worker) {
7644
+ throw new Error("Model not loaded. Call load() first.");
7645
+ }
7646
+ if (this.poisoned) {
7647
+ throw new Error("Wav2ArkitCpu worker session timed out \u2014 inference unavailable until page reload");
7648
+ }
7649
+ const audioCopy = new Float32Array(audioSamples);
7650
+ return this.queueInference(audioCopy);
7651
+ }
7652
+ /**
7653
+ * Queue inference to serialize worker calls
7654
+ */
7655
+ queueInference(audioSamples) {
7656
+ return new Promise((resolve, reject) => {
7657
+ this.inferenceQueue = this.inferenceQueue.then(async () => {
7658
+ const telemetry = getTelemetry();
7659
+ const span = telemetry?.startSpan("Wav2ArkitCpuWorker.infer", {
7660
+ "inference.backend": "wasm",
7661
+ "inference.input_samples": audioSamples.length
7662
+ });
7663
+ try {
7664
+ const startTime = performance.now();
7665
+ const result = await this.sendMessage(
7666
+ {
7667
+ type: "infer",
7668
+ audio: audioSamples
7669
+ },
7670
+ "result",
7671
+ INFERENCE_TIMEOUT_MS2
7672
+ );
7673
+ const inferenceTimeMs = performance.now() - startTime;
7674
+ const flatBuffer = result.blendshapes;
7675
+ const { numFrames, numBlendshapes } = result;
4843
7676
  const blendshapes = [];
4844
7677
  for (let f = 0; f < numFrames; f++) {
4845
- const rawFrame = blendshapeData.slice(f * numBlendshapes, (f + 1) * numBlendshapes);
4846
- const symmetrized = symmetrizeBlendshapes(rawFrame);
4847
- blendshapes.push(symmetrized);
7678
+ blendshapes.push(flatBuffer.slice(f * numBlendshapes, (f + 1) * numBlendshapes));
4848
7679
  }
4849
- logger5.trace("Inference completed", {
7680
+ logger9.trace("Worker inference completed", {
4850
7681
  inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
7682
+ workerTimeMs: Math.round(result.inferenceTimeMs * 100) / 100,
4851
7683
  numFrames,
4852
- inputSamples
7684
+ inputSamples: audioSamples.length
4853
7685
  });
4854
7686
  span?.setAttributes({
4855
7687
  "inference.duration_ms": inferenceTimeMs,
7688
+ "inference.worker_duration_ms": result.inferenceTimeMs,
4856
7689
  "inference.frames": numFrames
4857
7690
  });
4858
7691
  span?.end();
4859
7692
  telemetry?.recordHistogram("omote.inference.latency", inferenceTimeMs, {
4860
- model: "wav2arkit_cpu",
4861
- backend: this._backend
7693
+ model: "wav2arkit_cpu-worker",
7694
+ backend: "wasm"
4862
7695
  });
4863
7696
  telemetry?.incrementCounter("omote.inference.total", 1, {
4864
- model: "wav2arkit_cpu",
4865
- backend: this._backend,
7697
+ model: "wav2arkit_cpu-worker",
7698
+ backend: "wasm",
4866
7699
  status: "success"
4867
7700
  });
4868
7701
  resolve({
@@ -4874,33 +7707,17 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
4874
7707
  const errMsg = err instanceof Error ? err.message : String(err);
4875
7708
  if (errMsg.includes("timed out")) {
4876
7709
  this.poisoned = true;
4877
- logger5.error("CRITICAL: Inference session timed out \u2014 Wav2ArkitCpu is dead. Page reload required.", {
4878
- backend: this._backend,
4879
- timeoutMs: _Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS
4880
- });
4881
- } else if (typeof err === "number") {
4882
- const oomError = new Error(
4883
- `Wav2ArkitCpu inference failed with raw C++ exception pointer (0x${err.toString(16)}). This is likely an OOM crash in WASM. Try reloading the page.`
4884
- );
4885
- logger5.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
4886
- pointer: `0x${err.toString(16)}`,
4887
- backend: this._backend
4888
- });
4889
- span?.endWithError(oomError);
4890
- telemetry?.incrementCounter("omote.inference.total", 1, {
4891
- model: "wav2arkit_cpu",
4892
- backend: this._backend,
4893
- status: "error"
7710
+ logger9.error("CRITICAL: Worker inference timed out \u2014 Wav2ArkitCpu worker is dead. Page reload required.", {
7711
+ backend: "wasm",
7712
+ timeoutMs: INFERENCE_TIMEOUT_MS2
4894
7713
  });
4895
- reject(oomError);
4896
- return;
4897
7714
  } else {
4898
- logger5.error("Inference failed", { error: errMsg, backend: this._backend });
7715
+ logger9.error("Worker inference failed", { error: errMsg, backend: "wasm" });
4899
7716
  }
4900
7717
  span?.endWithError(err instanceof Error ? err : new Error(String(err)));
4901
7718
  telemetry?.incrementCounter("omote.inference.total", 1, {
4902
- model: "wav2arkit_cpu",
4903
- backend: this._backend,
7719
+ model: "wav2arkit_cpu-worker",
7720
+ backend: "wasm",
4904
7721
  status: "error"
4905
7722
  });
4906
7723
  reject(err);
@@ -4909,39 +7726,62 @@ var _Wav2ArkitCpuInference = class _Wav2ArkitCpuInference {
4909
7726
  });
4910
7727
  }
4911
7728
  /**
4912
- * Dispose of the model and free resources
7729
+ * Dispose of the worker and free resources
4913
7730
  */
4914
7731
  async dispose() {
4915
- if (this.session) {
4916
- await this.session.release();
4917
- this.session = null;
7732
+ if (this.worker) {
7733
+ try {
7734
+ await this.sendMessage({ type: "dispose" }, "disposed", INFERENCE_TIMEOUT_MS2);
7735
+ } catch {
7736
+ }
7737
+ this.worker.terminate();
7738
+ this.worker = null;
4918
7739
  }
7740
+ this._isLoaded = false;
7741
+ this.poisoned = false;
7742
+ this.pendingResolvers.clear();
7743
+ }
7744
+ /**
7745
+ * Check if Web Workers are supported
7746
+ */
7747
+ static isSupported() {
7748
+ return typeof Worker !== "undefined";
4919
7749
  }
4920
7750
  };
4921
- _Wav2ArkitCpuInference.INFERENCE_TIMEOUT_MS = 5e3;
4922
- var Wav2ArkitCpuInference = _Wav2ArkitCpuInference;
4923
7751
 
4924
7752
  // src/inference/createLipSync.ts
4925
- var logger6 = createLogger("createLipSync");
7753
+ var logger10 = createLogger("createLipSync");
4926
7754
  function createLipSync(config) {
4927
7755
  const mode = config.mode ?? "auto";
4928
7756
  const fallbackOnError = config.fallbackOnError ?? true;
4929
7757
  let useCpu;
4930
7758
  if (mode === "cpu") {
4931
7759
  useCpu = true;
4932
- logger6.info("Forcing CPU lip sync model (wav2arkit_cpu)");
7760
+ logger10.info("Forcing CPU lip sync model (wav2arkit_cpu)");
4933
7761
  } else if (mode === "gpu") {
4934
7762
  useCpu = false;
4935
- logger6.info("Forcing GPU lip sync model (Wav2Vec2)");
7763
+ logger10.info("Forcing GPU lip sync model (Wav2Vec2)");
4936
7764
  } else {
4937
7765
  useCpu = shouldUseCpuLipSync();
4938
- logger6.info("Auto-detected lip sync model", {
7766
+ logger10.info("Auto-detected lip sync model", {
4939
7767
  useCpu,
4940
7768
  isSafari: isSafari()
4941
7769
  });
4942
7770
  }
4943
7771
  if (useCpu) {
4944
- logger6.info("Creating Wav2ArkitCpuInference (404MB, WASM)");
7772
+ if (config.unifiedWorker) {
7773
+ logger10.info("Creating Wav2ArkitCpuUnifiedAdapter (404MB, WASM, shared unified worker)");
7774
+ return new Wav2ArkitCpuUnifiedAdapter(config.unifiedWorker, {
7775
+ modelUrl: config.cpuModelUrl
7776
+ });
7777
+ }
7778
+ if (config.useWorker && Wav2ArkitCpuWorker.isSupported() && !isIOS()) {
7779
+ logger10.info("Creating Wav2ArkitCpuWorker (404MB, WASM, off-main-thread)");
7780
+ return new Wav2ArkitCpuWorker({
7781
+ modelUrl: config.cpuModelUrl
7782
+ });
7783
+ }
7784
+ logger10.info("Creating Wav2ArkitCpuInference (404MB, WASM)");
4945
7785
  return new Wav2ArkitCpuInference({
4946
7786
  modelUrl: config.cpuModelUrl
4947
7787
  });
@@ -4953,10 +7793,10 @@ function createLipSync(config) {
4953
7793
  numIdentityClasses: config.numIdentityClasses
4954
7794
  });
4955
7795
  if (fallbackOnError) {
4956
- logger6.info("Creating Wav2Vec2Inference with CPU fallback");
7796
+ logger10.info("Creating Wav2Vec2Inference with CPU fallback");
4957
7797
  return new LipSyncWithFallback(gpuInstance, config);
4958
7798
  }
4959
- logger6.info("Creating Wav2Vec2Inference (no fallback)");
7799
+ logger10.info("Creating Wav2Vec2Inference (no fallback)");
4960
7800
  return gpuInstance;
4961
7801
  }
4962
7802
  var LipSyncWithFallback = class {
@@ -4982,16 +7822,28 @@ var LipSyncWithFallback = class {
4982
7822
  }
4983
7823
  }
4984
7824
  async fallbackToCpu(reason) {
4985
- logger6.warn("GPU model load failed, falling back to CPU model", { reason });
7825
+ logger10.warn("GPU model load failed, falling back to CPU model", { reason });
4986
7826
  try {
4987
7827
  await this.implementation.dispose();
4988
7828
  } catch {
4989
7829
  }
4990
- this.implementation = new Wav2ArkitCpuInference({
4991
- modelUrl: this.config.cpuModelUrl
4992
- });
7830
+ if (this.config.unifiedWorker) {
7831
+ this.implementation = new Wav2ArkitCpuUnifiedAdapter(this.config.unifiedWorker, {
7832
+ modelUrl: this.config.cpuModelUrl
7833
+ });
7834
+ logger10.info("Fallback to Wav2ArkitCpuUnifiedAdapter successful");
7835
+ } else if (this.config.useWorker && Wav2ArkitCpuWorker.isSupported() && !isIOS()) {
7836
+ this.implementation = new Wav2ArkitCpuWorker({
7837
+ modelUrl: this.config.cpuModelUrl
7838
+ });
7839
+ logger10.info("Fallback to Wav2ArkitCpuWorker successful");
7840
+ } else {
7841
+ this.implementation = new Wav2ArkitCpuInference({
7842
+ modelUrl: this.config.cpuModelUrl
7843
+ });
7844
+ logger10.info("Fallback to Wav2ArkitCpuInference successful");
7845
+ }
4993
7846
  this.hasFallenBack = true;
4994
- logger6.info("Fallback to Wav2ArkitCpuInference successful");
4995
7847
  return await this.implementation.load();
4996
7848
  }
4997
7849
  async infer(audioSamples, identityIndex) {
@@ -5003,7 +7855,7 @@ var LipSyncWithFallback = class {
5003
7855
  };
5004
7856
 
5005
7857
  // src/inference/SileroVADInference.ts
5006
- var logger7 = createLogger("SileroVAD");
7858
+ var logger11 = createLogger("SileroVAD");
5007
7859
  var SileroVADInference = class {
5008
7860
  constructor(config) {
5009
7861
  this.session = null;
@@ -5077,23 +7929,23 @@ var SileroVADInference = class {
5077
7929
  "model.sample_rate": this.config.sampleRate
5078
7930
  });
5079
7931
  try {
5080
- logger7.info("Loading ONNX Runtime...", { preference: this.config.backend });
7932
+ logger11.info("Loading ONNX Runtime...", { preference: this.config.backend });
5081
7933
  const { ort, backend } = await getOnnxRuntimeForPreference(this.config.backend);
5082
7934
  this.ort = ort;
5083
7935
  this._backend = backend;
5084
- logger7.info("ONNX Runtime loaded", { backend: this._backend });
7936
+ logger11.info("ONNX Runtime loaded", { backend: this._backend });
5085
7937
  const cache = getModelCache();
5086
7938
  const modelUrl = this.config.modelUrl;
5087
7939
  const isCached = await cache.has(modelUrl);
5088
7940
  let modelBuffer;
5089
7941
  if (isCached) {
5090
- logger7.debug("Loading model from cache", { modelUrl });
7942
+ logger11.debug("Loading model from cache", { modelUrl });
5091
7943
  modelBuffer = await cache.get(modelUrl);
5092
7944
  } else {
5093
- logger7.debug("Fetching and caching model", { modelUrl });
7945
+ logger11.debug("Fetching and caching model", { modelUrl });
5094
7946
  modelBuffer = await fetchWithCache(modelUrl);
5095
7947
  }
5096
- logger7.debug("Creating ONNX session", {
7948
+ logger11.debug("Creating ONNX session", {
5097
7949
  size: formatBytes(modelBuffer.byteLength),
5098
7950
  backend: this._backend
5099
7951
  });
@@ -5102,7 +7954,7 @@ var SileroVADInference = class {
5102
7954
  this.session = await ort.InferenceSession.create(modelData, sessionOptions);
5103
7955
  this.reset();
5104
7956
  const loadTimeMs = performance.now() - startTime;
5105
- logger7.info("Model loaded successfully", {
7957
+ logger11.info("Model loaded successfully", {
5106
7958
  backend: this._backend,
5107
7959
  loadTimeMs: Math.round(loadTimeMs),
5108
7960
  sampleRate: this.config.sampleRate,
@@ -5157,7 +8009,7 @@ var SileroVADInference = class {
5157
8009
  []
5158
8010
  );
5159
8011
  } catch (e) {
5160
- logger7.warn("BigInt64Array not available, using bigint array fallback", {
8012
+ logger11.warn("BigInt64Array not available, using bigint array fallback", {
5161
8013
  error: e instanceof Error ? e.message : String(e)
5162
8014
  });
5163
8015
  this.srTensor = new this.ort.Tensor(
@@ -5263,7 +8115,7 @@ var SileroVADInference = class {
5263
8115
  this.preSpeechBuffer.shift();
5264
8116
  }
5265
8117
  }
5266
- logger7.trace("Skipping VAD inference - audio too quiet", {
8118
+ logger11.trace("Skipping VAD inference - audio too quiet", {
5267
8119
  rms: Math.round(rms * 1e4) / 1e4,
5268
8120
  threshold: MIN_ENERGY_THRESHOLD
5269
8121
  });
@@ -5317,7 +8169,7 @@ var SileroVADInference = class {
5317
8169
  if (isSpeech && !this.wasSpeaking) {
5318
8170
  preSpeechChunks = [...this.preSpeechBuffer];
5319
8171
  this.preSpeechBuffer = [];
5320
- logger7.debug("Speech started with pre-speech buffer", {
8172
+ logger11.debug("Speech started with pre-speech buffer", {
5321
8173
  preSpeechChunks: preSpeechChunks.length,
5322
8174
  durationMs: Math.round(preSpeechChunks.length * this.getChunkDurationMs())
5323
8175
  });
@@ -5330,7 +8182,7 @@ var SileroVADInference = class {
5330
8182
  this.preSpeechBuffer = [];
5331
8183
  }
5332
8184
  this.wasSpeaking = isSpeech;
5333
- logger7.trace("VAD inference completed", {
8185
+ logger11.trace("VAD inference completed", {
5334
8186
  probability: Math.round(probability * 1e3) / 1e3,
5335
8187
  isSpeech,
5336
8188
  inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100
@@ -5361,7 +8213,7 @@ var SileroVADInference = class {
5361
8213
  const oomError = new Error(
5362
8214
  `SileroVAD inference failed with raw C++ exception pointer (0x${err.toString(16)}). This is likely an OOM crash in WASM. Try reducing concurrent model sessions or reloading the page.`
5363
8215
  );
5364
- logger7.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
8216
+ logger11.error("ORT WASM OOM \u2014 raw C++ exception pointer", {
5365
8217
  pointer: `0x${err.toString(16)}`,
5366
8218
  backend: this._backend
5367
8219
  });
@@ -5404,19 +8256,27 @@ var SileroVADInference = class {
5404
8256
  SileroVADInference.isWebGPUAvailable = isWebGPUAvailable;
5405
8257
 
5406
8258
  // src/inference/SileroVADWorker.ts
5407
- var logger8 = createLogger("SileroVADWorker");
5408
- var WASM_CDN_PATH2 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
5409
- var LOAD_TIMEOUT_MS = 1e4;
5410
- var INFERENCE_TIMEOUT_MS = 1e3;
5411
- var WORKER_SCRIPT = `
8259
+ var logger12 = createLogger("SileroVADWorker");
8260
+ var WASM_CDN_PATH5 = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
8261
+ var LOAD_TIMEOUT_MS3 = 1e4;
8262
+ var INFERENCE_TIMEOUT_MS3 = 1e3;
8263
+ function resolveUrl4(url) {
8264
+ if (/^https?:\/\//i.test(url) || /^blob:/i.test(url)) return url;
8265
+ try {
8266
+ return new URL(url, globalThis.location?.origin ?? "https://localhost").href;
8267
+ } catch {
8268
+ return url;
8269
+ }
8270
+ }
8271
+ var WORKER_SCRIPT4 = `
5412
8272
  // Silero VAD Worker Script
5413
8273
  // Loaded via Blob URL - no separate file needed
5414
8274
 
5415
- let ort = null;
5416
- let session = null;
5417
- let sampleRate = 16000;
5418
- let chunkSize = 512;
5419
- let contextSize = 64;
8275
+ var ort = null;
8276
+ var session = null;
8277
+ var sampleRate = 16000;
8278
+ var chunkSize = 512;
8279
+ var contextSize = 64;
5420
8280
 
5421
8281
  /**
5422
8282
  * Load ONNX Runtime from CDN
@@ -5666,7 +8526,7 @@ var SileroVADWorker = class {
5666
8526
  * Create the worker from inline script
5667
8527
  */
5668
8528
  createWorker() {
5669
- const blob = new Blob([WORKER_SCRIPT], { type: "application/javascript" });
8529
+ const blob = new Blob([WORKER_SCRIPT4], { type: "application/javascript" });
5670
8530
  const blobUrl = URL.createObjectURL(blob);
5671
8531
  const worker = new Worker(blobUrl);
5672
8532
  URL.revokeObjectURL(blobUrl);
@@ -5674,7 +8534,7 @@ var SileroVADWorker = class {
5674
8534
  this.handleWorkerMessage(event.data);
5675
8535
  };
5676
8536
  worker.onerror = (error) => {
5677
- logger8.error("Worker error", { error: error.message });
8537
+ logger12.error("Worker error", { error: error.message });
5678
8538
  for (const [, resolver] of this.pendingResolvers) {
5679
8539
  resolver.reject(new Error(`Worker error: ${error.message}`));
5680
8540
  }
@@ -5750,25 +8610,25 @@ var SileroVADWorker = class {
5750
8610
  "model.sample_rate": this.config.sampleRate
5751
8611
  });
5752
8612
  try {
5753
- logger8.info("Creating VAD worker...");
8613
+ logger12.info("Creating VAD worker...");
5754
8614
  this.worker = this.createWorker();
5755
- logger8.info("Loading model in worker...", {
8615
+ logger12.info("Loading model in worker...", {
5756
8616
  modelUrl: this.config.modelUrl,
5757
8617
  sampleRate: this.config.sampleRate
5758
8618
  });
5759
8619
  const result = await this.sendMessage(
5760
8620
  {
5761
8621
  type: "load",
5762
- modelUrl: this.config.modelUrl,
8622
+ modelUrl: resolveUrl4(this.config.modelUrl),
5763
8623
  sampleRate: this.config.sampleRate,
5764
- wasmPaths: WASM_CDN_PATH2
8624
+ wasmPaths: WASM_CDN_PATH5
5765
8625
  },
5766
8626
  "loaded",
5767
- LOAD_TIMEOUT_MS
8627
+ LOAD_TIMEOUT_MS3
5768
8628
  );
5769
8629
  this._isLoaded = true;
5770
8630
  const loadTimeMs = performance.now() - startTime;
5771
- logger8.info("VAD worker loaded successfully", {
8631
+ logger12.info("VAD worker loaded successfully", {
5772
8632
  backend: "wasm",
5773
8633
  loadTimeMs: Math.round(loadTimeMs),
5774
8634
  workerLoadTimeMs: Math.round(result.loadTimeMs),
@@ -5819,7 +8679,7 @@ var SileroVADWorker = class {
5819
8679
  const result = await this.sendMessage(
5820
8680
  { type: "reset" },
5821
8681
  "reset",
5822
- INFERENCE_TIMEOUT_MS
8682
+ INFERENCE_TIMEOUT_MS3
5823
8683
  );
5824
8684
  this.state = result.state;
5825
8685
  this.context = new Float32Array(this.contextSize);
@@ -5865,7 +8725,7 @@ var SileroVADWorker = class {
5865
8725
  context: this.context
5866
8726
  },
5867
8727
  "result",
5868
- INFERENCE_TIMEOUT_MS
8728
+ INFERENCE_TIMEOUT_MS3
5869
8729
  );
5870
8730
  this.state = result.state;
5871
8731
  this.context = audioChunkCopy.slice(-this.contextSize);
@@ -5875,7 +8735,7 @@ var SileroVADWorker = class {
5875
8735
  if (isSpeech && !this.wasSpeaking) {
5876
8736
  preSpeechChunks = [...this.preSpeechBuffer];
5877
8737
  this.preSpeechBuffer = [];
5878
- logger8.debug("Speech started with pre-speech buffer", {
8738
+ logger12.debug("Speech started with pre-speech buffer", {
5879
8739
  preSpeechChunks: preSpeechChunks.length,
5880
8740
  durationMs: Math.round(preSpeechChunks.length * this.getChunkDurationMs())
5881
8741
  });
@@ -5888,7 +8748,7 @@ var SileroVADWorker = class {
5888
8748
  this.preSpeechBuffer = [];
5889
8749
  }
5890
8750
  this.wasSpeaking = isSpeech;
5891
- logger8.trace("VAD worker inference completed", {
8751
+ logger12.trace("VAD worker inference completed", {
5892
8752
  probability: Math.round(result.probability * 1e3) / 1e3,
5893
8753
  isSpeech,
5894
8754
  inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
@@ -5934,7 +8794,7 @@ var SileroVADWorker = class {
5934
8794
  async dispose() {
5935
8795
  if (this.worker) {
5936
8796
  try {
5937
- await this.sendMessage({ type: "dispose" }, "disposed", INFERENCE_TIMEOUT_MS);
8797
+ await this.sendMessage({ type: "dispose" }, "disposed", INFERENCE_TIMEOUT_MS3);
5938
8798
  } catch {
5939
8799
  }
5940
8800
  this.worker.terminate();
@@ -5956,40 +8816,44 @@ var SileroVADWorker = class {
5956
8816
  };
5957
8817
 
5958
8818
  // src/inference/createSileroVAD.ts
5959
- var logger9 = createLogger("createSileroVAD");
8819
+ var logger13 = createLogger("createSileroVAD");
5960
8820
  function supportsVADWorker() {
5961
8821
  if (typeof Worker === "undefined") {
5962
- logger9.debug("Worker not supported: Worker constructor undefined");
8822
+ logger13.debug("Worker not supported: Worker constructor undefined");
5963
8823
  return false;
5964
8824
  }
5965
8825
  if (typeof URL === "undefined" || typeof URL.createObjectURL === "undefined") {
5966
- logger9.debug("Worker not supported: URL.createObjectURL unavailable");
8826
+ logger13.debug("Worker not supported: URL.createObjectURL unavailable");
5967
8827
  return false;
5968
8828
  }
5969
8829
  if (typeof Blob === "undefined") {
5970
- logger9.debug("Worker not supported: Blob constructor unavailable");
8830
+ logger13.debug("Worker not supported: Blob constructor unavailable");
5971
8831
  return false;
5972
8832
  }
5973
8833
  return true;
5974
8834
  }
5975
8835
  function createSileroVAD(config) {
8836
+ if (config.unifiedWorker) {
8837
+ logger13.info("Creating SileroVADUnifiedAdapter (shared unified worker)");
8838
+ return new SileroVADUnifiedAdapter(config.unifiedWorker, config);
8839
+ }
5976
8840
  const fallbackOnError = config.fallbackOnError ?? true;
5977
8841
  let useWorker;
5978
8842
  if (config.useWorker !== void 0) {
5979
8843
  useWorker = config.useWorker;
5980
- logger9.debug("Worker preference explicitly set", { useWorker });
8844
+ logger13.debug("Worker preference explicitly set", { useWorker });
5981
8845
  } else {
5982
8846
  const workerSupported = supportsVADWorker();
5983
8847
  const onMobile = isMobile();
5984
8848
  useWorker = workerSupported && !onMobile;
5985
- logger9.debug("Auto-detected Worker preference", {
8849
+ logger13.debug("Auto-detected Worker preference", {
5986
8850
  useWorker,
5987
8851
  workerSupported,
5988
8852
  onMobile
5989
8853
  });
5990
8854
  }
5991
8855
  if (useWorker) {
5992
- logger9.info("Creating SileroVADWorker (off-main-thread)");
8856
+ logger13.info("Creating SileroVADWorker (off-main-thread)");
5993
8857
  const worker = new SileroVADWorker({
5994
8858
  modelUrl: config.modelUrl,
5995
8859
  sampleRate: config.sampleRate,
@@ -6001,7 +8865,7 @@ function createSileroVAD(config) {
6001
8865
  }
6002
8866
  return worker;
6003
8867
  }
6004
- logger9.info("Creating SileroVADInference (main thread)");
8868
+ logger13.info("Creating SileroVADInference (main thread)");
6005
8869
  return new SileroVADInference(config);
6006
8870
  }
6007
8871
  var VADWorkerWithFallback = class {
@@ -6027,7 +8891,7 @@ var VADWorkerWithFallback = class {
6027
8891
  try {
6028
8892
  return await this.implementation.load();
6029
8893
  } catch (error) {
6030
- logger9.warn("Worker load failed, falling back to main thread", {
8894
+ logger13.warn("Worker load failed, falling back to main thread", {
6031
8895
  error: error instanceof Error ? error.message : String(error)
6032
8896
  });
6033
8897
  try {
@@ -6036,7 +8900,7 @@ var VADWorkerWithFallback = class {
6036
8900
  }
6037
8901
  this.implementation = new SileroVADInference(this.config);
6038
8902
  this.hasFallenBack = true;
6039
- logger9.info("Fallback to SileroVADInference successful");
8903
+ logger13.info("Fallback to SileroVADInference successful");
6040
8904
  return await this.implementation.load();
6041
8905
  }
6042
8906
  }
@@ -6058,7 +8922,7 @@ var VADWorkerWithFallback = class {
6058
8922
  };
6059
8923
 
6060
8924
  // src/inference/SafariSpeechRecognition.ts
6061
- var logger10 = createLogger("SafariSpeech");
8925
+ var logger14 = createLogger("SafariSpeech");
6062
8926
  var SafariSpeechRecognition = class _SafariSpeechRecognition {
6063
8927
  constructor(config = {}) {
6064
8928
  this.recognition = null;
@@ -6077,7 +8941,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
6077
8941
  interimResults: config.interimResults ?? true,
6078
8942
  maxAlternatives: config.maxAlternatives ?? 1
6079
8943
  };
6080
- logger10.debug("SafariSpeechRecognition created", {
8944
+ logger14.debug("SafariSpeechRecognition created", {
6081
8945
  language: this.config.language,
6082
8946
  continuous: this.config.continuous
6083
8947
  });
@@ -6138,7 +9002,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
6138
9002
  */
6139
9003
  async start() {
6140
9004
  if (this.isListening) {
6141
- logger10.warn("Already listening");
9005
+ logger14.warn("Already listening");
6142
9006
  return;
6143
9007
  }
6144
9008
  if (!_SafariSpeechRecognition.isAvailable()) {
@@ -6168,7 +9032,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
6168
9032
  this.isListening = true;
6169
9033
  this.startTime = performance.now();
6170
9034
  this.accumulatedText = "";
6171
- logger10.info("Speech recognition started", {
9035
+ logger14.info("Speech recognition started", {
6172
9036
  language: this.config.language
6173
9037
  });
6174
9038
  span?.end();
@@ -6183,7 +9047,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
6183
9047
  */
6184
9048
  async stop() {
6185
9049
  if (!this.isListening || !this.recognition) {
6186
- logger10.warn("Not currently listening");
9050
+ logger14.warn("Not currently listening");
6187
9051
  return {
6188
9052
  text: this.accumulatedText,
6189
9053
  language: this.config.language,
@@ -6212,7 +9076,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
6212
9076
  if (this.recognition && this.isListening) {
6213
9077
  this.recognition.abort();
6214
9078
  this.isListening = false;
6215
- logger10.info("Speech recognition aborted");
9079
+ logger14.info("Speech recognition aborted");
6216
9080
  }
6217
9081
  }
6218
9082
  /**
@@ -6243,7 +9107,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
6243
9107
  this.isListening = false;
6244
9108
  this.resultCallbacks = [];
6245
9109
  this.errorCallbacks = [];
6246
- logger10.debug("SafariSpeechRecognition disposed");
9110
+ logger14.debug("SafariSpeechRecognition disposed");
6247
9111
  }
6248
9112
  /**
6249
9113
  * Set up event handlers for the recognition instance
@@ -6271,7 +9135,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
6271
9135
  confidence: alternative.confidence
6272
9136
  };
6273
9137
  this.emitResult(speechResult);
6274
- logger10.trace("Speech result", {
9138
+ logger14.trace("Speech result", {
6275
9139
  text: text.substring(0, 50),
6276
9140
  isFinal,
6277
9141
  confidence: alternative.confidence
@@ -6281,12 +9145,12 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
6281
9145
  span?.end();
6282
9146
  } catch (error) {
6283
9147
  span?.endWithError(error instanceof Error ? error : new Error(String(error)));
6284
- logger10.error("Error processing speech result", { error });
9148
+ logger14.error("Error processing speech result", { error });
6285
9149
  }
6286
9150
  };
6287
9151
  this.recognition.onerror = (event) => {
6288
9152
  const error = new Error(`Speech recognition error: ${event.error} - ${event.message}`);
6289
- logger10.error("Speech recognition error", { error: event.error, message: event.message });
9153
+ logger14.error("Speech recognition error", { error: event.error, message: event.message });
6290
9154
  this.emitError(error);
6291
9155
  if (this.stopRejecter) {
6292
9156
  this.stopRejecter(error);
@@ -6296,7 +9160,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
6296
9160
  };
6297
9161
  this.recognition.onend = () => {
6298
9162
  this.isListening = false;
6299
- logger10.info("Speech recognition ended", {
9163
+ logger14.info("Speech recognition ended", {
6300
9164
  totalText: this.accumulatedText.length,
6301
9165
  durationMs: performance.now() - this.startTime
6302
9166
  });
@@ -6313,13 +9177,13 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
6313
9177
  }
6314
9178
  };
6315
9179
  this.recognition.onstart = () => {
6316
- logger10.debug("Speech recognition started by browser");
9180
+ logger14.debug("Speech recognition started by browser");
6317
9181
  };
6318
9182
  this.recognition.onspeechstart = () => {
6319
- logger10.debug("Speech detected");
9183
+ logger14.debug("Speech detected");
6320
9184
  };
6321
9185
  this.recognition.onspeechend = () => {
6322
- logger10.debug("Speech ended");
9186
+ logger14.debug("Speech ended");
6323
9187
  };
6324
9188
  }
6325
9189
  /**
@@ -6330,7 +9194,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
6330
9194
  try {
6331
9195
  callback(result);
6332
9196
  } catch (error) {
6333
- logger10.error("Error in result callback", { error });
9197
+ logger14.error("Error in result callback", { error });
6334
9198
  }
6335
9199
  }
6336
9200
  }
@@ -6342,7 +9206,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
6342
9206
  try {
6343
9207
  callback(error);
6344
9208
  } catch (callbackError) {
6345
- logger10.error("Error in error callback", { error: callbackError });
9209
+ logger14.error("Error in error callback", { error: callbackError });
6346
9210
  }
6347
9211
  }
6348
9212
  }