@omote/core 0.3.25 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -12,11 +12,6 @@ import {
12
12
  setLogLevel,
13
13
  setLoggingEnabled
14
14
  } from "./chunk-ESU52TDS.mjs";
15
- import {
16
- __webpack_exports__env,
17
- __webpack_exports__pipeline
18
- } from "./chunk-B6TIE56N.mjs";
19
- import "./chunk-NSSMTXJJ.mjs";
20
15
 
21
16
  // src/audio/MicrophoneCapture.ts
22
17
  var MicrophoneCapture = class {
@@ -2274,6 +2269,14 @@ function getSessionOptions(backend) {
2274
2269
  graphOptimizationLevel: "all"
2275
2270
  };
2276
2271
  }
2272
+ if (isIOS()) {
2273
+ return {
2274
+ executionProviders: ["wasm"],
2275
+ graphOptimizationLevel: "basic",
2276
+ enableCpuMemArena: false,
2277
+ enableMemPattern: false
2278
+ };
2279
+ }
2277
2280
  return {
2278
2281
  executionProviders: ["wasm"],
2279
2282
  graphOptimizationLevel: "all"
@@ -2549,77 +2552,108 @@ var Wav2Vec2Inference = class {
2549
2552
  this.ort = ort;
2550
2553
  this._backend = backend;
2551
2554
  logger2.info("ONNX Runtime loaded", { backend: this._backend });
2552
- const cache = getModelCache();
2553
2555
  const modelUrl = this.config.modelUrl;
2554
- const isCached = await cache.has(modelUrl);
2555
- let modelBuffer;
2556
- if (isCached) {
2557
- logger2.debug("Loading model from cache", { modelUrl });
2558
- modelBuffer = await cache.get(modelUrl);
2559
- if (!modelBuffer) {
2560
- logger2.warn("Cache corruption detected, clearing and retrying", { modelUrl });
2561
- await cache.delete(modelUrl);
2562
- logger2.info("Corrupted cache entry deleted, fetching fresh model", { modelUrl });
2563
- modelBuffer = await fetchWithCache(modelUrl);
2556
+ const dataUrl = this.config.externalDataUrl !== false ? typeof this.config.externalDataUrl === "string" ? this.config.externalDataUrl : `${modelUrl}.data` : null;
2557
+ const sessionOptions = getSessionOptions(this._backend);
2558
+ let isCached = false;
2559
+ if (isIOS()) {
2560
+ logger2.info("iOS: passing model URLs directly to ORT (low-memory path)", {
2561
+ modelUrl,
2562
+ dataUrl
2563
+ });
2564
+ if (dataUrl) {
2565
+ const dataFilename = dataUrl.split("/").pop();
2566
+ logger2.info("iOS: setting externalData", { dataFilename, dataUrl });
2567
+ sessionOptions.externalData = [{
2568
+ path: dataFilename,
2569
+ data: dataUrl
2570
+ // URL string — ORT fetches directly into WASM
2571
+ }];
2564
2572
  }
2565
- } else {
2566
- logger2.debug("Fetching and caching model", { modelUrl });
2567
- modelBuffer = await fetchWithCache(modelUrl);
2568
- }
2569
- if (!modelBuffer) {
2570
- const errorMsg = `Failed to load model: ${modelUrl}. Model buffer is null or undefined even after retry.`;
2571
- logger2.error(errorMsg, { modelUrl, isCached });
2572
- throw new Error(errorMsg);
2573
- }
2574
- let externalDataBuffer = null;
2575
- if (this.config.externalDataUrl !== false) {
2576
- const dataUrl = typeof this.config.externalDataUrl === "string" ? this.config.externalDataUrl : `${modelUrl}.data`;
2573
+ logger2.info("iOS: calling InferenceSession.create() with URL string", {
2574
+ modelUrl,
2575
+ sessionOptions: JSON.stringify(
2576
+ sessionOptions,
2577
+ (_, v) => typeof v === "string" && v.length > 100 ? v.slice(0, 100) + "..." : v
2578
+ )
2579
+ });
2577
2580
  try {
2578
- const isDataCached = await cache.has(dataUrl);
2579
- if (isDataCached) {
2580
- logger2.debug("Loading external data from cache", { dataUrl });
2581
- externalDataBuffer = await cache.get(dataUrl);
2582
- if (!externalDataBuffer) {
2583
- logger2.warn("Cache corruption for external data, retrying", { dataUrl });
2584
- await cache.delete(dataUrl);
2581
+ this.session = await this.ort.InferenceSession.create(modelUrl, sessionOptions);
2582
+ } catch (sessionErr) {
2583
+ logger2.error("iOS: InferenceSession.create() failed", {
2584
+ error: sessionErr instanceof Error ? sessionErr.message : String(sessionErr),
2585
+ errorType: sessionErr?.constructor?.name,
2586
+ stack: sessionErr instanceof Error ? sessionErr.stack : void 0
2587
+ });
2588
+ throw sessionErr;
2589
+ }
2590
+ logger2.info("iOS: session created successfully", {
2591
+ inputNames: this.session.inputNames,
2592
+ outputNames: this.session.outputNames
2593
+ });
2594
+ } else {
2595
+ const cache = getModelCache();
2596
+ isCached = await cache.has(modelUrl);
2597
+ let modelBuffer;
2598
+ if (isCached) {
2599
+ logger2.debug("Loading model from cache", { modelUrl });
2600
+ modelBuffer = await cache.get(modelUrl);
2601
+ if (!modelBuffer) {
2602
+ logger2.warn("Cache corruption detected, clearing and retrying", { modelUrl });
2603
+ await cache.delete(modelUrl);
2604
+ modelBuffer = await fetchWithCache(modelUrl);
2605
+ }
2606
+ } else {
2607
+ logger2.debug("Fetching and caching model", { modelUrl });
2608
+ modelBuffer = await fetchWithCache(modelUrl);
2609
+ }
2610
+ if (!modelBuffer) {
2611
+ throw new Error(`Failed to load model: ${modelUrl}`);
2612
+ }
2613
+ let externalDataBuffer = null;
2614
+ if (dataUrl) {
2615
+ try {
2616
+ const isDataCached = await cache.has(dataUrl);
2617
+ if (isDataCached) {
2618
+ logger2.debug("Loading external data from cache", { dataUrl });
2619
+ externalDataBuffer = await cache.get(dataUrl);
2620
+ if (!externalDataBuffer) {
2621
+ logger2.warn("Cache corruption for external data, retrying", { dataUrl });
2622
+ await cache.delete(dataUrl);
2623
+ externalDataBuffer = await fetchWithCache(dataUrl);
2624
+ }
2625
+ } else {
2626
+ logger2.info("Fetching external model data", {
2627
+ dataUrl,
2628
+ note: "This may be a large download (383MB+)"
2629
+ });
2585
2630
  externalDataBuffer = await fetchWithCache(dataUrl);
2586
2631
  }
2587
- } else {
2588
- logger2.info("Fetching external model data", {
2632
+ logger2.info("External data loaded", {
2633
+ size: formatBytes(externalDataBuffer.byteLength)
2634
+ });
2635
+ } catch (err) {
2636
+ logger2.debug("No external data file found (single-file model)", {
2589
2637
  dataUrl,
2590
- note: "This may be a large download (383MB+)"
2638
+ error: err.message
2591
2639
  });
2592
- externalDataBuffer = await fetchWithCache(dataUrl);
2593
2640
  }
2594
- logger2.info("External data loaded", {
2595
- size: formatBytes(externalDataBuffer.byteLength)
2596
- });
2597
- } catch (err) {
2598
- logger2.debug("No external data file found (single-file model)", {
2599
- dataUrl,
2600
- error: err.message
2601
- });
2602
2641
  }
2642
+ logger2.debug("Creating ONNX session", {
2643
+ graphSize: formatBytes(modelBuffer.byteLength),
2644
+ externalDataSize: externalDataBuffer ? formatBytes(externalDataBuffer.byteLength) : "none",
2645
+ backend: this._backend
2646
+ });
2647
+ if (externalDataBuffer) {
2648
+ const dataFilename = dataUrl.split("/").pop();
2649
+ sessionOptions.externalData = [{
2650
+ path: dataFilename,
2651
+ data: new Uint8Array(externalDataBuffer)
2652
+ }];
2653
+ }
2654
+ const modelData = new Uint8Array(modelBuffer);
2655
+ this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
2603
2656
  }
2604
- logger2.debug("Creating ONNX session", {
2605
- graphSize: formatBytes(modelBuffer.byteLength),
2606
- externalDataSize: externalDataBuffer ? formatBytes(externalDataBuffer.byteLength) : "none",
2607
- backend: this._backend
2608
- });
2609
- const sessionOptions = getSessionOptions(this._backend);
2610
- if (externalDataBuffer) {
2611
- const dataFilename = (typeof this.config.externalDataUrl === "string" ? this.config.externalDataUrl : `${modelUrl}.data`).split("/").pop();
2612
- sessionOptions.externalData = [{
2613
- path: dataFilename,
2614
- data: new Uint8Array(externalDataBuffer)
2615
- }];
2616
- }
2617
- logger2.info("Creating session with execution provider", {
2618
- executionProvider: this._backend,
2619
- hasExternalData: !!externalDataBuffer
2620
- });
2621
- const modelData = new Uint8Array(modelBuffer);
2622
- this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
2623
2657
  logger2.info("ONNX session created successfully", {
2624
2658
  executionProvider: this._backend,
2625
2659
  backend: this._backend
@@ -2634,7 +2668,7 @@ var Wav2Vec2Inference = class {
2634
2668
  span?.setAttributes({
2635
2669
  "model.backend": this._backend,
2636
2670
  "model.load_time_ms": loadTimeMs,
2637
- "model.cached": isCached
2671
+ "model.cached": !isIOS() && isCached
2638
2672
  });
2639
2673
  span?.end();
2640
2674
  telemetry?.recordHistogram("omote.model.load_time", loadTimeMs, {
@@ -2837,319 +2871,550 @@ LAM_BLENDSHAPES.forEach((name, index) => {
2837
2871
  });
2838
2872
  var UPPER_FACE_SET = new Set(UPPER_FACE_BLENDSHAPES);
2839
2873
 
2840
- // src/inference/WhisperInference.ts
2841
- var logger4 = createLogger("Whisper");
2842
- var WhisperInference = class _WhisperInference {
2843
- constructor(config = {}) {
2844
- this.pipeline = null;
2845
- this.currentModel = null;
2874
+ // src/inference/kaldiFbank.ts
2875
/**
 * In-place iterative radix-2 FFT (bit-reversal permutation followed by
 * butterfly passes of doubling size).
 *
 * On return, `re` and `im` hold the real and imaginary parts of the forward
 * DFT of the complex input, using the negative-exponent convention and no
 * scaling.
 *
 * @param re Real parts of the input; overwritten with the result.
 * @param im Imaginary parts of the input; overwritten with the result.
 * @throws {RangeError} If `re.length` is not zero or a power of two, or if
 *   `im` is shorter than `re`. Without this guard such inputs silently
 *   produce NaN/garbage because the butterflies index past the arrays.
 */
function fft(re, im) {
  const n = re.length;
  if ((n & (n - 1)) !== 0) {
    throw new RangeError(`fft: length must be a power of two, got ${n}`);
  }
  if (im.length < n) {
    throw new RangeError(`fft: im has ${im.length} elements but re has ${n}`);
  }

  // Bit-reversal permutation: j tracks the bit-reversed counterpart of i.
  for (let i = 1, j = 0; i < n; i++) {
    let bit = n >> 1;
    while (j & bit) {
      j ^= bit;
      bit >>= 1;
    }
    j ^= bit;
    if (i < j) {
      let tmp = re[i];
      re[i] = re[j];
      re[j] = tmp;
      tmp = im[i];
      im[i] = im[j];
      im[j] = tmp;
    }
  }

  // Butterfly passes over sub-transforms of size 2, 4, ..., n.
  for (let len = 2; len <= n; len *= 2) {
    const halfLen = len / 2;
    const angle = -2 * Math.PI / len;
    const wRe = Math.cos(angle);
    const wIm = Math.sin(angle);
    for (let i = 0; i < n; i += len) {
      // Twiddle factor, advanced by complex multiplication with (wRe, wIm).
      let curRe = 1;
      let curIm = 0;
      for (let j = 0; j < halfLen; j++) {
        const a = i + j;
        const b = a + halfLen;
        const tRe = curRe * re[b] - curIm * im[b];
        const tIm = curRe * im[b] + curIm * re[b];
        re[b] = re[a] - tRe;
        im[b] = im[a] - tIm;
        re[a] += tRe;
        im[a] += tIm;
        const nextRe = curRe * wRe - curIm * wIm;
        curIm = curRe * wIm + curIm * wRe;
        curRe = nextRe;
      }
    }
  }
}
2917
/**
 * Maps a frequency in Hz onto the mel scale: mel = 1127 * ln(1 + f / 700).
 *
 * @param freq Frequency in Hz.
 * @returns The corresponding mel value.
 */
function htkMel(freq) {
  const ratio = freq / 700;
  return Math.log(1 + ratio) * 1127;
}
2920
/**
 * Maps a mel value back to a frequency in Hz: f = 700 * (exp(mel / 1127) - 1).
 *
 * @param mel Value on the mel scale.
 * @returns The corresponding frequency in Hz.
 */
function htkMelInverse(mel) {
  const growth = Math.exp(mel / 1127);
  return (growth - 1) * 700;
}
2923
/**
 * Builds a bank of triangular mel filters in sparse form.
 *
 * `numBins + 2` points are spaced evenly on the mel scale between `lowFreq`
 * and `highFreq`, converted to (fractional) FFT bin positions, and each run
 * of three consecutive points defines one triangle (left, centre, right).
 *
 * @param numBins Number of mel filters to build.
 * @param fftSize FFT length; the spectrum has `fftSize / 2 + 1` bins.
 * @param sampleRate Sample rate in Hz.
 * @param lowFreq Lower edge of the filterbank in Hz.
 * @param highFreq Upper edge of the filterbank in Hz.
 * @returns One `{ startBin, weights }` entry per filter, where `weights[k]`
 *   applies to FFT bin `startBin + k`. A filter with no FFT bin inside its
 *   support (for example one lying entirely above the last bin because
 *   `highFreq` exceeds Nyquist) gets an empty `weights` array.
 */
function buildMelFilterbank(numBins, fftSize, sampleRate, lowFreq, highFreq) {
  const numFftBins = fftSize / 2 + 1;
  const lowMel = htkMel(lowFreq);
  const highMel = htkMel(highFreq);

  // Equally spaced points on the mel axis (filter edges and centres).
  const melPoints = new Float64Array(numBins + 2);
  for (let i = 0; i < numBins + 2; i++) {
    melPoints[i] = lowMel + (highMel - lowMel) * i / (numBins + 1);
  }

  // The same points expressed as fractional FFT bin indices.
  const binFreqs = new Float64Array(numBins + 2);
  for (let i = 0; i < numBins + 2; i++) {
    binFreqs[i] = htkMelInverse(melPoints[i]) * fftSize / sampleRate;
  }

  const filters = [];
  for (let m = 0; m < numBins; m++) {
    const left = binFreqs[m];
    const center = binFreqs[m + 1];
    const right = binFreqs[m + 2];
    const startBin = Math.max(0, Math.ceil(left));
    const endBin = Math.min(numFftBins - 1, Math.floor(right));
    // Clamp to zero: after clamping to the spectrum, endBin can fall more than
    // one bin below startBin, and a negative typed-array length throws.
    const weights = new Float32Array(Math.max(0, endBin - startBin + 1));
    for (let k = startBin; k <= endBin; k++) {
      if (k <= center) {
        // Rising slope; a degenerate (zero-width) slope contributes nothing.
        weights[k - startBin] = center - left > 0 ? (k - left) / (center - left) : 0;
      } else {
        // Falling slope.
        weights[k - startBin] = right - center > 0 ? (right - k) / (right - center) : 0;
      }
    }
    filters.push({ startBin, weights });
  }
  return filters;
}
2954
/**
 * Creates a symmetric Hamming window:
 * w[i] = 0.54 - 0.46 * cos(2 * PI * i / (length - 1)).
 *
 * @param length Number of window samples.
 * @returns A Float32Array of `length` coefficients. A single-sample window is
 *   `[1]`; the formula would otherwise evaluate 0 / 0 and yield NaN.
 */
function createHammingWindow(length) {
  const coeffs = new Float32Array(length);
  if (length === 1) {
    coeffs[0] = 1;
    return coeffs;
  }
  for (let i = 0; i < length; i++) {
    coeffs[i] = 0.54 - 0.46 * Math.cos(2 * Math.PI * i / (length - 1));
  }
  return coeffs;
}
2961
/**
 * Computes log mel filterbank features for a mono audio signal.
 *
 * Pipeline: scale samples by 32768, optionally add Gaussian dither, slice
 * into overlapping frames, apply pre-emphasis and a Hamming window, zero-pad
 * each frame to the next power of two, take the power spectrum, apply the
 * triangular mel filterbank and take the natural log (floored at 1e-10).
 *
 * @param audio Audio samples (the 32768 scaling suggests a [-1, 1] range).
 * @param sampleRate Sample rate of `audio` in Hz.
 * @param numMelBins Number of mel filters, i.e. values per output frame.
 * @param opts Optional overrides: `frameLengthMs` (25), `frameShiftMs` (10),
 *   `lowFreq` (20), `highFreq` (sampleRate / 2), `dither` (0) and
 *   `preemphasis` (0.97).
 * @returns Flat Float32Array of `numFrames * numMelBins` values, frame-major.
 *   Empty when the audio is shorter than one frame.
 * @throws {RangeError} If the frame length or frame shift rounds to less than
 *   one sample. Previously this produced an Infinity/NaN frame count, which
 *   ended in an opaque typed-array RangeError or a silently empty result.
 */
function computeKaldiFbank(audio, sampleRate, numMelBins, opts) {
  const frameLengthMs = opts?.frameLengthMs ?? 25;
  const frameShiftMs = opts?.frameShiftMs ?? 10;
  const lowFreq = opts?.lowFreq ?? 20;
  const highFreq = opts?.highFreq ?? sampleRate / 2;
  const dither = opts?.dither ?? 0;
  const preemphasis = opts?.preemphasis ?? 0.97;
  const frameLengthSamples = Math.round(sampleRate * frameLengthMs / 1e3);
  const frameShiftSamples = Math.round(sampleRate * frameShiftMs / 1e3);
  if (!(frameLengthSamples > 0) || !(frameShiftSamples > 0)) {
    throw new RangeError(
      `computeKaldiFbank: frame length (${frameLengthSamples}) and frame shift (${frameShiftSamples}) must each be at least one sample`
    );
  }

  // Work on a scaled copy so the caller's buffer is left untouched.
  const scaled = new Float32Array(audio.length);
  for (let i = 0; i < audio.length; i++) {
    scaled[i] = audio[i] * 32768;
  }
  if (dither > 0) {
    // Box-Muller transform; the 1e-10 offset keeps log() away from zero.
    for (let i = 0; i < scaled.length; i++) {
      const u1 = Math.random();
      const u2 = Math.random();
      scaled[i] += dither * Math.sqrt(-2 * Math.log(u1 + 1e-10)) * Math.cos(2 * Math.PI * u2);
    }
  }

  // Only complete frames are produced; a trailing partial frame is dropped.
  const numFrames = Math.max(0, Math.floor((scaled.length - frameLengthSamples) / frameShiftSamples) + 1);
  if (numFrames === 0) {
    return new Float32Array(0);
  }

  // The FFT needs a power-of-two length; frames are zero-padded up to it.
  let fftSize = 1;
  while (fftSize < frameLengthSamples) fftSize *= 2;
  const numFftBins = fftSize / 2 + 1;
  const window2 = createHammingWindow(frameLengthSamples);
  const filters = buildMelFilterbank(numMelBins, fftSize, sampleRate, lowFreq, highFreq);
  const output = new Float32Array(numFrames * numMelBins);
  const fftRe = new Float64Array(fftSize);
  const fftIm = new Float64Array(fftSize);
  const usePreemphasis = preemphasis > 0;

  for (let f = 0; f < numFrames; f++) {
    const offset = f * frameShiftSamples;
    fftRe.fill(0);
    fftIm.fill(0);
    for (let i = 0; i < frameLengthSamples; i++) {
      const pos = offset + i;
      let sample = scaled[pos];
      // Pre-emphasis uses the previous sample of the whole signal (reaching
      // back into the preceding frame when i is 0); the very first sample of
      // the signal has no predecessor and is left unchanged.
      if (usePreemphasis && pos > 0) {
        sample -= preemphasis * scaled[pos - 1];
      }
      fftRe[i] = sample * window2[i];
    }
    fft(fftRe, fftIm);

    const outOffset = f * numMelBins;
    for (let m = 0; m < numMelBins; m++) {
      const filter = filters[m];
      let energy = 0;
      for (let k = 0; k < filter.weights.length; k++) {
        const bin = filter.startBin + k;
        if (bin < numFftBins) {
          const powerSpec = fftRe[bin] * fftRe[bin] + fftIm[bin] * fftIm[bin];
          energy += filter.weights[k] * powerSpec;
        }
      }
      // Floor the energy so silent bands give a finite log value.
      output[outOffset + m] = Math.log(Math.max(energy, 1e-10));
    }
  }
  return output;
}
3023
/**
 * Low-frame-rate stacking: concatenates `lfrM` consecutive feature frames
 * into one output frame and advances by `lfrN` input frames per output frame.
 *
 * The window for output frame i starts at input frame
 * `i * lfrN - floor((lfrM - 1) / 2)`; indices before the first frame reuse
 * frame 0 and indices past the end reuse the last frame.
 *
 * NOTE(review): the output frame count is derived from the left-padded
 * length, ceil((numFrames + leftPad) / lfrN). Confirm this matches the
 * frontend the model was trained with.
 *
 * @param features Flat, frame-major feature array.
 * @param featureDim Number of values per input frame.
 * @param lfrM Frames stacked per output frame (default 7).
 * @param lfrN Input frames advanced per output frame (default 6).
 * @returns Flat Float32Array with `featureDim * lfrM` values per output frame.
 */
function applyLFR(features, featureDim, lfrM = 7, lfrN = 6) {
  const numFrames = features.length / featureDim;
  if (numFrames === 0) {
    return new Float32Array(0);
  }
  const leftPad = Math.floor((lfrM - 1) / 2);
  const stackedFrames = Math.ceil((numFrames + leftPad) / lfrN);
  const stackedDim = featureDim * lfrM;
  const stacked = new Float32Array(stackedFrames * stackedDim);

  for (let outIdx = 0; outIdx < stackedFrames; outIdx++) {
    const windowStart = outIdx * lfrN - leftPad;
    const rowBase = outIdx * stackedDim;
    for (let slot = 0; slot < lfrM; slot++) {
      // Clamp to the valid frame range (edge frames are repeated).
      const wanted = windowStart + slot;
      const nonNegative = wanted < 0 ? 0 : wanted;
      const srcFrame = nonNegative >= numFrames ? numFrames - 1 : nonNegative;
      const readBase = srcFrame * featureDim;
      const writeBase = rowBase + slot * featureDim;
      for (let c = 0; c < featureDim; c++) {
        stacked[writeBase + c] = features[readBase + c];
      }
    }
  }
  return stacked;
}
3046
/**
 * Normalizes features in place: each value becomes
 * `(value + negMean[d]) * invStddev[d]`, where `d` is the value's position
 * within its frame (flat index modulo `dim`).
 *
 * @param features Flat, frame-major feature array; modified in place.
 * @param dim Number of values per frame.
 * @param negMean Per-dimension offsets that are added to the values.
 * @param invStddev Per-dimension scale factors.
 * @returns The same `features` array that was passed in.
 */
function applyCMVN(features, dim, negMean, invStddev) {
  const total = features.length;
  let idx = 0;
  while (idx < total) {
    const col = idx % dim;
    const shifted = features[idx] + negMean[col];
    features[idx] = shifted * invStddev[col];
    idx += 1;
  }
  return features;
}
3053
/**
 * Parses the two comma-separated number lists that carry the CMVN statistics.
 *
 * Each field is trimmed and run through `parseFloat`; a field that is not
 * numeric (including an empty one) becomes NaN.
 *
 * @param negMeanStr Comma-separated per-dimension offsets.
 * @param invStddevStr Comma-separated per-dimension scale factors.
 * @returns `{ negMean, invStddev }`, each a Float32Array.
 */
function parseCMVNFromMetadata(negMeanStr, invStddevStr) {
  const toFloats = (csv) => Float32Array.from(csv.split(","), (field) => parseFloat(field.trim()));
  const negMean = toFloats(negMeanStr);
  const invStddev = toFloats(invStddevStr);
  return { negMean, invStddev };
}
3062
+
3063
+ // src/inference/ctcDecoder.ts
3064
/**
 * Maps a language code to the numeric language id passed to the model.
 *
 * A `switch` is used instead of a plain-object lookup so that names inherited
 * from `Object.prototype` ("toString", "constructor", "__proto__", ...) are
 * treated like any other unknown code rather than returning a non-numeric
 * value.
 *
 * @param language One of "auto", "zh", "en", "yue", "ja", "ko".
 * @returns The language id; 0 (the "auto" id) for unknown or missing values.
 */
function resolveLanguageId(language) {
  switch (language) {
    case "zh":
      return 3;
    case "en":
      return 4;
    case "yue":
      return 7;
    case "ja":
      return 11;
    case "ko":
      return 12;
    case "auto":
    default:
      return 0;
  }
}
3075
/**
 * Maps the text-normalization mode to the numeric id passed to the model.
 *
 * @param textNorm "without_itn" selects id 15; any other value selects 14.
 * @returns The text-normalization id.
 */
function resolveTextNormId(textNorm) {
  if (textNorm === "without_itn") {
    return 15;
  }
  return 14;
}
3078
/**
 * Parses a vocabulary file with one `<token> <id>` entry per line.
 *
 * Each line is trimmed and split at its last space, so the token itself may
 * contain spaces. Blank lines, lines without a space and lines whose id does
 * not parse as a base-10 integer are skipped.
 *
 * @param content Full text of the tokens file.
 * @returns Map from numeric token id to token string.
 */
function parseTokensFile(content) {
  const idToToken = new Map();
  content.split("\n").forEach((rawLine) => {
    const entry = rawLine.trim();
    if (entry === "") return;
    const sep = entry.lastIndexOf(" ");
    if (sep === -1) return;
    const id = parseInt(entry.slice(sep + 1), 10);
    if (Number.isNaN(id)) return;
    idToToken.set(id, entry.slice(0, sep));
  });
  return idToToken;
}
3094
/**
 * Classifies a special token of the form `<|value|>`.
 *
 * @param token Token string from the vocabulary.
 * @returns `{ type, value }` where `type` is "language", "emotion", "event"
 *   or "textnorm"; `null` when the token is not wrapped in `<|...|>` or its
 *   value is not one of the recognized names.
 */
function parseStructuredToken(token) {
  const match = token.match(/^<\|(.+)\|>$/);
  if (!match) return null;
  const value = match[1];
  switch (value) {
    case "zh":
    case "en":
    case "ja":
    case "ko":
    case "yue":
    case "nospeech":
      return { type: "language", value };
    case "HAPPY":
    case "SAD":
    case "ANGRY":
    case "NEUTRAL":
    case "FEARFUL":
    case "DISGUSTED":
    case "SURPRISED":
    case "EMO_UNKNOWN":
      return { type: "emotion", value };
    case "Speech":
    case "BGM":
    case "Applause":
    case "Laughter":
    case "Crying":
    case "Coughing":
    case "Sneezing":
    case "EVENT_UNKNOWN":
      return { type: "event", value };
    case "withitn":
    case "woitn":
    case "with_itn":
    case "without_itn":
      return { type: "textnorm", value };
    default:
      return null;
  }
}
3114
/**
 * Greedy CTC decoding of a flat logits matrix.
 *
 * For every time step the highest-scoring token id is taken (the lowest id
 * wins ties), consecutive repeats are collapsed, and ids 0, 1 and 2 are
 * dropped (presumably blank/special symbols; confirm against the model's
 * vocabulary). The remaining ids are looked up in `tokenMap`: `<|...|>`
 * tokens recognized by `parseStructuredToken` fill `language`, `emotion` and
 * `event` (the last occurrence wins; "textnorm" tokens are discarded), and
 * all other tokens are concatenated into the text.
 *
 * @param logits Flat array of `seqLen * vocabSize` scores, time-major.
 * @param seqLen Number of time steps to decode.
 * @param vocabSize Number of scores per time step.
 * @param tokenMap Map from token id to token string.
 * @returns `{ text, language, emotion, event }`; the last three are
 *   `undefined` when no matching token was emitted. In `text`, U+2581 is
 *   replaced by a space and the result is trimmed.
 */
function ctcGreedyDecode(logits, seqLen, vocabSize, tokenMap) {
  // Argmax per frame, collapsing runs of the same id and dropping ids 0-2.
  const keptIds = [];
  let lastId = -1;
  for (let step = 0; step < seqLen; step++) {
    const base = step * vocabSize;
    let bestId = 0;
    let bestScore = logits[base];
    for (let cand = 1; cand < vocabSize; cand++) {
      const score = logits[base + cand];
      if (score > bestScore) {
        bestScore = score;
        bestId = cand;
      }
    }
    if (bestId === lastId) continue;
    lastId = bestId;
    if (bestId !== 0 && bestId !== 1 && bestId !== 2) {
      keptIds.push(bestId);
    }
  }

  let language;
  let emotion;
  let event;
  const pieces = [];
  keptIds.forEach((id) => {
    const token = tokenMap.get(id);
    if (!token) return;
    const tag = parseStructuredToken(token);
    if (!tag) {
      pieces.push(token);
      return;
    }
    switch (tag.type) {
      case "language":
        language = tag.value;
        break;
      case "emotion":
        emotion = tag.value;
        break;
      case "event":
        event = tag.value;
        break;
      default:
        // Other structured tokens (e.g. text-norm markers) carry no output.
        break;
    }
  });

  // U+2581 marks word boundaries in the token strings.
  const text = pieces.join("").replace(/\u2581/g, " ").trim();
  return { text, language, emotion, event };
}
3157
+
3158
+ // src/inference/SenseVoiceInference.ts
3159
+ var logger4 = createLogger("SenseVoice");
3160
+ var SenseVoiceInference = class {
3161
+ constructor(config) {
3162
+ this.session = null;
3163
+ this.ort = null;
3164
+ this._backend = "wasm";
2846
3165
  this.isLoading = false;
2847
- this.actualBackend = "unknown";
3166
+ this.inferenceQueue = Promise.resolve();
3167
+ // Preprocessing state (loaded once)
3168
+ this.tokenMap = null;
3169
+ this.negMean = null;
3170
+ this.invStddev = null;
3171
+ this.languageId = 0;
3172
+ this.textNormId = 14;
3173
+ const modelDir = config.modelUrl.substring(0, config.modelUrl.lastIndexOf("/"));
3174
+ const tokensUrl = config.tokensUrl ?? `${modelDir}/tokens.txt`;
2848
3175
  this.config = {
2849
- model: config.model || "tiny",
2850
- multilingual: config.multilingual || false,
2851
- language: config.language || "en",
2852
- task: config.task || "transcribe",
2853
- dtype: config.dtype || "q8",
2854
- device: config.device || "auto",
2855
- localModelPath: config.localModelPath,
2856
- token: config.token,
2857
- suppressNonSpeech: config.suppressNonSpeech !== false
2858
- // Default true
3176
+ modelUrl: config.modelUrl,
3177
+ tokensUrl,
3178
+ language: config.language ?? "auto",
3179
+ textNorm: config.textNorm ?? "with_itn",
3180
+ backend: config.backend ?? "auto"
2859
3181
  };
3182
+ this.languageId = resolveLanguageId(this.config.language);
3183
+ this.textNormId = resolveTextNormId(this.config.textNorm);
2860
3184
  }
2861
- /**
2862
- * Check if WebGPU is available in this browser
2863
- */
2864
- static async isWebGPUAvailable() {
2865
- return "gpu" in navigator;
3185
+ get backend() {
3186
+ return this.session ? this._backend : null;
2866
3187
  }
2867
- /**
2868
- * Load the Whisper model pipeline
2869
- */
3188
+ get isLoaded() {
3189
+ return this.session !== null;
3190
+ }
3191
+ // ─── Load ───────────────────────────────────────────────────────────────
2870
3192
  async load(onProgress) {
2871
3193
  if (this.isLoading) {
2872
- logger4.debug("Already loading model, waiting...");
2873
- while (this.isLoading) {
2874
- await new Promise((resolve) => setTimeout(resolve, 100));
2875
- }
2876
- return;
3194
+ throw new Error("Model is already loading");
2877
3195
  }
2878
- const modelName = this.getModelName();
2879
- if (this.pipeline !== null && this.currentModel === modelName) {
2880
- logger4.debug("Model already loaded", { model: modelName });
2881
- return;
3196
+ if (this.session) {
3197
+ throw new Error("Model already loaded. Call dispose() first.");
2882
3198
  }
2883
3199
  this.isLoading = true;
3200
+ const startTime = performance.now();
2884
3201
  const telemetry = getTelemetry();
2885
- const span = telemetry?.startSpan("whisper.load", {
2886
- "whisper.model": modelName,
2887
- "whisper.dtype": this.config.dtype,
2888
- "whisper.device": this.config.device
3202
+ const span = telemetry?.startSpan("SenseVoice.load", {
3203
+ "model.url": this.config.modelUrl,
3204
+ "model.backend_requested": this.config.backend
2889
3205
  });
2890
3206
  try {
2891
- const loadStart = performance.now();
2892
- logger4.info("Loading model", {
2893
- model: modelName,
2894
- dtype: this.config.dtype,
2895
- device: this.config.device,
2896
- multilingual: this.config.multilingual
2897
- });
2898
- if (this.pipeline !== null && this.currentModel !== modelName) {
2899
- logger4.debug("Disposing old model", { oldModel: this.currentModel });
2900
- await this.pipeline.dispose();
2901
- this.pipeline = null;
3207
+ logger4.info("Loading ONNX Runtime...", { preference: this.config.backend });
3208
+ const { ort, backend } = await getOnnxRuntimeForPreference(this.config.backend);
3209
+ this.ort = ort;
3210
+ this._backend = backend;
3211
+ logger4.info("ONNX Runtime loaded", { backend: this._backend });
3212
+ logger4.debug("Fetching tokens vocabulary", { tokensUrl: this.config.tokensUrl });
3213
+ const tokensResponse = await fetch(this.config.tokensUrl);
3214
+ if (!tokensResponse.ok) {
3215
+ throw new Error(`Failed to fetch tokens.txt: ${tokensResponse.status} ${tokensResponse.statusText}`);
2902
3216
  }
2903
- const hasWebGPU = await _WhisperInference.isWebGPUAvailable();
2904
- const device = this.config.device === "auto" ? hasWebGPU ? "webgpu" : "wasm" : this.config.device;
2905
- logger4.info("Creating pipeline", { device, hasWebGPU });
2906
- __webpack_exports__env.allowLocalModels = false;
2907
- __webpack_exports__env.allowRemoteModels = true;
2908
- __webpack_exports__env.useBrowserCache = false;
2909
- __webpack_exports__env.useCustomCache = false;
2910
- __webpack_exports__env.useWasmCache = false;
2911
- if (__webpack_exports__env.backends.onnx.wasm) {
2912
- __webpack_exports__env.backends.onnx.wasm.proxy = false;
2913
- __webpack_exports__env.backends.onnx.wasm.numThreads = 1;
3217
+ const tokensText = await tokensResponse.text();
3218
+ this.tokenMap = parseTokensFile(tokensText);
3219
+ logger4.debug("Tokens loaded", { vocabSize: this.tokenMap.size });
3220
+ const sessionOptions = getSessionOptions(this._backend);
3221
+ if (this._backend === "webgpu") {
3222
+ sessionOptions.graphOptimizationLevel = "basic";
2914
3223
  }
2915
- logger4.info("Configured transformers.js env", {
2916
- allowLocalModels: __webpack_exports__env.allowLocalModels,
2917
- useBrowserCache: __webpack_exports__env.useBrowserCache,
2918
- useWasmCache: __webpack_exports__env.useWasmCache
2919
- });
2920
- const pipelineOptions = {
2921
- dtype: this.config.dtype,
2922
- device,
2923
- progress_callback: onProgress,
2924
- // For medium models, use no_attentions revision to save memory
2925
- revision: modelName.includes("whisper-medium") ? "no_attentions" : "main",
2926
- // Pass HuggingFace token to bypass rate limits
2927
- ...this.config.token && { token: this.config.token }
2928
- };
2929
- if (device === "webgpu") {
2930
- pipelineOptions.session_options = {
2931
- executionProviders: ["webgpu"]
2932
- };
2933
- logger4.info("Forcing WebGPU execution providers");
3224
+ let isCached = false;
3225
+ if (isIOS()) {
3226
+ logger4.info("iOS: passing model URL directly to ORT (low-memory path)", {
3227
+ modelUrl: this.config.modelUrl
3228
+ });
3229
+ this.session = await this.ort.InferenceSession.create(
3230
+ this.config.modelUrl,
3231
+ sessionOptions
3232
+ );
3233
+ } else {
3234
+ const cache = getModelCache();
3235
+ isCached = await cache.has(this.config.modelUrl);
3236
+ let modelBuffer;
3237
+ if (isCached) {
3238
+ logger4.debug("Loading model from cache", { modelUrl: this.config.modelUrl });
3239
+ modelBuffer = await cache.get(this.config.modelUrl);
3240
+ onProgress?.(modelBuffer.byteLength, modelBuffer.byteLength);
3241
+ } else {
3242
+ logger4.debug("Fetching and caching model", { modelUrl: this.config.modelUrl });
3243
+ modelBuffer = await fetchWithCache(this.config.modelUrl, onProgress);
3244
+ }
3245
+ logger4.debug("Creating ONNX session", {
3246
+ size: formatBytes(modelBuffer.byteLength),
3247
+ backend: this._backend
3248
+ });
3249
+ const modelData = new Uint8Array(modelBuffer);
3250
+ this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
2934
3251
  }
2935
- this.pipeline = await __webpack_exports__pipeline(
2936
- "automatic-speech-recognition",
2937
- modelName,
2938
- pipelineOptions
2939
- );
2940
- this.actualBackend = device;
2941
- this.currentModel = modelName;
2942
- const loadTimeMs = performance.now() - loadStart;
2943
- logger4.info("Model loaded successfully", {
2944
- model: modelName,
2945
- loadTimeMs: Math.round(loadTimeMs)
3252
+ try {
3253
+ const metadata = this.session.handler?.metadata;
3254
+ if (metadata?.neg_mean && metadata?.inv_stddev) {
3255
+ const cmvn = parseCMVNFromMetadata(metadata.neg_mean, metadata.inv_stddev);
3256
+ this.negMean = cmvn.negMean;
3257
+ this.invStddev = cmvn.invStddev;
3258
+ logger4.debug("CMVN loaded from model metadata", { dim: this.negMean.length });
3259
+ } else {
3260
+ logger4.warn("CMVN not found in model metadata \u2014 features will not be normalized");
3261
+ }
3262
+ } catch (cmvnErr) {
3263
+ logger4.warn("Failed to read CMVN from model metadata", { error: cmvnErr });
3264
+ }
3265
+ const loadTimeMs = performance.now() - startTime;
3266
+ logger4.info("SenseVoice model loaded", {
3267
+ backend: this._backend,
3268
+ loadTimeMs: Math.round(loadTimeMs),
3269
+ vocabSize: this.tokenMap.size,
3270
+ inputs: this.session.inputNames,
3271
+ outputs: this.session.outputNames,
3272
+ hasCMVN: this.negMean !== null
2946
3273
  });
2947
3274
  span?.setAttributes({
2948
- "whisper.load_time_ms": loadTimeMs
3275
+ "model.backend": this._backend,
3276
+ "model.load_time_ms": loadTimeMs,
3277
+ "model.cached": !isIOS() && isCached,
3278
+ "model.vocab_size": this.tokenMap.size
2949
3279
  });
2950
3280
  span?.end();
2951
- } catch (error) {
2952
- const errorDetails = {
2953
- message: error instanceof Error ? error.message : String(error),
2954
- stack: error instanceof Error ? error.stack : void 0,
2955
- name: error instanceof Error ? error.name : void 0,
2956
- error
3281
+ telemetry?.recordHistogram("omote.model.load_time", loadTimeMs, {
3282
+ model: "sensevoice",
3283
+ backend: this._backend
3284
+ });
3285
+ return {
3286
+ backend: this._backend,
3287
+ loadTimeMs,
3288
+ inputNames: [...this.session.inputNames],
3289
+ outputNames: [...this.session.outputNames],
3290
+ vocabSize: this.tokenMap.size
2957
3291
  };
2958
- logger4.error("Failed to load model", errorDetails);
2959
- span?.endWithError(error);
3292
+ } catch (error) {
3293
+ span?.endWithError(error instanceof Error ? error : new Error(String(error)));
3294
+ telemetry?.incrementCounter("omote.errors.total", 1, {
3295
+ model: "sensevoice",
3296
+ error_type: "load_failed"
3297
+ });
2960
3298
  throw error;
2961
3299
  } finally {
2962
3300
  this.isLoading = false;
2963
3301
  }
2964
3302
  }
3303
+ // ─── Transcribe ─────────────────────────────────────────────────────────
2965
3304
  /**
2966
- * Transcribe audio to text
3305
+ * Transcribe audio samples to text
2967
3306
  *
2968
- * @param audio Audio samples (Float32Array, 16kHz mono)
2969
- * @param options Transcription options
3307
+ * @param audioSamples Float32Array of audio samples at 16kHz, [-1, 1] range
3308
+ * @returns Transcription result with text, emotion, language, and event
2970
3309
  */
2971
- async transcribe(audio, options) {
2972
- if (!this.pipeline) {
3310
+ async transcribe(audioSamples) {
3311
+ if (!this.session || !this.ort || !this.tokenMap) {
2973
3312
  throw new Error("Model not loaded. Call load() first.");
2974
3313
  }
2975
- const audioCopy = new Float32Array(audio);
2976
- const telemetry = getTelemetry();
2977
- const span = telemetry?.startSpan("whisper.transcribe", {
2978
- "audio.samples": audioCopy.length,
2979
- "audio.duration_s": audioCopy.length / 16e3,
2980
- "whisper.model": this.currentModel
2981
- });
2982
- try {
2983
- const inferStart = performance.now();
2984
- const audioDurationSec = audioCopy.length / 16e3;
2985
- const isShortAudio = audioDurationSec < 10;
2986
- logger4.debug("Starting transcription", {
2987
- audioSamples: audioCopy.length,
2988
- durationSeconds: audioDurationSec.toFixed(2),
2989
- isShortAudio
2990
- });
2991
- const transcribeOptions = {
2992
- // Decoding strategy
2993
- top_k: 0,
2994
- do_sample: false,
2995
- // Adaptive chunking: Disable for short audio, enable for long audio
2996
- chunk_length_s: options?.chunkLengthS || (isShortAudio ? audioDurationSec : 30),
2997
- stride_length_s: options?.strideLengthS || (isShortAudio ? 0 : 5),
2998
- // Timestamps
2999
- return_timestamps: options?.returnTimestamps || false,
3000
- force_full_sequences: false
3001
- };
3002
- if (this.config.multilingual) {
3003
- transcribeOptions.language = options?.language || this.config.language;
3004
- transcribeOptions.task = options?.task || this.config.task;
3005
- }
3006
- const rawResult = await this.pipeline(audioCopy, transcribeOptions);
3007
- const result = Array.isArray(rawResult) ? rawResult[0] : rawResult;
3008
- const inferenceTimeMs = performance.now() - inferStart;
3009
- let cleanedText = result.text;
3010
- if (this.config.suppressNonSpeech) {
3011
- cleanedText = this.removeNonSpeechTokens(cleanedText);
3012
- }
3013
- const transcription = {
3014
- text: cleanedText,
3015
- language: this.config.language,
3016
- inferenceTimeMs,
3017
- chunks: result.chunks
3018
- };
3019
- logger4.debug("Transcription complete", {
3020
- text: transcription.text,
3021
- inferenceTimeMs: Math.round(inferenceTimeMs),
3022
- chunksCount: result.chunks?.length || 0
3023
- });
3024
- span?.setAttributes({
3025
- "whisper.inference_time_ms": inferenceTimeMs,
3026
- "whisper.text_length": transcription.text.length
3027
- });
3028
- span?.end();
3029
- return transcription;
3030
- } catch (error) {
3031
- logger4.error("Transcribe error", { error });
3032
- span?.endWithError(error);
3033
- throw new Error(`Whisper transcription failed: ${error}`);
3034
- }
3314
+ const audio = new Float32Array(audioSamples);
3315
+ return this.queueInference(audio);
3035
3316
  }
3036
- /**
3037
- * Transcribe with streaming chunks (progressive results)
3038
- *
3039
- * @param audio Audio samples
3040
- * @param onChunk Called when each chunk is finalized
3041
- * @param onUpdate Called after each generation step (optional)
3042
- */
3043
- async transcribeStreaming(audio, onChunk, onUpdate, options) {
3044
- if (!this.pipeline) {
3045
- throw new Error("Model not loaded. Call load() first.");
3046
- }
3047
- const telemetry = getTelemetry();
3048
- const span = telemetry?.startSpan("whisper.transcribe_streaming", {
3049
- "audio.samples": audio.length,
3050
- "audio.duration_s": audio.length / 16e3
3051
- });
3052
- try {
3053
- const inferStart = performance.now();
3054
- logger4.debug("Starting streaming transcription", {
3055
- audioSamples: audio.length,
3056
- durationSeconds: (audio.length / 16e3).toFixed(2)
3057
- });
3058
- const transcribeOptions = {
3059
- top_k: 0,
3060
- do_sample: false,
3061
- chunk_length_s: options?.chunkLengthS || 30,
3062
- stride_length_s: options?.strideLengthS || 5,
3063
- return_timestamps: true,
3064
- force_full_sequences: false
3065
- };
3066
- if (this.config.multilingual) {
3067
- transcribeOptions.language = options?.language || this.config.language;
3068
- transcribeOptions.task = options?.task || this.config.task;
3069
- }
3070
- const rawResult = await this.pipeline(audio, transcribeOptions);
3071
- const result = Array.isArray(rawResult) ? rawResult[0] : rawResult;
3072
- const inferenceTimeMs = performance.now() - inferStart;
3073
- if (result.chunks && onChunk) {
3074
- for (const chunk of result.chunks) {
3075
- onChunk({
3076
- text: chunk.text,
3077
- timestamp: chunk.timestamp
3317
+ queueInference(audio) {
3318
+ return new Promise((resolve, reject) => {
3319
+ this.inferenceQueue = this.inferenceQueue.then(async () => {
3320
+ const telemetry = getTelemetry();
3321
+ const span = telemetry?.startSpan("SenseVoice.transcribe", {
3322
+ "inference.backend": this._backend,
3323
+ "inference.input_samples": audio.length
3324
+ });
3325
+ try {
3326
+ const startTime = performance.now();
3327
+ const preprocessStart = performance.now();
3328
+ const fbank = computeKaldiFbank(audio, 16e3, 80);
3329
+ const numFrames = fbank.length / 80;
3330
+ if (numFrames === 0) {
3331
+ resolve({
3332
+ text: "",
3333
+ inferenceTimeMs: performance.now() - startTime,
3334
+ preprocessTimeMs: performance.now() - preprocessStart
3335
+ });
3336
+ return;
3337
+ }
3338
+ const lfrFeatures = applyLFR(fbank, 80, 7, 6);
3339
+ const numLfrFrames = lfrFeatures.length / 560;
3340
+ if (this.negMean && this.invStddev) {
3341
+ applyCMVN(lfrFeatures, 560, this.negMean, this.invStddev);
3342
+ }
3343
+ const preprocessTimeMs = performance.now() - preprocessStart;
3344
+ const ort = this.ort;
3345
+ const feeds = {
3346
+ x: new ort.Tensor("float32", lfrFeatures, [1, numLfrFrames, 560]),
3347
+ x_length: new ort.Tensor("int32", new Int32Array([numLfrFrames]), [1]),
3348
+ language: new ort.Tensor("int32", new Int32Array([this.languageId]), [1]),
3349
+ text_norm: new ort.Tensor("int32", new Int32Array([this.textNormId]), [1])
3350
+ };
3351
+ const results = await this.session.run(feeds);
3352
+ const logitsOutput = results["logits"];
3353
+ if (!logitsOutput) {
3354
+ throw new Error('Model output missing "logits" tensor');
3355
+ }
3356
+ const logitsData = logitsOutput.data;
3357
+ const logitsDims = logitsOutput.dims;
3358
+ const seqLen = logitsDims[1];
3359
+ const vocabSize = logitsDims[2];
3360
+ const decoded = ctcGreedyDecode(logitsData, seqLen, vocabSize, this.tokenMap);
3361
+ const inferenceTimeMs = performance.now() - startTime;
3362
+ logger4.trace("Transcription complete", {
3363
+ text: decoded.text.substring(0, 50),
3364
+ language: decoded.language,
3365
+ emotion: decoded.emotion,
3366
+ event: decoded.event,
3367
+ preprocessTimeMs: Math.round(preprocessTimeMs * 100) / 100,
3368
+ inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
3369
+ numFrames,
3370
+ numLfrFrames
3371
+ });
3372
+ span?.setAttributes({
3373
+ "inference.duration_ms": inferenceTimeMs,
3374
+ "inference.preprocess_ms": preprocessTimeMs,
3375
+ "inference.num_frames": numFrames,
3376
+ "inference.text_length": decoded.text.length
3377
+ });
3378
+ span?.end();
3379
+ telemetry?.recordHistogram("omote.inference.latency", inferenceTimeMs, {
3380
+ model: "sensevoice",
3381
+ backend: this._backend
3382
+ });
3383
+ telemetry?.incrementCounter("omote.inference.total", 1, {
3384
+ model: "sensevoice",
3385
+ backend: this._backend,
3386
+ status: "success"
3387
+ });
3388
+ resolve({
3389
+ text: decoded.text,
3390
+ language: decoded.language,
3391
+ emotion: decoded.emotion,
3392
+ event: decoded.event,
3393
+ inferenceTimeMs,
3394
+ preprocessTimeMs
3395
+ });
3396
+ } catch (err) {
3397
+ span?.endWithError(err instanceof Error ? err : new Error(String(err)));
3398
+ telemetry?.incrementCounter("omote.inference.total", 1, {
3399
+ model: "sensevoice",
3400
+ backend: this._backend,
3401
+ status: "error"
3078
3402
  });
3403
+ reject(err);
3079
3404
  }
3080
- }
3081
- if (onUpdate) {
3082
- onUpdate(result.text);
3083
- }
3084
- logger4.debug("Streaming transcription complete", {
3085
- text: result.text,
3086
- inferenceTimeMs: Math.round(inferenceTimeMs),
3087
- chunksCount: result.chunks?.length || 0
3088
- });
3089
- span?.setAttributes({
3090
- "whisper.inference_time_ms": inferenceTimeMs,
3091
- "whisper.chunks_count": result.chunks?.length || 0
3092
- });
3093
- span?.end();
3094
- return {
3095
- text: result.text,
3096
- language: this.config.language,
3097
- inferenceTimeMs,
3098
- chunks: result.chunks
3099
- };
3100
- } catch (error) {
3101
- logger4.error("Streaming transcribe error", { error });
3102
- span?.endWithError(error);
3103
- throw new Error(`Whisper streaming transcription failed: ${error}`);
3104
- }
3105
- }
3106
- /**
3107
- * Dispose of the model and free resources
3108
- */
3109
- async dispose() {
3110
- if (this.pipeline) {
3111
- logger4.debug("Disposing model", { model: this.currentModel });
3112
- await this.pipeline.dispose();
3113
- this.pipeline = null;
3114
- this.currentModel = null;
3115
- }
3116
- }
3117
- /**
3118
- * Check if model is loaded
3119
- */
3120
- get isLoaded() {
3121
- return this.pipeline !== null;
3122
- }
3123
- /**
3124
- * Get the backend being used (webgpu or wasm)
3125
- */
3126
- get backend() {
3127
- return this.actualBackend;
3128
- }
3129
- /**
3130
- * Get the full model name used by transformers.js
3131
- */
3132
- getModelName() {
3133
- if (this.config.localModelPath) {
3134
- return this.config.localModelPath;
3135
- }
3136
- let modelName = `onnx-community/whisper-${this.config.model}`;
3137
- if (!this.config.multilingual) {
3138
- modelName += ".en";
3139
- }
3140
- return modelName;
3405
+ });
3406
+ });
3141
3407
  }
3142
- /**
3143
- * Remove non-speech event tokens from transcription
3144
- *
3145
- * Whisper outputs special tokens for non-speech events like:
3146
- * [LAUGHTER], [APPLAUSE], [MUSIC], [BLANK_AUDIO], [CLICKING], etc.
3147
- *
3148
- * This method strips these tokens and cleans up extra whitespace.
3149
- */
3150
- removeNonSpeechTokens(text) {
3151
- const cleaned = text.replace(/\[[\w\s_]+\]/g, "");
3152
- return cleaned.replace(/\s+/g, " ").trim();
3408
+ // ─── Dispose ──────────────────────────────────────────────────────────
3409
+ async dispose() {
3410
+ if (this.session) {
3411
+ await this.session.release();
3412
+ this.session = null;
3413
+ }
3414
+ this.ort = null;
3415
+ this.tokenMap = null;
3416
+ this.negMean = null;
3417
+ this.invStddev = null;
3153
3418
  }
3154
3419
  };
3155
3420
 
@@ -4545,268 +4810,8 @@ var VADWorkerWithFallback = class {
4545
4810
  }
4546
4811
  };
4547
4812
 
4548
- // src/inference/Emotion2VecInference.ts
4549
- var logger10 = createLogger("Emotion2Vec");
4550
- var EMOTION2VEC_LABELS = ["neutral", "happy", "angry", "sad"];
4551
- var Emotion2VecInference = class {
4552
- constructor(config) {
4553
- this.session = null;
4554
- this.ort = null;
4555
- this._backend = "wasm";
4556
- this.isLoading = false;
4557
- this.inferenceQueue = Promise.resolve();
4558
- this.config = {
4559
- modelUrl: config.modelUrl,
4560
- backend: config.backend ?? "auto",
4561
- sampleRate: config.sampleRate ?? 16e3
4562
- };
4563
- }
4564
- get backend() {
4565
- return this.session ? this._backend : null;
4566
- }
4567
- get isLoaded() {
4568
- return this.session !== null;
4569
- }
4570
- get sampleRate() {
4571
- return this.config.sampleRate;
4572
- }
4573
- /**
4574
- * Load the ONNX model
4575
- */
4576
- async load() {
4577
- if (this.isLoading) {
4578
- throw new Error("Model is already loading");
4579
- }
4580
- if (this.session) {
4581
- throw new Error("Model already loaded. Call dispose() first.");
4582
- }
4583
- this.isLoading = true;
4584
- const startTime = performance.now();
4585
- const telemetry = getTelemetry();
4586
- const span = telemetry?.startSpan("Emotion2Vec.load", {
4587
- "model.url": this.config.modelUrl,
4588
- "model.backend_requested": this.config.backend
4589
- });
4590
- try {
4591
- logger10.info("Loading ONNX Runtime...", { preference: this.config.backend });
4592
- const { ort, backend } = await getOnnxRuntimeForPreference(this.config.backend);
4593
- this.ort = ort;
4594
- this._backend = backend;
4595
- logger10.info("ONNX Runtime loaded", { backend: this._backend });
4596
- logger10.info("Checking model cache...");
4597
- const cache = getModelCache();
4598
- const modelUrl = this.config.modelUrl;
4599
- const isCached = await cache.has(modelUrl);
4600
- logger10.info("Cache check complete", { modelUrl, isCached });
4601
- let modelBuffer;
4602
- if (isCached) {
4603
- logger10.info("Loading model from cache...", { modelUrl });
4604
- modelBuffer = await cache.get(modelUrl);
4605
- logger10.info("Model loaded from cache", { size: formatBytes(modelBuffer.byteLength) });
4606
- } else {
4607
- logger10.info("Fetching model (not cached)...", { modelUrl });
4608
- modelBuffer = await fetchWithCache(modelUrl);
4609
- logger10.info("Model fetched and cached", { size: formatBytes(modelBuffer.byteLength) });
4610
- }
4611
- logger10.info("Creating ONNX session (this may take a while for large models)...");
4612
- logger10.debug("Creating ONNX session", {
4613
- size: formatBytes(modelBuffer.byteLength),
4614
- backend: this._backend
4615
- });
4616
- const sessionOptions = getSessionOptions(this._backend);
4617
- const modelData = new Uint8Array(modelBuffer);
4618
- this.session = await ort.InferenceSession.create(modelData, sessionOptions);
4619
- const loadTimeMs = performance.now() - startTime;
4620
- logger10.info("Model loaded successfully", {
4621
- backend: this._backend,
4622
- loadTimeMs: Math.round(loadTimeMs),
4623
- sampleRate: this.config.sampleRate,
4624
- inputNames: [...this.session.inputNames],
4625
- outputNames: [...this.session.outputNames]
4626
- });
4627
- span?.setAttributes({
4628
- "model.backend": this._backend,
4629
- "model.load_time_ms": loadTimeMs,
4630
- "model.cached": isCached
4631
- });
4632
- span?.end();
4633
- telemetry?.recordHistogram("omote.model.load_time", loadTimeMs, {
4634
- model: "emotion2vec",
4635
- backend: this._backend
4636
- });
4637
- return {
4638
- backend: this._backend,
4639
- loadTimeMs,
4640
- inputNames: [...this.session.inputNames],
4641
- outputNames: [...this.session.outputNames],
4642
- sampleRate: this.config.sampleRate
4643
- };
4644
- } catch (error) {
4645
- span?.endWithError(error instanceof Error ? error : new Error(String(error)));
4646
- telemetry?.incrementCounter("omote.errors.total", 1, {
4647
- model: "emotion2vec",
4648
- error_type: "load_failed"
4649
- });
4650
- throw error;
4651
- } finally {
4652
- this.isLoading = false;
4653
- }
4654
- }
4655
- /**
4656
- * Run emotion inference on audio samples
4657
- *
4658
- * @param audio - Float32Array of 16kHz audio samples
4659
- * @returns Frame-level emotion results at 50Hz
4660
- */
4661
- async infer(audio) {
4662
- if (!this.session) {
4663
- throw new Error("Model not loaded. Call load() first.");
4664
- }
4665
- return this.queueInference(audio);
4666
- }
4667
- queueInference(audio) {
4668
- const audioCopy = new Float32Array(audio);
4669
- return new Promise((resolve, reject) => {
4670
- this.inferenceQueue = this.inferenceQueue.then(async () => {
4671
- const telemetry = getTelemetry();
4672
- const span = telemetry?.startSpan("Emotion2Vec.infer", {
4673
- "inference.backend": this._backend,
4674
- "inference.audio_samples": audioCopy.length
4675
- });
4676
- try {
4677
- const startTime = performance.now();
4678
- const inputTensor = new this.ort.Tensor("float32", audioCopy, [1, audioCopy.length]);
4679
- const results = await this.session.run({ audio: inputTensor });
4680
- const logitsTensor = results["logits"];
4681
- const embeddingsTensor = results["layer_norm_25"];
4682
- if (!logitsTensor) {
4683
- throw new Error(
4684
- `Missing logits tensor from SUPERB model. Got outputs: ${Object.keys(results).join(", ")}`
4685
- );
4686
- }
4687
- const logitsData = logitsTensor.data;
4688
- const logits = new Float32Array(logitsData);
4689
- const probs = this.softmax(logits);
4690
- const probabilities = {
4691
- neutral: probs[0],
4692
- happy: probs[1],
4693
- angry: probs[2],
4694
- sad: probs[3]
4695
- };
4696
- let maxIdx = 0;
4697
- let maxProb = probs[0];
4698
- for (let i = 1; i < probs.length; i++) {
4699
- if (probs[i] > maxProb) {
4700
- maxProb = probs[i];
4701
- maxIdx = i;
4702
- }
4703
- }
4704
- const dominant = {
4705
- emotion: EMOTION2VEC_LABELS[maxIdx],
4706
- confidence: maxProb,
4707
- probabilities
4708
- };
4709
- let embeddings = [];
4710
- let numFrames = 1;
4711
- if (embeddingsTensor) {
4712
- const embeddingData = embeddingsTensor.data;
4713
- const dims = embeddingsTensor.dims;
4714
- if (dims.length === 3) {
4715
- numFrames = dims[1];
4716
- const embeddingDim = dims[2];
4717
- for (let i = 0; i < numFrames; i++) {
4718
- const start = i * embeddingDim;
4719
- embeddings.push(new Float32Array(embeddingData.slice(start, start + embeddingDim)));
4720
- }
4721
- }
4722
- }
4723
- const frames = [];
4724
- for (let i = 0; i < numFrames; i++) {
4725
- frames.push({
4726
- emotion: dominant.emotion,
4727
- confidence: dominant.confidence,
4728
- probabilities: { ...probabilities }
4729
- });
4730
- }
4731
- const inferenceTimeMs = performance.now() - startTime;
4732
- logger10.debug("Emotion inference completed", {
4733
- numFrames,
4734
- dominant: dominant.emotion,
4735
- confidence: Math.round(dominant.confidence * 100),
4736
- inferenceTimeMs: Math.round(inferenceTimeMs)
4737
- });
4738
- span?.setAttributes({
4739
- "inference.duration_ms": inferenceTimeMs,
4740
- "inference.num_frames": numFrames,
4741
- "inference.dominant_emotion": dominant.emotion
4742
- });
4743
- span?.end();
4744
- telemetry?.recordHistogram("omote.inference.latency", inferenceTimeMs, {
4745
- model: "emotion2vec",
4746
- backend: this._backend
4747
- });
4748
- telemetry?.incrementCounter("omote.inference.total", 1, {
4749
- model: "emotion2vec",
4750
- backend: this._backend,
4751
- status: "success"
4752
- });
4753
- resolve({
4754
- frames,
4755
- dominant,
4756
- embeddings,
4757
- logits,
4758
- inferenceTimeMs
4759
- });
4760
- } catch (err) {
4761
- span?.endWithError(err instanceof Error ? err : new Error(String(err)));
4762
- telemetry?.incrementCounter("omote.inference.total", 1, {
4763
- model: "emotion2vec",
4764
- backend: this._backend,
4765
- status: "error"
4766
- });
4767
- reject(err);
4768
- }
4769
- });
4770
- });
4771
- }
4772
- /**
4773
- * Apply softmax to convert logits to probabilities
4774
- */
4775
- softmax(logits) {
4776
- let max = logits[0];
4777
- for (let i = 1; i < logits.length; i++) {
4778
- if (logits[i] > max) max = logits[i];
4779
- }
4780
- const exp = new Float32Array(logits.length);
4781
- let sum = 0;
4782
- for (let i = 0; i < logits.length; i++) {
4783
- exp[i] = Math.exp(logits[i] - max);
4784
- sum += exp[i];
4785
- }
4786
- const probs = new Float32Array(logits.length);
4787
- for (let i = 0; i < logits.length; i++) {
4788
- probs[i] = exp[i] / sum;
4789
- }
4790
- return probs;
4791
- }
4792
- /**
4793
- * Dispose of the model and free resources
4794
- */
4795
- async dispose() {
4796
- if (this.session) {
4797
- await this.session.release();
4798
- this.session = null;
4799
- }
4800
- }
4801
- };
4802
- /**
4803
- * Check if WebGPU is available and working
4804
- * (iOS returns false even if navigator.gpu exists due to ONNX Runtime bugs)
4805
- */
4806
- Emotion2VecInference.isWebGPUAvailable = isWebGPUAvailable;
4807
-
4808
4813
  // src/inference/SafariSpeechRecognition.ts
4809
- var logger11 = createLogger("SafariSpeech");
4814
+ var logger10 = createLogger("SafariSpeech");
4810
4815
  var SafariSpeechRecognition = class _SafariSpeechRecognition {
4811
4816
  constructor(config = {}) {
4812
4817
  this.recognition = null;
@@ -4825,7 +4830,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
4825
4830
  interimResults: config.interimResults ?? true,
4826
4831
  maxAlternatives: config.maxAlternatives ?? 1
4827
4832
  };
4828
- logger11.debug("SafariSpeechRecognition created", {
4833
+ logger10.debug("SafariSpeechRecognition created", {
4829
4834
  language: this.config.language,
4830
4835
  continuous: this.config.continuous
4831
4836
  });
@@ -4886,7 +4891,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
4886
4891
  */
4887
4892
  async start() {
4888
4893
  if (this.isListening) {
4889
- logger11.warn("Already listening");
4894
+ logger10.warn("Already listening");
4890
4895
  return;
4891
4896
  }
4892
4897
  if (!_SafariSpeechRecognition.isAvailable()) {
@@ -4916,7 +4921,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
4916
4921
  this.isListening = true;
4917
4922
  this.startTime = performance.now();
4918
4923
  this.accumulatedText = "";
4919
- logger11.info("Speech recognition started", {
4924
+ logger10.info("Speech recognition started", {
4920
4925
  language: this.config.language
4921
4926
  });
4922
4927
  span?.end();
@@ -4931,7 +4936,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
4931
4936
  */
4932
4937
  async stop() {
4933
4938
  if (!this.isListening || !this.recognition) {
4934
- logger11.warn("Not currently listening");
4939
+ logger10.warn("Not currently listening");
4935
4940
  return {
4936
4941
  text: this.accumulatedText,
4937
4942
  language: this.config.language,
@@ -4960,7 +4965,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
4960
4965
  if (this.recognition && this.isListening) {
4961
4966
  this.recognition.abort();
4962
4967
  this.isListening = false;
4963
- logger11.info("Speech recognition aborted");
4968
+ logger10.info("Speech recognition aborted");
4964
4969
  }
4965
4970
  }
4966
4971
  /**
@@ -4991,7 +4996,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
4991
4996
  this.isListening = false;
4992
4997
  this.resultCallbacks = [];
4993
4998
  this.errorCallbacks = [];
4994
- logger11.debug("SafariSpeechRecognition disposed");
4999
+ logger10.debug("SafariSpeechRecognition disposed");
4995
5000
  }
4996
5001
  /**
4997
5002
  * Set up event handlers for the recognition instance
@@ -5019,7 +5024,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
5019
5024
  confidence: alternative.confidence
5020
5025
  };
5021
5026
  this.emitResult(speechResult);
5022
- logger11.trace("Speech result", {
5027
+ logger10.trace("Speech result", {
5023
5028
  text: text.substring(0, 50),
5024
5029
  isFinal,
5025
5030
  confidence: alternative.confidence
@@ -5029,12 +5034,12 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
5029
5034
  span?.end();
5030
5035
  } catch (error) {
5031
5036
  span?.endWithError(error instanceof Error ? error : new Error(String(error)));
5032
- logger11.error("Error processing speech result", { error });
5037
+ logger10.error("Error processing speech result", { error });
5033
5038
  }
5034
5039
  };
5035
5040
  this.recognition.onerror = (event) => {
5036
5041
  const error = new Error(`Speech recognition error: ${event.error} - ${event.message}`);
5037
- logger11.error("Speech recognition error", { error: event.error, message: event.message });
5042
+ logger10.error("Speech recognition error", { error: event.error, message: event.message });
5038
5043
  this.emitError(error);
5039
5044
  if (this.stopRejecter) {
5040
5045
  this.stopRejecter(error);
@@ -5044,7 +5049,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
5044
5049
  };
5045
5050
  this.recognition.onend = () => {
5046
5051
  this.isListening = false;
5047
- logger11.info("Speech recognition ended", {
5052
+ logger10.info("Speech recognition ended", {
5048
5053
  totalText: this.accumulatedText.length,
5049
5054
  durationMs: performance.now() - this.startTime
5050
5055
  });
@@ -5061,13 +5066,13 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
5061
5066
  }
5062
5067
  };
5063
5068
  this.recognition.onstart = () => {
5064
- logger11.debug("Speech recognition started by browser");
5069
+ logger10.debug("Speech recognition started by browser");
5065
5070
  };
5066
5071
  this.recognition.onspeechstart = () => {
5067
- logger11.debug("Speech detected");
5072
+ logger10.debug("Speech detected");
5068
5073
  };
5069
5074
  this.recognition.onspeechend = () => {
5070
- logger11.debug("Speech ended");
5075
+ logger10.debug("Speech ended");
5071
5076
  };
5072
5077
  }
5073
5078
  /**
@@ -5078,7 +5083,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
5078
5083
  try {
5079
5084
  callback(result);
5080
5085
  } catch (error) {
5081
- logger11.error("Error in result callback", { error });
5086
+ logger10.error("Error in result callback", { error });
5082
5087
  }
5083
5088
  }
5084
5089
  }
@@ -5090,7 +5095,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
5090
5095
  try {
5091
5096
  callback(error);
5092
5097
  } catch (callbackError) {
5093
- logger11.error("Error in error callback", { error: callbackError });
5098
+ logger10.error("Error in error callback", { error: callbackError });
5094
5099
  }
5095
5100
  }
5096
5101
  }
@@ -5264,7 +5269,7 @@ var AgentCoreAdapter = class extends EventEmitter {
5264
5269
  this._sessionId = null;
5265
5270
  this._isConnected = false;
5266
5271
  // Sub-components
5267
- this.whisper = null;
5272
+ this.asr = null;
5268
5273
  this.vad = null;
5269
5274
  this.lam = null;
5270
5275
  this.pipeline = null;
@@ -5303,7 +5308,7 @@ var AgentCoreAdapter = class extends EventEmitter {
5303
5308
  try {
5304
5309
  const authToken = await this.getAuthToken(config.tenant);
5305
5310
  await Promise.all([
5306
- this.initWhisper(),
5311
+ this.initASR(),
5307
5312
  this.initLAM()
5308
5313
  ]);
5309
5314
  await this.connectWebSocket(authToken, config);
@@ -5333,7 +5338,7 @@ var AgentCoreAdapter = class extends EventEmitter {
5333
5338
  this.ws = null;
5334
5339
  }
5335
5340
  await Promise.all([
5336
- this.whisper?.dispose(),
5341
+ this.asr?.dispose(),
5337
5342
  this.vad?.dispose(),
5338
5343
  this.lam?.dispose()
5339
5344
  ]);
@@ -5465,16 +5470,15 @@ var AgentCoreAdapter = class extends EventEmitter {
5465
5470
  });
5466
5471
  return token;
5467
5472
  }
5468
- async initWhisper() {
5473
+ async initASR() {
5469
5474
  await Promise.all([
5470
- // Whisper ASR
5475
+ // SenseVoice ASR
5471
5476
  (async () => {
5472
- this.whisper = new WhisperInference({
5473
- model: "tiny",
5474
- device: "auto",
5475
- language: "en"
5477
+ this.asr = new SenseVoiceInference({
5478
+ modelUrl: "/models/sensevoice/model.int8.onnx",
5479
+ language: "auto"
5476
5480
  });
5477
- await this.whisper.load();
5481
+ await this.asr.load();
5478
5482
  })(),
5479
5483
  // Silero VAD for accurate voice activity detection
5480
5484
  (async () => {
@@ -5660,17 +5664,17 @@ var AgentCoreAdapter = class extends EventEmitter {
5660
5664
  console.debug("[AgentCore] Skipping silent audio", { rms, samples: audio.length });
5661
5665
  return;
5662
5666
  }
5663
- if (this.whisper) {
5667
+ if (this.asr) {
5664
5668
  this.setState("listening");
5665
5669
  this.emit("user.speech.start", { timestamp: Date.now() });
5666
- this.whisper.transcribe(audio).then((result) => {
5670
+ this.asr.transcribe(audio).then((result) => {
5667
5671
  this.emit("user.transcript.final", {
5668
5672
  text: result.text,
5669
5673
  confidence: 1
5670
5674
  });
5671
5675
  this.emit("user.speech.end", { timestamp: Date.now(), durationMs: result.inferenceTimeMs });
5672
5676
  const cleanText = result.text.trim();
5673
- if (cleanText && !cleanText.includes("[BLANK_AUDIO]")) {
5677
+ if (cleanText) {
5674
5678
  this.sendText(cleanText).catch((error) => {
5675
5679
  console.error("[AgentCore] Send text error:", error);
5676
5680
  });
@@ -6484,228 +6488,6 @@ var InterruptionHandler = class extends EventEmitter {
6484
6488
  }
6485
6489
  };
6486
6490
 
6487
- // src/cache/huggingFaceCDN.ts
6488
- var HF_CDN_TEST_URL = "https://huggingface.co/Xenova/whisper-tiny/resolve/main/config.json";
6489
- function parseHuggingFaceUrl(url) {
6490
- const pattern = /^https:\/\/huggingface\.co\/([^/]+)\/([^/]+)\/resolve\/([^/]+)\/(.+)$/;
6491
- const match = url.match(pattern);
6492
- if (!match) {
6493
- return null;
6494
- }
6495
- return {
6496
- org: match[1],
6497
- model: match[2],
6498
- branch: match[3],
6499
- file: match[4]
6500
- };
6501
- }
6502
- async function isHuggingFaceCDNReachable(testUrl = HF_CDN_TEST_URL) {
6503
- try {
6504
- const response = await fetch(testUrl, {
6505
- method: "HEAD",
6506
- cache: "no-store"
6507
- // Don't use cached response for reachability check
6508
- });
6509
- return response.ok;
6510
- } catch {
6511
- return false;
6512
- }
6513
- }
6514
-
6515
- // src/utils/transformersCacheClear.ts
6516
- var logger12 = createLogger("TransformersCache");
6517
- async function clearTransformersCache(options) {
6518
- const verbose = options?.verbose ?? true;
6519
- const additionalPatterns = options?.additionalPatterns ?? [];
6520
- if (!("caches" in window)) {
6521
- logger12.warn("Cache API not available in this environment");
6522
- return [];
6523
- }
6524
- try {
6525
- const cacheNames = await caches.keys();
6526
- const deletedCaches = [];
6527
- const patterns = [
6528
- "transformers",
6529
- "huggingface",
6530
- "onnx",
6531
- ...additionalPatterns
6532
- ];
6533
- for (const cacheName of cacheNames) {
6534
- const shouldDelete = patterns.some(
6535
- (pattern) => cacheName.toLowerCase().includes(pattern.toLowerCase())
6536
- );
6537
- if (shouldDelete) {
6538
- if (verbose) {
6539
- logger12.info("Deleting cache", { cacheName });
6540
- }
6541
- const deleted = await caches.delete(cacheName);
6542
- if (deleted) {
6543
- deletedCaches.push(cacheName);
6544
- } else if (verbose) {
6545
- logger12.warn("Failed to delete cache", { cacheName });
6546
- }
6547
- }
6548
- }
6549
- if (verbose) {
6550
- logger12.info("Cache clearing complete", {
6551
- totalCaches: cacheNames.length,
6552
- deletedCount: deletedCaches.length,
6553
- deletedCaches
6554
- });
6555
- }
6556
- return deletedCaches;
6557
- } catch (error) {
6558
- logger12.error("Error clearing caches", { error });
6559
- throw error;
6560
- }
6561
- }
6562
- async function clearSpecificCache(cacheName) {
6563
- if (!("caches" in window)) {
6564
- logger12.warn("Cache API not available in this environment");
6565
- return false;
6566
- }
6567
- try {
6568
- const deleted = await caches.delete(cacheName);
6569
- logger12.info("Cache deletion attempt", { cacheName, deleted });
6570
- return deleted;
6571
- } catch (error) {
6572
- logger12.error("Error deleting cache", { cacheName, error });
6573
- return false;
6574
- }
6575
- }
6576
- async function listCaches() {
6577
- if (!("caches" in window)) {
6578
- logger12.warn("Cache API not available in this environment");
6579
- return [];
6580
- }
6581
- try {
6582
- const cacheNames = await caches.keys();
6583
- logger12.debug("Available caches", { cacheNames });
6584
- return cacheNames;
6585
- } catch (error) {
6586
- logger12.error("Error listing caches", { error });
6587
- return [];
6588
- }
6589
- }
6590
- async function validateCachedResponse(cacheName, requestUrl) {
6591
- if (!("caches" in window)) {
6592
- return {
6593
- exists: false,
6594
- valid: false,
6595
- contentType: null,
6596
- isHtml: false,
6597
- reason: "Cache API not available"
6598
- };
6599
- }
6600
- try {
6601
- const cache = await caches.open(cacheName);
6602
- const response = await cache.match(requestUrl);
6603
- if (!response) {
6604
- return {
6605
- exists: false,
6606
- valid: false,
6607
- contentType: null,
6608
- isHtml: false,
6609
- reason: "Not in cache"
6610
- };
6611
- }
6612
- const contentType = response.headers.get("content-type");
6613
- const isHtml = contentType?.includes("text/html") || contentType?.includes("text/plain");
6614
- const clonedResponse = response.clone();
6615
- const text = await clonedResponse.text();
6616
- const looksLikeHtml = text.trim().startsWith("<") || text.includes("<!DOCTYPE");
6617
- const valid = Boolean(
6618
- response.status === 200 && !isHtml && !looksLikeHtml && contentType && (contentType.includes("application/json") || contentType.includes("application/octet-stream") || contentType.includes("binary"))
6619
- );
6620
- return {
6621
- exists: true,
6622
- valid,
6623
- contentType,
6624
- isHtml: isHtml || looksLikeHtml,
6625
- reason: valid ? "Valid response" : `Invalid: status=${response.status}, contentType=${contentType}, isHtml=${isHtml || looksLikeHtml}`
6626
- };
6627
- } catch (error) {
6628
- logger12.error("Error validating cached response", { cacheName, requestUrl, error });
6629
- return {
6630
- exists: false,
6631
- valid: false,
6632
- contentType: null,
6633
- isHtml: false,
6634
- reason: `Error: ${error}`
6635
- };
6636
- }
6637
- }
6638
- async function scanForInvalidCaches() {
6639
- if (!("caches" in window)) {
6640
- return { totalCaches: 0, scannedEntries: 0, invalidEntries: [] };
6641
- }
6642
- const invalidEntries = [];
6643
- let scannedEntries = 0;
6644
- try {
6645
- const cacheNames = await caches.keys();
6646
- for (const cacheName of cacheNames) {
6647
- if (!cacheName.toLowerCase().includes("transformers")) {
6648
- continue;
6649
- }
6650
- const cache = await caches.open(cacheName);
6651
- const requests = await cache.keys();
6652
- for (const request of requests) {
6653
- scannedEntries++;
6654
- const url = request.url;
6655
- const validation = await validateCachedResponse(cacheName, url);
6656
- if (validation.exists && !validation.valid) {
6657
- invalidEntries.push({
6658
- cacheName,
6659
- url,
6660
- reason: validation.reason || "Unknown"
6661
- });
6662
- }
6663
- }
6664
- }
6665
- logger12.info("Cache scan complete", {
6666
- totalCaches: cacheNames.length,
6667
- scannedEntries,
6668
- invalidCount: invalidEntries.length
6669
- });
6670
- return {
6671
- totalCaches: cacheNames.length,
6672
- scannedEntries,
6673
- invalidEntries
6674
- };
6675
- } catch (error) {
6676
- logger12.error("Error scanning caches", { error });
6677
- throw error;
6678
- }
6679
- }
6680
- async function nukeBrowserCaches(preventRecreation = false) {
6681
- if (!("caches" in window)) {
6682
- logger12.warn("Cache API not available in this environment");
6683
- return 0;
6684
- }
6685
- try {
6686
- const cacheNames = await caches.keys();
6687
- let deletedCount = 0;
6688
- for (const cacheName of cacheNames) {
6689
- const deleted = await caches.delete(cacheName);
6690
- if (deleted) {
6691
- deletedCount++;
6692
- }
6693
- }
6694
- logger12.info("All browser caches cleared", {
6695
- totalDeleted: deletedCount
6696
- });
6697
- if (preventRecreation) {
6698
- const { env } = await import("./transformers.web-T5LWC34T.mjs");
6699
- env.useBrowserCache = false;
6700
- logger12.warn("Browser cache creation disabled (env.useBrowserCache = false)");
6701
- }
6702
- return deletedCount;
6703
- } catch (error) {
6704
- logger12.error("Error nuking caches", { error });
6705
- throw error;
6706
- }
6707
- }
6708
-
6709
6491
  // src/animation/types.ts
6710
6492
  var DEFAULT_ANIMATION_CONFIG = {
6711
6493
  initialState: "idle",
@@ -7245,7 +7027,6 @@ export {
7245
7027
  EmotionPresets,
7246
7028
  EmphasisDetector,
7247
7029
  EventEmitter,
7248
- HF_CDN_TEST_URL,
7249
7030
  INFERENCE_LATENCY_BUCKETS,
7250
7031
  InterruptionHandler,
7251
7032
  LAMPipeline,
@@ -7259,6 +7040,7 @@ export {
7259
7040
  OmoteTelemetry,
7260
7041
  RingBuffer,
7261
7042
  SafariSpeechRecognition,
7043
+ SenseVoiceInference,
7262
7044
  SileroVADInference,
7263
7045
  SileroVADWorker,
7264
7046
  SyncedAudioPipeline,
@@ -7266,12 +7048,12 @@ export {
7266
7048
  WAV2ARKIT_BLENDSHAPES,
7267
7049
  Wav2ArkitCpuInference,
7268
7050
  Wav2Vec2Inference,
7269
- WhisperInference,
7051
+ applyCMVN,
7052
+ applyLFR,
7270
7053
  blendEmotions,
7271
7054
  calculatePeak,
7272
7055
  calculateRMS,
7273
- clearSpecificCache,
7274
- clearTransformersCache,
7056
+ computeKaldiFbank,
7275
7057
  configureCacheLimit,
7276
7058
  configureLogging,
7277
7059
  configureTelemetry,
@@ -7280,6 +7062,7 @@ export {
7280
7062
  createLogger,
7281
7063
  createSessionWithFallback,
7282
7064
  createSileroVAD,
7065
+ ctcGreedyDecode,
7283
7066
  fetchWithCache,
7284
7067
  formatBytes,
7285
7068
  getCacheConfig,
@@ -7296,7 +7079,6 @@ export {
7296
7079
  getTelemetry,
7297
7080
  hasWebGPUApi,
7298
7081
  isAndroid,
7299
- isHuggingFaceCDNReachable,
7300
7082
  isIOS,
7301
7083
  isIOSSafari,
7302
7084
  isMobile,
@@ -7305,16 +7087,16 @@ export {
7305
7087
  isSpeechRecognitionAvailable,
7306
7088
  isWebGPUAvailable,
7307
7089
  lerpEmotion,
7308
- listCaches,
7309
7090
  noopLogger,
7310
- nukeBrowserCaches,
7311
- parseHuggingFaceUrl,
7091
+ parseCMVNFromMetadata,
7092
+ parseTokensFile,
7312
7093
  preloadModels,
7313
7094
  preloadOnnxRuntime,
7314
7095
  remapWav2ArkitToLam,
7315
7096
  resetLoggingConfig,
7316
7097
  resolveBackend,
7317
- scanForInvalidCaches,
7098
+ resolveLanguageId,
7099
+ resolveTextNormId,
7318
7100
  setLogLevel,
7319
7101
  setLoggingEnabled,
7320
7102
  shouldEnableWasmProxy,
@@ -7322,7 +7104,6 @@ export {
7322
7104
  shouldUseNativeASR,
7323
7105
  shouldUseServerLipSync,
7324
7106
  supportsVADWorker,
7325
- symmetrizeBlendshapes,
7326
- validateCachedResponse
7107
+ symmetrizeBlendshapes
7327
7108
  };
7328
7109
  //# sourceMappingURL=index.mjs.map