@omote/core 0.3.25 → 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -12,11 +12,6 @@ import {
12
12
  setLogLevel,
13
13
  setLoggingEnabled
14
14
  } from "./chunk-ESU52TDS.mjs";
15
- import {
16
- __webpack_exports__env,
17
- __webpack_exports__pipeline
18
- } from "./chunk-B6TIE56N.mjs";
19
- import "./chunk-NSSMTXJJ.mjs";
20
15
 
21
16
  // src/audio/MicrophoneCapture.ts
22
17
  var MicrophoneCapture = class {
@@ -2274,6 +2269,14 @@ function getSessionOptions(backend) {
2274
2269
  graphOptimizationLevel: "all"
2275
2270
  };
2276
2271
  }
2272
+ if (isIOS()) {
2273
+ return {
2274
+ executionProviders: ["wasm"],
2275
+ graphOptimizationLevel: "basic",
2276
+ enableCpuMemArena: false,
2277
+ enableMemPattern: false
2278
+ };
2279
+ }
2277
2280
  return {
2278
2281
  executionProviders: ["wasm"],
2279
2282
  graphOptimizationLevel: "all"
@@ -2549,77 +2552,108 @@ var Wav2Vec2Inference = class {
2549
2552
  this.ort = ort;
2550
2553
  this._backend = backend;
2551
2554
  logger2.info("ONNX Runtime loaded", { backend: this._backend });
2552
- const cache = getModelCache();
2553
2555
  const modelUrl = this.config.modelUrl;
2554
- const isCached = await cache.has(modelUrl);
2555
- let modelBuffer;
2556
- if (isCached) {
2557
- logger2.debug("Loading model from cache", { modelUrl });
2558
- modelBuffer = await cache.get(modelUrl);
2559
- if (!modelBuffer) {
2560
- logger2.warn("Cache corruption detected, clearing and retrying", { modelUrl });
2561
- await cache.delete(modelUrl);
2562
- logger2.info("Corrupted cache entry deleted, fetching fresh model", { modelUrl });
2563
- modelBuffer = await fetchWithCache(modelUrl);
2556
+ const dataUrl = this.config.externalDataUrl !== false ? typeof this.config.externalDataUrl === "string" ? this.config.externalDataUrl : `${modelUrl}.data` : null;
2557
+ const sessionOptions = getSessionOptions(this._backend);
2558
+ let isCached = false;
2559
+ if (isIOS()) {
2560
+ logger2.info("iOS: passing model URLs directly to ORT (low-memory path)", {
2561
+ modelUrl,
2562
+ dataUrl
2563
+ });
2564
+ if (dataUrl) {
2565
+ const dataFilename = dataUrl.split("/").pop();
2566
+ logger2.info("iOS: setting externalData", { dataFilename, dataUrl });
2567
+ sessionOptions.externalData = [{
2568
+ path: dataFilename,
2569
+ data: dataUrl
2570
+ // URL string — ORT fetches directly into WASM
2571
+ }];
2564
2572
  }
2565
- } else {
2566
- logger2.debug("Fetching and caching model", { modelUrl });
2567
- modelBuffer = await fetchWithCache(modelUrl);
2568
- }
2569
- if (!modelBuffer) {
2570
- const errorMsg = `Failed to load model: ${modelUrl}. Model buffer is null or undefined even after retry.`;
2571
- logger2.error(errorMsg, { modelUrl, isCached });
2572
- throw new Error(errorMsg);
2573
- }
2574
- let externalDataBuffer = null;
2575
- if (this.config.externalDataUrl !== false) {
2576
- const dataUrl = typeof this.config.externalDataUrl === "string" ? this.config.externalDataUrl : `${modelUrl}.data`;
2573
+ logger2.info("iOS: calling InferenceSession.create() with URL string", {
2574
+ modelUrl,
2575
+ sessionOptions: JSON.stringify(
2576
+ sessionOptions,
2577
+ (_, v) => typeof v === "string" && v.length > 100 ? v.slice(0, 100) + "..." : v
2578
+ )
2579
+ });
2577
2580
  try {
2578
- const isDataCached = await cache.has(dataUrl);
2579
- if (isDataCached) {
2580
- logger2.debug("Loading external data from cache", { dataUrl });
2581
- externalDataBuffer = await cache.get(dataUrl);
2582
- if (!externalDataBuffer) {
2583
- logger2.warn("Cache corruption for external data, retrying", { dataUrl });
2584
- await cache.delete(dataUrl);
2581
+ this.session = await this.ort.InferenceSession.create(modelUrl, sessionOptions);
2582
+ } catch (sessionErr) {
2583
+ logger2.error("iOS: InferenceSession.create() failed", {
2584
+ error: sessionErr instanceof Error ? sessionErr.message : String(sessionErr),
2585
+ errorType: sessionErr?.constructor?.name,
2586
+ stack: sessionErr instanceof Error ? sessionErr.stack : void 0
2587
+ });
2588
+ throw sessionErr;
2589
+ }
2590
+ logger2.info("iOS: session created successfully", {
2591
+ inputNames: this.session.inputNames,
2592
+ outputNames: this.session.outputNames
2593
+ });
2594
+ } else {
2595
+ const cache = getModelCache();
2596
+ isCached = await cache.has(modelUrl);
2597
+ let modelBuffer;
2598
+ if (isCached) {
2599
+ logger2.debug("Loading model from cache", { modelUrl });
2600
+ modelBuffer = await cache.get(modelUrl);
2601
+ if (!modelBuffer) {
2602
+ logger2.warn("Cache corruption detected, clearing and retrying", { modelUrl });
2603
+ await cache.delete(modelUrl);
2604
+ modelBuffer = await fetchWithCache(modelUrl);
2605
+ }
2606
+ } else {
2607
+ logger2.debug("Fetching and caching model", { modelUrl });
2608
+ modelBuffer = await fetchWithCache(modelUrl);
2609
+ }
2610
+ if (!modelBuffer) {
2611
+ throw new Error(`Failed to load model: ${modelUrl}`);
2612
+ }
2613
+ let externalDataBuffer = null;
2614
+ if (dataUrl) {
2615
+ try {
2616
+ const isDataCached = await cache.has(dataUrl);
2617
+ if (isDataCached) {
2618
+ logger2.debug("Loading external data from cache", { dataUrl });
2619
+ externalDataBuffer = await cache.get(dataUrl);
2620
+ if (!externalDataBuffer) {
2621
+ logger2.warn("Cache corruption for external data, retrying", { dataUrl });
2622
+ await cache.delete(dataUrl);
2623
+ externalDataBuffer = await fetchWithCache(dataUrl);
2624
+ }
2625
+ } else {
2626
+ logger2.info("Fetching external model data", {
2627
+ dataUrl,
2628
+ note: "This may be a large download (383MB+)"
2629
+ });
2585
2630
  externalDataBuffer = await fetchWithCache(dataUrl);
2586
2631
  }
2587
- } else {
2588
- logger2.info("Fetching external model data", {
2632
+ logger2.info("External data loaded", {
2633
+ size: formatBytes(externalDataBuffer.byteLength)
2634
+ });
2635
+ } catch (err) {
2636
+ logger2.debug("No external data file found (single-file model)", {
2589
2637
  dataUrl,
2590
- note: "This may be a large download (383MB+)"
2638
+ error: err.message
2591
2639
  });
2592
- externalDataBuffer = await fetchWithCache(dataUrl);
2593
2640
  }
2594
- logger2.info("External data loaded", {
2595
- size: formatBytes(externalDataBuffer.byteLength)
2596
- });
2597
- } catch (err) {
2598
- logger2.debug("No external data file found (single-file model)", {
2599
- dataUrl,
2600
- error: err.message
2601
- });
2602
2641
  }
2642
+ logger2.debug("Creating ONNX session", {
2643
+ graphSize: formatBytes(modelBuffer.byteLength),
2644
+ externalDataSize: externalDataBuffer ? formatBytes(externalDataBuffer.byteLength) : "none",
2645
+ backend: this._backend
2646
+ });
2647
+ if (externalDataBuffer) {
2648
+ const dataFilename = dataUrl.split("/").pop();
2649
+ sessionOptions.externalData = [{
2650
+ path: dataFilename,
2651
+ data: new Uint8Array(externalDataBuffer)
2652
+ }];
2653
+ }
2654
+ const modelData = new Uint8Array(modelBuffer);
2655
+ this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
2603
2656
  }
2604
- logger2.debug("Creating ONNX session", {
2605
- graphSize: formatBytes(modelBuffer.byteLength),
2606
- externalDataSize: externalDataBuffer ? formatBytes(externalDataBuffer.byteLength) : "none",
2607
- backend: this._backend
2608
- });
2609
- const sessionOptions = getSessionOptions(this._backend);
2610
- if (externalDataBuffer) {
2611
- const dataFilename = (typeof this.config.externalDataUrl === "string" ? this.config.externalDataUrl : `${modelUrl}.data`).split("/").pop();
2612
- sessionOptions.externalData = [{
2613
- path: dataFilename,
2614
- data: new Uint8Array(externalDataBuffer)
2615
- }];
2616
- }
2617
- logger2.info("Creating session with execution provider", {
2618
- executionProvider: this._backend,
2619
- hasExternalData: !!externalDataBuffer
2620
- });
2621
- const modelData = new Uint8Array(modelBuffer);
2622
- this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
2623
2657
  logger2.info("ONNX session created successfully", {
2624
2658
  executionProvider: this._backend,
2625
2659
  backend: this._backend
@@ -2634,7 +2668,7 @@ var Wav2Vec2Inference = class {
2634
2668
  span?.setAttributes({
2635
2669
  "model.backend": this._backend,
2636
2670
  "model.load_time_ms": loadTimeMs,
2637
- "model.cached": isCached
2671
+ "model.cached": !isIOS() && isCached
2638
2672
  });
2639
2673
  span?.end();
2640
2674
  telemetry?.recordHistogram("omote.model.load_time", loadTimeMs, {
@@ -2644,12 +2678,23 @@ var Wav2Vec2Inference = class {
2644
2678
  logger2.debug("Running warmup inference to initialize GPU context");
2645
2679
  const warmupStart = performance.now();
2646
2680
  const silentAudio = new Float32Array(16e3);
2647
- await this.infer(silentAudio, 0);
2681
+ const WARMUP_TIMEOUT_MS = 15e3;
2682
+ const warmupResult = await Promise.race([
2683
+ this.infer(silentAudio, 0).then(() => "ok"),
2684
+ new Promise((r) => setTimeout(() => r("timeout"), WARMUP_TIMEOUT_MS))
2685
+ ]);
2648
2686
  const warmupTimeMs = performance.now() - warmupStart;
2649
- logger2.info("Warmup inference complete", {
2650
- warmupTimeMs: Math.round(warmupTimeMs),
2651
- backend: this._backend
2652
- });
2687
+ if (warmupResult === "timeout") {
2688
+ logger2.warn("Warmup inference timed out \u2014 GPU may be unresponsive. Continuing without warmup.", {
2689
+ timeoutMs: WARMUP_TIMEOUT_MS,
2690
+ backend: this._backend
2691
+ });
2692
+ } else {
2693
+ logger2.info("Warmup inference complete", {
2694
+ warmupTimeMs: Math.round(warmupTimeMs),
2695
+ backend: this._backend
2696
+ });
2697
+ }
2653
2698
  telemetry?.recordHistogram("omote.model.warmup_time", warmupTimeMs, {
2654
2699
  model: "wav2vec2",
2655
2700
  backend: this._backend
@@ -2837,334 +2882,316 @@ LAM_BLENDSHAPES.forEach((name, index) => {
2837
2882
  });
2838
2883
  var UPPER_FACE_SET = new Set(UPPER_FACE_BLENDSHAPES);
2839
2884
 
2840
- // src/inference/WhisperInference.ts
2841
- var logger4 = createLogger("Whisper");
2842
- var WhisperInference = class _WhisperInference {
2843
- constructor(config = {}) {
2844
- this.pipeline = null;
2845
- this.currentModel = null;
2846
- this.isLoading = false;
2847
- this.actualBackend = "unknown";
2848
- this.config = {
2849
- model: config.model || "tiny",
2850
- multilingual: config.multilingual || false,
2851
- language: config.language || "en",
2852
- task: config.task || "transcribe",
2853
- dtype: config.dtype || "q8",
2854
- device: config.device || "auto",
2855
- localModelPath: config.localModelPath,
2856
- token: config.token,
2857
- suppressNonSpeech: config.suppressNonSpeech !== false
2858
- // Default true
2859
- };
2860
- }
2861
- /**
2862
- * Check if WebGPU is available in this browser
2863
- */
2864
- static async isWebGPUAvailable() {
2865
- return "gpu" in navigator;
2866
- }
2867
- /**
2868
- * Load the Whisper model pipeline
2869
- */
2870
- async load(onProgress) {
2871
- if (this.isLoading) {
2872
- logger4.debug("Already loading model, waiting...");
2873
- while (this.isLoading) {
2874
- await new Promise((resolve) => setTimeout(resolve, 100));
2875
- }
2876
- return;
2877
- }
2878
- const modelName = this.getModelName();
2879
- if (this.pipeline !== null && this.currentModel === modelName) {
2880
- logger4.debug("Model already loaded", { model: modelName });
2881
- return;
2882
- }
2883
- this.isLoading = true;
2884
- const telemetry = getTelemetry();
2885
- const span = telemetry?.startSpan("whisper.load", {
2886
- "whisper.model": modelName,
2887
- "whisper.dtype": this.config.dtype,
2888
- "whisper.device": this.config.device
2889
- });
2890
- try {
2891
- const loadStart = performance.now();
2892
- logger4.info("Loading model", {
2893
- model: modelName,
2894
- dtype: this.config.dtype,
2895
- device: this.config.device,
2896
- multilingual: this.config.multilingual
2897
- });
2898
- if (this.pipeline !== null && this.currentModel !== modelName) {
2899
- logger4.debug("Disposing old model", { oldModel: this.currentModel });
2900
- await this.pipeline.dispose();
2901
- this.pipeline = null;
2902
- }
2903
- const hasWebGPU = await _WhisperInference.isWebGPUAvailable();
2904
- const device = this.config.device === "auto" ? hasWebGPU ? "webgpu" : "wasm" : this.config.device;
2905
- logger4.info("Creating pipeline", { device, hasWebGPU });
2906
- __webpack_exports__env.allowLocalModels = false;
2907
- __webpack_exports__env.allowRemoteModels = true;
2908
- __webpack_exports__env.useBrowserCache = false;
2909
- __webpack_exports__env.useCustomCache = false;
2910
- __webpack_exports__env.useWasmCache = false;
2911
- if (__webpack_exports__env.backends.onnx.wasm) {
2912
- __webpack_exports__env.backends.onnx.wasm.proxy = false;
2913
- __webpack_exports__env.backends.onnx.wasm.numThreads = 1;
2914
- }
2915
- logger4.info("Configured transformers.js env", {
2916
- allowLocalModels: __webpack_exports__env.allowLocalModels,
2917
- useBrowserCache: __webpack_exports__env.useBrowserCache,
2918
- useWasmCache: __webpack_exports__env.useWasmCache
2919
- });
2920
- const pipelineOptions = {
2921
- dtype: this.config.dtype,
2922
- device,
2923
- progress_callback: onProgress,
2924
- // For medium models, use no_attentions revision to save memory
2925
- revision: modelName.includes("whisper-medium") ? "no_attentions" : "main",
2926
- // Pass HuggingFace token to bypass rate limits
2927
- ...this.config.token && { token: this.config.token }
2928
- };
2929
- if (device === "webgpu") {
2930
- pipelineOptions.session_options = {
2931
- executionProviders: ["webgpu"]
2932
- };
2933
- logger4.info("Forcing WebGPU execution providers");
2885
+ // src/inference/kaldiFbank.ts
2886
+ function fft(re, im) {
2887
+ const n = re.length;
2888
+ for (let i = 1, j = 0; i < n; i++) {
2889
+ let bit = n >> 1;
2890
+ while (j & bit) {
2891
+ j ^= bit;
2892
+ bit >>= 1;
2893
+ }
2894
+ j ^= bit;
2895
+ if (i < j) {
2896
+ let tmp = re[i];
2897
+ re[i] = re[j];
2898
+ re[j] = tmp;
2899
+ tmp = im[i];
2900
+ im[i] = im[j];
2901
+ im[j] = tmp;
2902
+ }
2903
+ }
2904
+ for (let len = 2; len <= n; len *= 2) {
2905
+ const halfLen = len / 2;
2906
+ const angle = -2 * Math.PI / len;
2907
+ const wRe = Math.cos(angle);
2908
+ const wIm = Math.sin(angle);
2909
+ for (let i = 0; i < n; i += len) {
2910
+ let curRe = 1;
2911
+ let curIm = 0;
2912
+ for (let j = 0; j < halfLen; j++) {
2913
+ const a = i + j;
2914
+ const b = a + halfLen;
2915
+ const tRe = curRe * re[b] - curIm * im[b];
2916
+ const tIm = curRe * im[b] + curIm * re[b];
2917
+ re[b] = re[a] - tRe;
2918
+ im[b] = im[a] - tIm;
2919
+ re[a] += tRe;
2920
+ im[a] += tIm;
2921
+ const nextRe = curRe * wRe - curIm * wIm;
2922
+ curIm = curRe * wIm + curIm * wRe;
2923
+ curRe = nextRe;
2934
2924
  }
2935
- this.pipeline = await __webpack_exports__pipeline(
2936
- "automatic-speech-recognition",
2937
- modelName,
2938
- pipelineOptions
2939
- );
2940
- this.actualBackend = device;
2941
- this.currentModel = modelName;
2942
- const loadTimeMs = performance.now() - loadStart;
2943
- logger4.info("Model loaded successfully", {
2944
- model: modelName,
2945
- loadTimeMs: Math.round(loadTimeMs)
2946
- });
2947
- span?.setAttributes({
2948
- "whisper.load_time_ms": loadTimeMs
2949
- });
2950
- span?.end();
2951
- } catch (error) {
2952
- const errorDetails = {
2953
- message: error instanceof Error ? error.message : String(error),
2954
- stack: error instanceof Error ? error.stack : void 0,
2955
- name: error instanceof Error ? error.name : void 0,
2956
- error
2957
- };
2958
- logger4.error("Failed to load model", errorDetails);
2959
- span?.endWithError(error);
2960
- throw error;
2961
- } finally {
2962
- this.isLoading = false;
2963
2925
  }
2964
2926
  }
2965
- /**
2966
- * Transcribe audio to text
2967
- *
2968
- * @param audio Audio samples (Float32Array, 16kHz mono)
2969
- * @param options Transcription options
2970
- */
2971
- async transcribe(audio, options) {
2972
- if (!this.pipeline) {
2973
- throw new Error("Model not loaded. Call load() first.");
2974
- }
2975
- const audioCopy = new Float32Array(audio);
2976
- const telemetry = getTelemetry();
2977
- const span = telemetry?.startSpan("whisper.transcribe", {
2978
- "audio.samples": audioCopy.length,
2979
- "audio.duration_s": audioCopy.length / 16e3,
2980
- "whisper.model": this.currentModel
2981
- });
2982
- try {
2983
- const inferStart = performance.now();
2984
- const audioDurationSec = audioCopy.length / 16e3;
2985
- const isShortAudio = audioDurationSec < 10;
2986
- logger4.debug("Starting transcription", {
2987
- audioSamples: audioCopy.length,
2988
- durationSeconds: audioDurationSec.toFixed(2),
2989
- isShortAudio
2990
- });
2991
- const transcribeOptions = {
2992
- // Decoding strategy
2993
- top_k: 0,
2994
- do_sample: false,
2995
- // Adaptive chunking: Disable for short audio, enable for long audio
2996
- chunk_length_s: options?.chunkLengthS || (isShortAudio ? audioDurationSec : 30),
2997
- stride_length_s: options?.strideLengthS || (isShortAudio ? 0 : 5),
2998
- // Timestamps
2999
- return_timestamps: options?.returnTimestamps || false,
3000
- force_full_sequences: false
3001
- };
3002
- if (this.config.multilingual) {
3003
- transcribeOptions.language = options?.language || this.config.language;
3004
- transcribeOptions.task = options?.task || this.config.task;
3005
- }
3006
- const rawResult = await this.pipeline(audioCopy, transcribeOptions);
3007
- const result = Array.isArray(rawResult) ? rawResult[0] : rawResult;
3008
- const inferenceTimeMs = performance.now() - inferStart;
3009
- let cleanedText = result.text;
3010
- if (this.config.suppressNonSpeech) {
3011
- cleanedText = this.removeNonSpeechTokens(cleanedText);
2927
+ }
2928
+ function htkMel(freq) {
2929
+ return 1127 * Math.log(1 + freq / 700);
2930
+ }
2931
+ function htkMelInverse(mel) {
2932
+ return 700 * (Math.exp(mel / 1127) - 1);
2933
+ }
2934
+ function buildMelFilterbank(numBins, fftSize, sampleRate, lowFreq, highFreq) {
2935
+ const numFftBins = fftSize / 2 + 1;
2936
+ const lowMel = htkMel(lowFreq);
2937
+ const highMel = htkMel(highFreq);
2938
+ const melPoints = new Float64Array(numBins + 2);
2939
+ for (let i = 0; i < numBins + 2; i++) {
2940
+ melPoints[i] = lowMel + (highMel - lowMel) * i / (numBins + 1);
2941
+ }
2942
+ const binFreqs = new Float64Array(numBins + 2);
2943
+ for (let i = 0; i < numBins + 2; i++) {
2944
+ binFreqs[i] = htkMelInverse(melPoints[i]) * fftSize / sampleRate;
2945
+ }
2946
+ const filters = [];
2947
+ for (let m = 0; m < numBins; m++) {
2948
+ const left = binFreqs[m];
2949
+ const center = binFreqs[m + 1];
2950
+ const right = binFreqs[m + 2];
2951
+ const startBin = Math.max(0, Math.ceil(left));
2952
+ const endBin = Math.min(numFftBins - 1, Math.floor(right));
2953
+ const weights = new Float32Array(endBin - startBin + 1);
2954
+ for (let k = startBin; k <= endBin; k++) {
2955
+ if (k <= center) {
2956
+ weights[k - startBin] = center - left > 0 ? (k - left) / (center - left) : 0;
2957
+ } else {
2958
+ weights[k - startBin] = right - center > 0 ? (right - k) / (right - center) : 0;
3012
2959
  }
3013
- const transcription = {
3014
- text: cleanedText,
3015
- language: this.config.language,
3016
- inferenceTimeMs,
3017
- chunks: result.chunks
3018
- };
3019
- logger4.debug("Transcription complete", {
3020
- text: transcription.text,
3021
- inferenceTimeMs: Math.round(inferenceTimeMs),
3022
- chunksCount: result.chunks?.length || 0
3023
- });
3024
- span?.setAttributes({
3025
- "whisper.inference_time_ms": inferenceTimeMs,
3026
- "whisper.text_length": transcription.text.length
3027
- });
3028
- span?.end();
3029
- return transcription;
3030
- } catch (error) {
3031
- logger4.error("Transcribe error", { error });
3032
- span?.endWithError(error);
3033
- throw new Error(`Whisper transcription failed: ${error}`);
3034
2960
  }
2961
+ filters.push({ startBin, weights });
3035
2962
  }
3036
- /**
3037
- * Transcribe with streaming chunks (progressive results)
3038
- *
3039
- * @param audio Audio samples
3040
- * @param onChunk Called when each chunk is finalized
3041
- * @param onUpdate Called after each generation step (optional)
3042
- */
3043
- async transcribeStreaming(audio, onChunk, onUpdate, options) {
3044
- if (!this.pipeline) {
3045
- throw new Error("Model not loaded. Call load() first.");
3046
- }
3047
- const telemetry = getTelemetry();
3048
- const span = telemetry?.startSpan("whisper.transcribe_streaming", {
3049
- "audio.samples": audio.length,
3050
- "audio.duration_s": audio.length / 16e3
3051
- });
3052
- try {
3053
- const inferStart = performance.now();
3054
- logger4.debug("Starting streaming transcription", {
3055
- audioSamples: audio.length,
3056
- durationSeconds: (audio.length / 16e3).toFixed(2)
3057
- });
3058
- const transcribeOptions = {
3059
- top_k: 0,
3060
- do_sample: false,
3061
- chunk_length_s: options?.chunkLengthS || 30,
3062
- stride_length_s: options?.strideLengthS || 5,
3063
- return_timestamps: true,
3064
- force_full_sequences: false
3065
- };
3066
- if (this.config.multilingual) {
3067
- transcribeOptions.language = options?.language || this.config.language;
3068
- transcribeOptions.task = options?.task || this.config.task;
2963
+ return filters;
2964
+ }
2965
+ function createHammingWindow(length) {
2966
+ const window2 = new Float32Array(length);
2967
+ for (let i = 0; i < length; i++) {
2968
+ window2[i] = 0.54 - 0.46 * Math.cos(2 * Math.PI * i / (length - 1));
2969
+ }
2970
+ return window2;
2971
+ }
2972
+ function computeKaldiFbank(audio, sampleRate, numMelBins, opts) {
2973
+ const frameLengthMs = opts?.frameLengthMs ?? 25;
2974
+ const frameShiftMs = opts?.frameShiftMs ?? 10;
2975
+ const lowFreq = opts?.lowFreq ?? 20;
2976
+ const highFreq = opts?.highFreq ?? sampleRate / 2;
2977
+ const dither = opts?.dither ?? 0;
2978
+ const preemphasis = opts?.preemphasis ?? 0.97;
2979
+ const frameLengthSamples = Math.round(sampleRate * frameLengthMs / 1e3);
2980
+ const frameShiftSamples = Math.round(sampleRate * frameShiftMs / 1e3);
2981
+ const scaled = new Float32Array(audio.length);
2982
+ for (let i = 0; i < audio.length; i++) {
2983
+ scaled[i] = audio[i] * 32768;
2984
+ }
2985
+ if (dither > 0) {
2986
+ for (let i = 0; i < scaled.length; i++) {
2987
+ const u1 = Math.random();
2988
+ const u2 = Math.random();
2989
+ scaled[i] += dither * Math.sqrt(-2 * Math.log(u1 + 1e-10)) * Math.cos(2 * Math.PI * u2);
2990
+ }
2991
+ }
2992
+ const numFrames = Math.max(0, Math.floor((scaled.length - frameLengthSamples) / frameShiftSamples) + 1);
2993
+ if (numFrames === 0) {
2994
+ return new Float32Array(0);
2995
+ }
2996
+ let fftSize = 1;
2997
+ while (fftSize < frameLengthSamples) fftSize *= 2;
2998
+ const numFftBins = fftSize / 2 + 1;
2999
+ const window2 = createHammingWindow(frameLengthSamples);
3000
+ const filters = buildMelFilterbank(numMelBins, fftSize, sampleRate, lowFreq, highFreq);
3001
+ const output = new Float32Array(numFrames * numMelBins);
3002
+ const fftRe = new Float64Array(fftSize);
3003
+ const fftIm = new Float64Array(fftSize);
3004
+ for (let f = 0; f < numFrames; f++) {
3005
+ const offset = f * frameShiftSamples;
3006
+ fftRe.fill(0);
3007
+ fftIm.fill(0);
3008
+ for (let i = 0; i < frameLengthSamples; i++) {
3009
+ let sample = scaled[offset + i];
3010
+ if (preemphasis > 0 && i > 0) {
3011
+ sample -= preemphasis * scaled[offset + i - 1];
3012
+ } else if (preemphasis > 0 && i === 0 && offset > 0) {
3013
+ sample -= preemphasis * scaled[offset - 1];
3069
3014
  }
3070
- const rawResult = await this.pipeline(audio, transcribeOptions);
3071
- const result = Array.isArray(rawResult) ? rawResult[0] : rawResult;
3072
- const inferenceTimeMs = performance.now() - inferStart;
3073
- if (result.chunks && onChunk) {
3074
- for (const chunk of result.chunks) {
3075
- onChunk({
3076
- text: chunk.text,
3077
- timestamp: chunk.timestamp
3078
- });
3015
+ fftRe[i] = sample * window2[i];
3016
+ }
3017
+ fft(fftRe, fftIm);
3018
+ const outOffset = f * numMelBins;
3019
+ for (let m = 0; m < numMelBins; m++) {
3020
+ const filter = filters[m];
3021
+ let energy = 0;
3022
+ for (let k = 0; k < filter.weights.length; k++) {
3023
+ const bin = filter.startBin + k;
3024
+ if (bin < numFftBins) {
3025
+ const powerSpec = fftRe[bin] * fftRe[bin] + fftIm[bin] * fftIm[bin];
3026
+ energy += filter.weights[k] * powerSpec;
3079
3027
  }
3080
3028
  }
3081
- if (onUpdate) {
3082
- onUpdate(result.text);
3083
- }
3084
- logger4.debug("Streaming transcription complete", {
3085
- text: result.text,
3086
- inferenceTimeMs: Math.round(inferenceTimeMs),
3087
- chunksCount: result.chunks?.length || 0
3088
- });
3089
- span?.setAttributes({
3090
- "whisper.inference_time_ms": inferenceTimeMs,
3091
- "whisper.chunks_count": result.chunks?.length || 0
3092
- });
3093
- span?.end();
3094
- return {
3095
- text: result.text,
3096
- language: this.config.language,
3097
- inferenceTimeMs,
3098
- chunks: result.chunks
3099
- };
3100
- } catch (error) {
3101
- logger4.error("Streaming transcribe error", { error });
3102
- span?.endWithError(error);
3103
- throw new Error(`Whisper streaming transcription failed: ${error}`);
3029
+ output[outOffset + m] = Math.log(Math.max(energy, 1e-10));
3104
3030
  }
3105
3031
  }
3106
- /**
3107
- * Dispose of the model and free resources
3108
- */
3109
- async dispose() {
3110
- if (this.pipeline) {
3111
- logger4.debug("Disposing model", { model: this.currentModel });
3112
- await this.pipeline.dispose();
3113
- this.pipeline = null;
3114
- this.currentModel = null;
3032
+ return output;
3033
+ }
3034
+ function applyLFR(features, featureDim, lfrM = 7, lfrN = 6) {
3035
+ const numFrames = features.length / featureDim;
3036
+ if (numFrames === 0) return new Float32Array(0);
3037
+ const leftPad = Math.floor((lfrM - 1) / 2);
3038
+ const paddedLen = numFrames + leftPad;
3039
+ const numOutputFrames = Math.ceil(paddedLen / lfrN);
3040
+ const outputDim = featureDim * lfrM;
3041
+ const output = new Float32Array(numOutputFrames * outputDim);
3042
+ for (let i = 0; i < numOutputFrames; i++) {
3043
+ const startFrame = i * lfrN - leftPad;
3044
+ for (let j = 0; j < lfrM; j++) {
3045
+ let srcFrame = startFrame + j;
3046
+ if (srcFrame < 0) srcFrame = 0;
3047
+ if (srcFrame >= numFrames) srcFrame = numFrames - 1;
3048
+ const srcOffset = srcFrame * featureDim;
3049
+ const dstOffset = i * outputDim + j * featureDim;
3050
+ for (let k = 0; k < featureDim; k++) {
3051
+ output[dstOffset + k] = features[srcOffset + k];
3052
+ }
3115
3053
  }
3116
3054
  }
3117
- /**
3118
- * Check if model is loaded
3119
- */
3120
- get isLoaded() {
3121
- return this.pipeline !== null;
3055
+ return output;
3056
+ }
3057
+ function applyCMVN(features, dim, negMean, invStddev) {
3058
+ for (let i = 0; i < features.length; i++) {
3059
+ const d = i % dim;
3060
+ features[i] = (features[i] + negMean[d]) * invStddev[d];
3122
3061
  }
3123
- /**
3124
- * Get the backend being used (webgpu or wasm)
3125
- */
3126
- get backend() {
3127
- return this.actualBackend;
3062
+ return features;
3063
+ }
3064
+ function parseCMVNFromMetadata(negMeanStr, invStddevStr) {
3065
+ const negMean = new Float32Array(
3066
+ negMeanStr.split(",").map((s) => parseFloat(s.trim()))
3067
+ );
3068
+ const invStddev = new Float32Array(
3069
+ invStddevStr.split(",").map((s) => parseFloat(s.trim()))
3070
+ );
3071
+ return { negMean, invStddev };
3072
+ }
3073
+
3074
+ // src/inference/ctcDecoder.ts
3075
+ function resolveLanguageId(language) {
3076
+ const map = {
3077
+ auto: 0,
3078
+ zh: 3,
3079
+ en: 4,
3080
+ yue: 7,
3081
+ ja: 11,
3082
+ ko: 12
3083
+ };
3084
+ return map[language] ?? 0;
3085
+ }
3086
+ function resolveTextNormId(textNorm) {
3087
+ return textNorm === "without_itn" ? 15 : 14;
3088
+ }
3089
+ function parseTokensFile(content) {
3090
+ const map = /* @__PURE__ */ new Map();
3091
+ const lines = content.split("\n");
3092
+ for (const line of lines) {
3093
+ const trimmed = line.trim();
3094
+ if (!trimmed) continue;
3095
+ const lastSpace = trimmed.lastIndexOf(" ");
3096
+ if (lastSpace === -1) continue;
3097
+ const token = trimmed.substring(0, lastSpace);
3098
+ const id = parseInt(trimmed.substring(lastSpace + 1), 10);
3099
+ if (!isNaN(id)) {
3100
+ map.set(id, token);
3101
+ }
3102
+ }
3103
+ return map;
3104
+ }
3105
+ function parseStructuredToken(token) {
3106
+ const match = token.match(/^<\|(.+)\|>$/);
3107
+ if (!match) return null;
3108
+ const value = match[1];
3109
+ if (value === "zh" || value === "en" || value === "ja" || value === "ko" || value === "yue" || value === "nospeech") {
3110
+ return { type: "language", value };
3128
3111
  }
3129
- /**
3130
- * Get the full model name used by transformers.js
3131
- */
3132
- getModelName() {
3133
- if (this.config.localModelPath) {
3134
- return this.config.localModelPath;
3112
+ const emotions = ["HAPPY", "SAD", "ANGRY", "NEUTRAL", "FEARFUL", "DISGUSTED", "SURPRISED", "EMO_UNKNOWN"];
3113
+ if (emotions.includes(value)) {
3114
+ return { type: "emotion", value };
3115
+ }
3116
+ const events = ["Speech", "BGM", "Applause", "Laughter", "Crying", "Coughing", "Sneezing", "EVENT_UNKNOWN"];
3117
+ if (events.includes(value)) {
3118
+ return { type: "event", value };
3119
+ }
3120
+ if (value === "withitn" || value === "woitn" || value === "with_itn" || value === "without_itn") {
3121
+ return { type: "textnorm", value };
3122
+ }
3123
+ return null;
3124
+ }
3125
+ function ctcGreedyDecode(logits, seqLen, vocabSize, tokenMap) {
3126
+ const tokenIds = [];
3127
+ for (let t = 0; t < seqLen; t++) {
3128
+ const offset = t * vocabSize;
3129
+ let maxIdx = 0;
3130
+ let maxVal = logits[offset];
3131
+ for (let v = 1; v < vocabSize; v++) {
3132
+ if (logits[offset + v] > maxVal) {
3133
+ maxVal = logits[offset + v];
3134
+ maxIdx = v;
3135
+ }
3135
3136
  }
3136
- let modelName = `onnx-community/whisper-${this.config.model}`;
3137
- if (!this.config.multilingual) {
3138
- modelName += ".en";
3137
+ tokenIds.push(maxIdx);
3138
+ }
3139
+ const collapsed = [];
3140
+ let prev = -1;
3141
+ for (const id of tokenIds) {
3142
+ if (id !== prev) {
3143
+ collapsed.push(id);
3144
+ prev = id;
3145
+ }
3146
+ }
3147
+ const filtered = collapsed.filter((id) => id !== 0 && id !== 1 && id !== 2);
3148
+ let language;
3149
+ let emotion;
3150
+ let event;
3151
+ const textTokens = [];
3152
+ for (const id of filtered) {
3153
+ const token = tokenMap.get(id);
3154
+ if (!token) continue;
3155
+ const structured = parseStructuredToken(token);
3156
+ if (structured) {
3157
+ if (structured.type === "language") language = structured.value;
3158
+ else if (structured.type === "emotion") emotion = structured.value;
3159
+ else if (structured.type === "event") event = structured.value;
3160
+ } else {
3161
+ textTokens.push(token);
3139
3162
  }
3140
- return modelName;
3141
- }
3142
- /**
3143
- * Remove non-speech event tokens from transcription
3144
- *
3145
- * Whisper outputs special tokens for non-speech events like:
3146
- * [LAUGHTER], [APPLAUSE], [MUSIC], [BLANK_AUDIO], [CLICKING], etc.
3147
- *
3148
- * This method strips these tokens and cleans up extra whitespace.
3149
- */
3150
- removeNonSpeechTokens(text) {
3151
- const cleaned = text.replace(/\[[\w\s_]+\]/g, "");
3152
- return cleaned.replace(/\s+/g, " ").trim();
3153
3163
  }
3154
- };
3164
+ let text = textTokens.join("");
3165
+ text = text.replace(/\u2581/g, " ").trim();
3166
+ return { text, language, emotion, event };
3167
+ }
3155
3168
 
3156
- // src/inference/Wav2ArkitCpuInference.ts
3157
- var logger5 = createLogger("Wav2ArkitCpu");
3158
- var Wav2ArkitCpuInference = class {
3169
+ // src/inference/SenseVoiceInference.ts
3170
+ var logger4 = createLogger("SenseVoice");
3171
+ var SenseVoiceInference = class {
3159
3172
  constructor(config) {
3160
- this.modelId = "wav2arkit_cpu";
3161
3173
  this.session = null;
3162
3174
  this.ort = null;
3163
3175
  this._backend = "wasm";
3164
3176
  this.isLoading = false;
3165
- // Inference queue for handling concurrent calls
3166
3177
  this.inferenceQueue = Promise.resolve();
3167
- this.config = config;
3178
+ // Preprocessing state (loaded once)
3179
+ this.tokenMap = null;
3180
+ this.negMean = null;
3181
+ this.invStddev = null;
3182
+ this.languageId = 0;
3183
+ this.textNormId = 14;
3184
+ const modelDir = config.modelUrl.substring(0, config.modelUrl.lastIndexOf("/"));
3185
+ const tokensUrl = config.tokensUrl ?? `${modelDir}/tokens.txt`;
3186
+ this.config = {
3187
+ modelUrl: config.modelUrl,
3188
+ tokensUrl,
3189
+ language: config.language ?? "auto",
3190
+ textNorm: config.textNorm ?? "with_itn",
3191
+ backend: config.backend ?? "auto"
3192
+ };
3193
+ this.languageId = resolveLanguageId(this.config.language);
3194
+ this.textNormId = resolveTextNormId(this.config.textNorm);
3168
3195
  }
3169
3196
  get backend() {
3170
3197
  return this.session ? this._backend : null;
@@ -3172,10 +3199,8 @@ var Wav2ArkitCpuInference = class {
3172
3199
  get isLoaded() {
3173
3200
  return this.session !== null;
3174
3201
  }
3175
- /**
3176
- * Load the ONNX model
3177
- */
3178
- async load() {
3202
+ // ─── Load ───────────────────────────────────────────────────────────────
3203
+ async load(onProgress) {
3179
3204
  if (this.isLoading) {
3180
3205
  throw new Error("Model is already loading");
3181
3206
  }
@@ -3185,30 +3210,281 @@ var Wav2ArkitCpuInference = class {
3185
3210
  this.isLoading = true;
3186
3211
  const startTime = performance.now();
3187
3212
  const telemetry = getTelemetry();
3188
- const span = telemetry?.startSpan("Wav2ArkitCpu.load", {
3213
+ const span = telemetry?.startSpan("SenseVoice.load", {
3189
3214
  "model.url": this.config.modelUrl,
3190
- "model.backend_requested": this.config.backend || "wasm"
3215
+ "model.backend_requested": this.config.backend
3191
3216
  });
3192
3217
  try {
3193
- const preference = this.config.backend || "wasm";
3194
- logger5.info("Loading ONNX Runtime...", { preference });
3195
- const { ort, backend } = await getOnnxRuntimeForPreference(preference);
3218
+ logger4.info("Loading ONNX Runtime...", { preference: this.config.backend });
3219
+ const { ort, backend } = await getOnnxRuntimeForPreference(this.config.backend);
3196
3220
  this.ort = ort;
3197
3221
  this._backend = backend;
3198
- logger5.info("ONNX Runtime loaded", { backend: this._backend });
3199
- const modelUrl = this.config.modelUrl;
3200
- const dataUrl = this.config.externalDataUrl !== false ? this.config.externalDataUrl || `${modelUrl}.data` : null;
3222
+ logger4.info("ONNX Runtime loaded", { backend: this._backend });
3223
+ logger4.debug("Fetching tokens vocabulary", { tokensUrl: this.config.tokensUrl });
3224
+ const tokensResponse = await fetch(this.config.tokensUrl);
3225
+ if (!tokensResponse.ok) {
3226
+ throw new Error(`Failed to fetch tokens.txt: ${tokensResponse.status} ${tokensResponse.statusText}`);
3227
+ }
3228
+ const tokensText = await tokensResponse.text();
3229
+ this.tokenMap = parseTokensFile(tokensText);
3230
+ logger4.debug("Tokens loaded", { vocabSize: this.tokenMap.size });
3201
3231
  const sessionOptions = getSessionOptions(this._backend);
3232
+ if (this._backend === "webgpu") {
3233
+ sessionOptions.graphOptimizationLevel = "basic";
3234
+ }
3235
+ let isCached = false;
3202
3236
  if (isIOS()) {
3203
- logger5.info("iOS: passing model URLs directly to ORT (low-memory path)", {
3204
- modelUrl,
3205
- dataUrl
3237
+ logger4.info("iOS: passing model URL directly to ORT (low-memory path)", {
3238
+ modelUrl: this.config.modelUrl
3206
3239
  });
3207
- if (dataUrl) {
3208
- const dataFilename = dataUrl.split("/").pop();
3209
- sessionOptions.externalData = [{
3210
- path: dataFilename,
3211
- data: dataUrl
3240
+ this.session = await this.ort.InferenceSession.create(
3241
+ this.config.modelUrl,
3242
+ sessionOptions
3243
+ );
3244
+ } else {
3245
+ const cache = getModelCache();
3246
+ isCached = await cache.has(this.config.modelUrl);
3247
+ let modelBuffer;
3248
+ if (isCached) {
3249
+ logger4.debug("Loading model from cache", { modelUrl: this.config.modelUrl });
3250
+ modelBuffer = await cache.get(this.config.modelUrl);
3251
+ onProgress?.(modelBuffer.byteLength, modelBuffer.byteLength);
3252
+ } else {
3253
+ logger4.debug("Fetching and caching model", { modelUrl: this.config.modelUrl });
3254
+ modelBuffer = await fetchWithCache(this.config.modelUrl, onProgress);
3255
+ }
3256
+ logger4.debug("Creating ONNX session", {
3257
+ size: formatBytes(modelBuffer.byteLength),
3258
+ backend: this._backend
3259
+ });
3260
+ const modelData = new Uint8Array(modelBuffer);
3261
+ this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
3262
+ }
3263
+ try {
3264
+ const metadata = this.session.handler?.metadata;
3265
+ if (metadata?.neg_mean && metadata?.inv_stddev) {
3266
+ const cmvn = parseCMVNFromMetadata(metadata.neg_mean, metadata.inv_stddev);
3267
+ this.negMean = cmvn.negMean;
3268
+ this.invStddev = cmvn.invStddev;
3269
+ logger4.debug("CMVN loaded from model metadata", { dim: this.negMean.length });
3270
+ } else {
3271
+ logger4.warn("CMVN not found in model metadata \u2014 features will not be normalized");
3272
+ }
3273
+ } catch (cmvnErr) {
3274
+ logger4.warn("Failed to read CMVN from model metadata", { error: cmvnErr });
3275
+ }
3276
+ const loadTimeMs = performance.now() - startTime;
3277
+ logger4.info("SenseVoice model loaded", {
3278
+ backend: this._backend,
3279
+ loadTimeMs: Math.round(loadTimeMs),
3280
+ vocabSize: this.tokenMap.size,
3281
+ inputs: this.session.inputNames,
3282
+ outputs: this.session.outputNames,
3283
+ hasCMVN: this.negMean !== null
3284
+ });
3285
+ span?.setAttributes({
3286
+ "model.backend": this._backend,
3287
+ "model.load_time_ms": loadTimeMs,
3288
+ "model.cached": !isIOS() && isCached,
3289
+ "model.vocab_size": this.tokenMap.size
3290
+ });
3291
+ span?.end();
3292
+ telemetry?.recordHistogram("omote.model.load_time", loadTimeMs, {
3293
+ model: "sensevoice",
3294
+ backend: this._backend
3295
+ });
3296
+ return {
3297
+ backend: this._backend,
3298
+ loadTimeMs,
3299
+ inputNames: [...this.session.inputNames],
3300
+ outputNames: [...this.session.outputNames],
3301
+ vocabSize: this.tokenMap.size
3302
+ };
3303
+ } catch (error) {
3304
+ span?.endWithError(error instanceof Error ? error : new Error(String(error)));
3305
+ telemetry?.incrementCounter("omote.errors.total", 1, {
3306
+ model: "sensevoice",
3307
+ error_type: "load_failed"
3308
+ });
3309
+ throw error;
3310
+ } finally {
3311
+ this.isLoading = false;
3312
+ }
3313
+ }
3314
+ // ─── Transcribe ─────────────────────────────────────────────────────────
3315
+ /**
3316
+ * Transcribe audio samples to text
3317
+ *
3318
+ * @param audioSamples Float32Array of audio samples at 16kHz, [-1, 1] range
3319
+ * @returns Transcription result with text, emotion, language, and event
3320
+ */
3321
+ async transcribe(audioSamples) {
3322
+ if (!this.session || !this.ort || !this.tokenMap) {
3323
+ throw new Error("Model not loaded. Call load() first.");
3324
+ }
3325
+ const audio = new Float32Array(audioSamples);
3326
+ return this.queueInference(audio);
3327
+ }
3328
+ queueInference(audio) {
3329
+ return new Promise((resolve, reject) => {
3330
+ this.inferenceQueue = this.inferenceQueue.then(async () => {
3331
+ const telemetry = getTelemetry();
3332
+ const span = telemetry?.startSpan("SenseVoice.transcribe", {
3333
+ "inference.backend": this._backend,
3334
+ "inference.input_samples": audio.length
3335
+ });
3336
+ try {
3337
+ const startTime = performance.now();
3338
+ const preprocessStart = performance.now();
3339
+ const fbank = computeKaldiFbank(audio, 16e3, 80);
3340
+ const numFrames = fbank.length / 80;
3341
+ if (numFrames === 0) {
3342
+ resolve({
3343
+ text: "",
3344
+ inferenceTimeMs: performance.now() - startTime,
3345
+ preprocessTimeMs: performance.now() - preprocessStart
3346
+ });
3347
+ return;
3348
+ }
3349
+ const lfrFeatures = applyLFR(fbank, 80, 7, 6);
3350
+ const numLfrFrames = lfrFeatures.length / 560;
3351
+ if (this.negMean && this.invStddev) {
3352
+ applyCMVN(lfrFeatures, 560, this.negMean, this.invStddev);
3353
+ }
3354
+ const preprocessTimeMs = performance.now() - preprocessStart;
3355
+ const ort = this.ort;
3356
+ const feeds = {
3357
+ x: new ort.Tensor("float32", lfrFeatures, [1, numLfrFrames, 560]),
3358
+ x_length: new ort.Tensor("int32", new Int32Array([numLfrFrames]), [1]),
3359
+ language: new ort.Tensor("int32", new Int32Array([this.languageId]), [1]),
3360
+ text_norm: new ort.Tensor("int32", new Int32Array([this.textNormId]), [1])
3361
+ };
3362
+ const results = await this.session.run(feeds);
3363
+ const logitsOutput = results["logits"];
3364
+ if (!logitsOutput) {
3365
+ throw new Error('Model output missing "logits" tensor');
3366
+ }
3367
+ const logitsData = logitsOutput.data;
3368
+ const logitsDims = logitsOutput.dims;
3369
+ const seqLen = logitsDims[1];
3370
+ const vocabSize = logitsDims[2];
3371
+ const decoded = ctcGreedyDecode(logitsData, seqLen, vocabSize, this.tokenMap);
3372
+ const inferenceTimeMs = performance.now() - startTime;
3373
+ logger4.trace("Transcription complete", {
3374
+ text: decoded.text.substring(0, 50),
3375
+ language: decoded.language,
3376
+ emotion: decoded.emotion,
3377
+ event: decoded.event,
3378
+ preprocessTimeMs: Math.round(preprocessTimeMs * 100) / 100,
3379
+ inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
3380
+ numFrames,
3381
+ numLfrFrames
3382
+ });
3383
+ span?.setAttributes({
3384
+ "inference.duration_ms": inferenceTimeMs,
3385
+ "inference.preprocess_ms": preprocessTimeMs,
3386
+ "inference.num_frames": numFrames,
3387
+ "inference.text_length": decoded.text.length
3388
+ });
3389
+ span?.end();
3390
+ telemetry?.recordHistogram("omote.inference.latency", inferenceTimeMs, {
3391
+ model: "sensevoice",
3392
+ backend: this._backend
3393
+ });
3394
+ telemetry?.incrementCounter("omote.inference.total", 1, {
3395
+ model: "sensevoice",
3396
+ backend: this._backend,
3397
+ status: "success"
3398
+ });
3399
+ resolve({
3400
+ text: decoded.text,
3401
+ language: decoded.language,
3402
+ emotion: decoded.emotion,
3403
+ event: decoded.event,
3404
+ inferenceTimeMs,
3405
+ preprocessTimeMs
3406
+ });
3407
+ } catch (err) {
3408
+ span?.endWithError(err instanceof Error ? err : new Error(String(err)));
3409
+ telemetry?.incrementCounter("omote.inference.total", 1, {
3410
+ model: "sensevoice",
3411
+ backend: this._backend,
3412
+ status: "error"
3413
+ });
3414
+ reject(err);
3415
+ }
3416
+ });
3417
+ });
3418
+ }
3419
+ // ─── Dispose ──────────────────────────────────────────────────────────
3420
+ async dispose() {
3421
+ if (this.session) {
3422
+ await this.session.release();
3423
+ this.session = null;
3424
+ }
3425
+ this.ort = null;
3426
+ this.tokenMap = null;
3427
+ this.negMean = null;
3428
+ this.invStddev = null;
3429
+ }
3430
+ };
3431
+
3432
+ // src/inference/Wav2ArkitCpuInference.ts
3433
+ var logger5 = createLogger("Wav2ArkitCpu");
3434
+ var Wav2ArkitCpuInference = class {
3435
+ constructor(config) {
3436
+ this.modelId = "wav2arkit_cpu";
3437
+ this.session = null;
3438
+ this.ort = null;
3439
+ this._backend = "wasm";
3440
+ this.isLoading = false;
3441
+ // Inference queue for handling concurrent calls
3442
+ this.inferenceQueue = Promise.resolve();
3443
+ this.config = config;
3444
+ }
3445
+ get backend() {
3446
+ return this.session ? this._backend : null;
3447
+ }
3448
+ get isLoaded() {
3449
+ return this.session !== null;
3450
+ }
3451
+ /**
3452
+ * Load the ONNX model
3453
+ */
3454
+ async load() {
3455
+ if (this.isLoading) {
3456
+ throw new Error("Model is already loading");
3457
+ }
3458
+ if (this.session) {
3459
+ throw new Error("Model already loaded. Call dispose() first.");
3460
+ }
3461
+ this.isLoading = true;
3462
+ const startTime = performance.now();
3463
+ const telemetry = getTelemetry();
3464
+ const span = telemetry?.startSpan("Wav2ArkitCpu.load", {
3465
+ "model.url": this.config.modelUrl,
3466
+ "model.backend_requested": this.config.backend || "wasm"
3467
+ });
3468
+ try {
3469
+ const preference = this.config.backend || "wasm";
3470
+ logger5.info("Loading ONNX Runtime...", { preference });
3471
+ const { ort, backend } = await getOnnxRuntimeForPreference(preference);
3472
+ this.ort = ort;
3473
+ this._backend = backend;
3474
+ logger5.info("ONNX Runtime loaded", { backend: this._backend });
3475
+ const modelUrl = this.config.modelUrl;
3476
+ const dataUrl = this.config.externalDataUrl !== false ? this.config.externalDataUrl || `${modelUrl}.data` : null;
3477
+ const sessionOptions = getSessionOptions(this._backend);
3478
+ if (isIOS()) {
3479
+ logger5.info("iOS: passing model URLs directly to ORT (low-memory path)", {
3480
+ modelUrl,
3481
+ dataUrl
3482
+ });
3483
+ if (dataUrl) {
3484
+ const dataFilename = dataUrl.split("/").pop();
3485
+ sessionOptions.externalData = [{
3486
+ path: dataFilename,
3487
+ data: dataUrl
3212
3488
  // URL string — ORT fetches directly into WASM
3213
3489
  }];
3214
3490
  }
@@ -3474,21 +3750,22 @@ var LipSyncWithFallback = class {
3474
3750
  try {
3475
3751
  return await this.implementation.load();
3476
3752
  } catch (error) {
3477
- logger6.warn("GPU model load failed, falling back to CPU model", {
3478
- error: error instanceof Error ? error.message : String(error)
3479
- });
3480
- try {
3481
- await this.implementation.dispose();
3482
- } catch {
3483
- }
3484
- this.implementation = new Wav2ArkitCpuInference({
3485
- modelUrl: this.config.cpuModelUrl
3486
- });
3487
- this.hasFallenBack = true;
3488
- logger6.info("Fallback to Wav2ArkitCpuInference successful");
3489
- return await this.implementation.load();
3753
+ return this.fallbackToCpu(error instanceof Error ? error.message : String(error));
3490
3754
  }
3491
3755
  }
3756
+ async fallbackToCpu(reason) {
3757
+ logger6.warn("GPU model load failed, falling back to CPU model", { reason });
3758
+ try {
3759
+ await this.implementation.dispose();
3760
+ } catch {
3761
+ }
3762
+ this.implementation = new Wav2ArkitCpuInference({
3763
+ modelUrl: this.config.cpuModelUrl
3764
+ });
3765
+ this.hasFallenBack = true;
3766
+ logger6.info("Fallback to Wav2ArkitCpuInference successful");
3767
+ return await this.implementation.load();
3768
+ }
3492
3769
  async infer(audioSamples, identityIndex) {
3493
3770
  return this.implementation.infer(audioSamples, identityIndex);
3494
3771
  }
@@ -4545,268 +4822,8 @@ var VADWorkerWithFallback = class {
4545
4822
  }
4546
4823
  };
4547
4824
 
4548
- // src/inference/Emotion2VecInference.ts
4549
- var logger10 = createLogger("Emotion2Vec");
4550
- var EMOTION2VEC_LABELS = ["neutral", "happy", "angry", "sad"];
4551
- var Emotion2VecInference = class {
4552
- constructor(config) {
4553
- this.session = null;
4554
- this.ort = null;
4555
- this._backend = "wasm";
4556
- this.isLoading = false;
4557
- this.inferenceQueue = Promise.resolve();
4558
- this.config = {
4559
- modelUrl: config.modelUrl,
4560
- backend: config.backend ?? "auto",
4561
- sampleRate: config.sampleRate ?? 16e3
4562
- };
4563
- }
4564
- get backend() {
4565
- return this.session ? this._backend : null;
4566
- }
4567
- get isLoaded() {
4568
- return this.session !== null;
4569
- }
4570
- get sampleRate() {
4571
- return this.config.sampleRate;
4572
- }
4573
- /**
4574
- * Load the ONNX model
4575
- */
4576
- async load() {
4577
- if (this.isLoading) {
4578
- throw new Error("Model is already loading");
4579
- }
4580
- if (this.session) {
4581
- throw new Error("Model already loaded. Call dispose() first.");
4582
- }
4583
- this.isLoading = true;
4584
- const startTime = performance.now();
4585
- const telemetry = getTelemetry();
4586
- const span = telemetry?.startSpan("Emotion2Vec.load", {
4587
- "model.url": this.config.modelUrl,
4588
- "model.backend_requested": this.config.backend
4589
- });
4590
- try {
4591
- logger10.info("Loading ONNX Runtime...", { preference: this.config.backend });
4592
- const { ort, backend } = await getOnnxRuntimeForPreference(this.config.backend);
4593
- this.ort = ort;
4594
- this._backend = backend;
4595
- logger10.info("ONNX Runtime loaded", { backend: this._backend });
4596
- logger10.info("Checking model cache...");
4597
- const cache = getModelCache();
4598
- const modelUrl = this.config.modelUrl;
4599
- const isCached = await cache.has(modelUrl);
4600
- logger10.info("Cache check complete", { modelUrl, isCached });
4601
- let modelBuffer;
4602
- if (isCached) {
4603
- logger10.info("Loading model from cache...", { modelUrl });
4604
- modelBuffer = await cache.get(modelUrl);
4605
- logger10.info("Model loaded from cache", { size: formatBytes(modelBuffer.byteLength) });
4606
- } else {
4607
- logger10.info("Fetching model (not cached)...", { modelUrl });
4608
- modelBuffer = await fetchWithCache(modelUrl);
4609
- logger10.info("Model fetched and cached", { size: formatBytes(modelBuffer.byteLength) });
4610
- }
4611
- logger10.info("Creating ONNX session (this may take a while for large models)...");
4612
- logger10.debug("Creating ONNX session", {
4613
- size: formatBytes(modelBuffer.byteLength),
4614
- backend: this._backend
4615
- });
4616
- const sessionOptions = getSessionOptions(this._backend);
4617
- const modelData = new Uint8Array(modelBuffer);
4618
- this.session = await ort.InferenceSession.create(modelData, sessionOptions);
4619
- const loadTimeMs = performance.now() - startTime;
4620
- logger10.info("Model loaded successfully", {
4621
- backend: this._backend,
4622
- loadTimeMs: Math.round(loadTimeMs),
4623
- sampleRate: this.config.sampleRate,
4624
- inputNames: [...this.session.inputNames],
4625
- outputNames: [...this.session.outputNames]
4626
- });
4627
- span?.setAttributes({
4628
- "model.backend": this._backend,
4629
- "model.load_time_ms": loadTimeMs,
4630
- "model.cached": isCached
4631
- });
4632
- span?.end();
4633
- telemetry?.recordHistogram("omote.model.load_time", loadTimeMs, {
4634
- model: "emotion2vec",
4635
- backend: this._backend
4636
- });
4637
- return {
4638
- backend: this._backend,
4639
- loadTimeMs,
4640
- inputNames: [...this.session.inputNames],
4641
- outputNames: [...this.session.outputNames],
4642
- sampleRate: this.config.sampleRate
4643
- };
4644
- } catch (error) {
4645
- span?.endWithError(error instanceof Error ? error : new Error(String(error)));
4646
- telemetry?.incrementCounter("omote.errors.total", 1, {
4647
- model: "emotion2vec",
4648
- error_type: "load_failed"
4649
- });
4650
- throw error;
4651
- } finally {
4652
- this.isLoading = false;
4653
- }
4654
- }
4655
- /**
4656
- * Run emotion inference on audio samples
4657
- *
4658
- * @param audio - Float32Array of 16kHz audio samples
4659
- * @returns Frame-level emotion results at 50Hz
4660
- */
4661
- async infer(audio) {
4662
- if (!this.session) {
4663
- throw new Error("Model not loaded. Call load() first.");
4664
- }
4665
- return this.queueInference(audio);
4666
- }
4667
- queueInference(audio) {
4668
- const audioCopy = new Float32Array(audio);
4669
- return new Promise((resolve, reject) => {
4670
- this.inferenceQueue = this.inferenceQueue.then(async () => {
4671
- const telemetry = getTelemetry();
4672
- const span = telemetry?.startSpan("Emotion2Vec.infer", {
4673
- "inference.backend": this._backend,
4674
- "inference.audio_samples": audioCopy.length
4675
- });
4676
- try {
4677
- const startTime = performance.now();
4678
- const inputTensor = new this.ort.Tensor("float32", audioCopy, [1, audioCopy.length]);
4679
- const results = await this.session.run({ audio: inputTensor });
4680
- const logitsTensor = results["logits"];
4681
- const embeddingsTensor = results["layer_norm_25"];
4682
- if (!logitsTensor) {
4683
- throw new Error(
4684
- `Missing logits tensor from SUPERB model. Got outputs: ${Object.keys(results).join(", ")}`
4685
- );
4686
- }
4687
- const logitsData = logitsTensor.data;
4688
- const logits = new Float32Array(logitsData);
4689
- const probs = this.softmax(logits);
4690
- const probabilities = {
4691
- neutral: probs[0],
4692
- happy: probs[1],
4693
- angry: probs[2],
4694
- sad: probs[3]
4695
- };
4696
- let maxIdx = 0;
4697
- let maxProb = probs[0];
4698
- for (let i = 1; i < probs.length; i++) {
4699
- if (probs[i] > maxProb) {
4700
- maxProb = probs[i];
4701
- maxIdx = i;
4702
- }
4703
- }
4704
- const dominant = {
4705
- emotion: EMOTION2VEC_LABELS[maxIdx],
4706
- confidence: maxProb,
4707
- probabilities
4708
- };
4709
- let embeddings = [];
4710
- let numFrames = 1;
4711
- if (embeddingsTensor) {
4712
- const embeddingData = embeddingsTensor.data;
4713
- const dims = embeddingsTensor.dims;
4714
- if (dims.length === 3) {
4715
- numFrames = dims[1];
4716
- const embeddingDim = dims[2];
4717
- for (let i = 0; i < numFrames; i++) {
4718
- const start = i * embeddingDim;
4719
- embeddings.push(new Float32Array(embeddingData.slice(start, start + embeddingDim)));
4720
- }
4721
- }
4722
- }
4723
- const frames = [];
4724
- for (let i = 0; i < numFrames; i++) {
4725
- frames.push({
4726
- emotion: dominant.emotion,
4727
- confidence: dominant.confidence,
4728
- probabilities: { ...probabilities }
4729
- });
4730
- }
4731
- const inferenceTimeMs = performance.now() - startTime;
4732
- logger10.debug("Emotion inference completed", {
4733
- numFrames,
4734
- dominant: dominant.emotion,
4735
- confidence: Math.round(dominant.confidence * 100),
4736
- inferenceTimeMs: Math.round(inferenceTimeMs)
4737
- });
4738
- span?.setAttributes({
4739
- "inference.duration_ms": inferenceTimeMs,
4740
- "inference.num_frames": numFrames,
4741
- "inference.dominant_emotion": dominant.emotion
4742
- });
4743
- span?.end();
4744
- telemetry?.recordHistogram("omote.inference.latency", inferenceTimeMs, {
4745
- model: "emotion2vec",
4746
- backend: this._backend
4747
- });
4748
- telemetry?.incrementCounter("omote.inference.total", 1, {
4749
- model: "emotion2vec",
4750
- backend: this._backend,
4751
- status: "success"
4752
- });
4753
- resolve({
4754
- frames,
4755
- dominant,
4756
- embeddings,
4757
- logits,
4758
- inferenceTimeMs
4759
- });
4760
- } catch (err) {
4761
- span?.endWithError(err instanceof Error ? err : new Error(String(err)));
4762
- telemetry?.incrementCounter("omote.inference.total", 1, {
4763
- model: "emotion2vec",
4764
- backend: this._backend,
4765
- status: "error"
4766
- });
4767
- reject(err);
4768
- }
4769
- });
4770
- });
4771
- }
4772
- /**
4773
- * Apply softmax to convert logits to probabilities
4774
- */
4775
- softmax(logits) {
4776
- let max = logits[0];
4777
- for (let i = 1; i < logits.length; i++) {
4778
- if (logits[i] > max) max = logits[i];
4779
- }
4780
- const exp = new Float32Array(logits.length);
4781
- let sum = 0;
4782
- for (let i = 0; i < logits.length; i++) {
4783
- exp[i] = Math.exp(logits[i] - max);
4784
- sum += exp[i];
4785
- }
4786
- const probs = new Float32Array(logits.length);
4787
- for (let i = 0; i < logits.length; i++) {
4788
- probs[i] = exp[i] / sum;
4789
- }
4790
- return probs;
4791
- }
4792
- /**
4793
- * Dispose of the model and free resources
4794
- */
4795
- async dispose() {
4796
- if (this.session) {
4797
- await this.session.release();
4798
- this.session = null;
4799
- }
4800
- }
4801
- };
4802
- /**
4803
- * Check if WebGPU is available and working
4804
- * (iOS returns false even if navigator.gpu exists due to ONNX Runtime bugs)
4805
- */
4806
- Emotion2VecInference.isWebGPUAvailable = isWebGPUAvailable;
4807
-
4808
4825
  // src/inference/SafariSpeechRecognition.ts
4809
- var logger11 = createLogger("SafariSpeech");
4826
+ var logger10 = createLogger("SafariSpeech");
4810
4827
  var SafariSpeechRecognition = class _SafariSpeechRecognition {
4811
4828
  constructor(config = {}) {
4812
4829
  this.recognition = null;
@@ -4825,7 +4842,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
4825
4842
  interimResults: config.interimResults ?? true,
4826
4843
  maxAlternatives: config.maxAlternatives ?? 1
4827
4844
  };
4828
- logger11.debug("SafariSpeechRecognition created", {
4845
+ logger10.debug("SafariSpeechRecognition created", {
4829
4846
  language: this.config.language,
4830
4847
  continuous: this.config.continuous
4831
4848
  });
@@ -4886,7 +4903,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
4886
4903
  */
4887
4904
  async start() {
4888
4905
  if (this.isListening) {
4889
- logger11.warn("Already listening");
4906
+ logger10.warn("Already listening");
4890
4907
  return;
4891
4908
  }
4892
4909
  if (!_SafariSpeechRecognition.isAvailable()) {
@@ -4916,7 +4933,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
4916
4933
  this.isListening = true;
4917
4934
  this.startTime = performance.now();
4918
4935
  this.accumulatedText = "";
4919
- logger11.info("Speech recognition started", {
4936
+ logger10.info("Speech recognition started", {
4920
4937
  language: this.config.language
4921
4938
  });
4922
4939
  span?.end();
@@ -4931,7 +4948,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
4931
4948
  */
4932
4949
  async stop() {
4933
4950
  if (!this.isListening || !this.recognition) {
4934
- logger11.warn("Not currently listening");
4951
+ logger10.warn("Not currently listening");
4935
4952
  return {
4936
4953
  text: this.accumulatedText,
4937
4954
  language: this.config.language,
@@ -4960,7 +4977,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
4960
4977
  if (this.recognition && this.isListening) {
4961
4978
  this.recognition.abort();
4962
4979
  this.isListening = false;
4963
- logger11.info("Speech recognition aborted");
4980
+ logger10.info("Speech recognition aborted");
4964
4981
  }
4965
4982
  }
4966
4983
  /**
@@ -4991,7 +5008,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
4991
5008
  this.isListening = false;
4992
5009
  this.resultCallbacks = [];
4993
5010
  this.errorCallbacks = [];
4994
- logger11.debug("SafariSpeechRecognition disposed");
5011
+ logger10.debug("SafariSpeechRecognition disposed");
4995
5012
  }
4996
5013
  /**
4997
5014
  * Set up event handlers for the recognition instance
@@ -5019,7 +5036,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
5019
5036
  confidence: alternative.confidence
5020
5037
  };
5021
5038
  this.emitResult(speechResult);
5022
- logger11.trace("Speech result", {
5039
+ logger10.trace("Speech result", {
5023
5040
  text: text.substring(0, 50),
5024
5041
  isFinal,
5025
5042
  confidence: alternative.confidence
@@ -5029,12 +5046,12 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
5029
5046
  span?.end();
5030
5047
  } catch (error) {
5031
5048
  span?.endWithError(error instanceof Error ? error : new Error(String(error)));
5032
- logger11.error("Error processing speech result", { error });
5049
+ logger10.error("Error processing speech result", { error });
5033
5050
  }
5034
5051
  };
5035
5052
  this.recognition.onerror = (event) => {
5036
5053
  const error = new Error(`Speech recognition error: ${event.error} - ${event.message}`);
5037
- logger11.error("Speech recognition error", { error: event.error, message: event.message });
5054
+ logger10.error("Speech recognition error", { error: event.error, message: event.message });
5038
5055
  this.emitError(error);
5039
5056
  if (this.stopRejecter) {
5040
5057
  this.stopRejecter(error);
@@ -5044,7 +5061,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
5044
5061
  };
5045
5062
  this.recognition.onend = () => {
5046
5063
  this.isListening = false;
5047
- logger11.info("Speech recognition ended", {
5064
+ logger10.info("Speech recognition ended", {
5048
5065
  totalText: this.accumulatedText.length,
5049
5066
  durationMs: performance.now() - this.startTime
5050
5067
  });
@@ -5061,13 +5078,13 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
5061
5078
  }
5062
5079
  };
5063
5080
  this.recognition.onstart = () => {
5064
- logger11.debug("Speech recognition started by browser");
5081
+ logger10.debug("Speech recognition started by browser");
5065
5082
  };
5066
5083
  this.recognition.onspeechstart = () => {
5067
- logger11.debug("Speech detected");
5084
+ logger10.debug("Speech detected");
5068
5085
  };
5069
5086
  this.recognition.onspeechend = () => {
5070
- logger11.debug("Speech ended");
5087
+ logger10.debug("Speech ended");
5071
5088
  };
5072
5089
  }
5073
5090
  /**
@@ -5078,7 +5095,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
5078
5095
  try {
5079
5096
  callback(result);
5080
5097
  } catch (error) {
5081
- logger11.error("Error in result callback", { error });
5098
+ logger10.error("Error in result callback", { error });
5082
5099
  }
5083
5100
  }
5084
5101
  }
@@ -5090,7 +5107,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
5090
5107
  try {
5091
5108
  callback(error);
5092
5109
  } catch (callbackError) {
5093
- logger11.error("Error in error callback", { error: callbackError });
5110
+ logger10.error("Error in error callback", { error: callbackError });
5094
5111
  }
5095
5112
  }
5096
5113
  }
@@ -5264,7 +5281,7 @@ var AgentCoreAdapter = class extends EventEmitter {
5264
5281
  this._sessionId = null;
5265
5282
  this._isConnected = false;
5266
5283
  // Sub-components
5267
- this.whisper = null;
5284
+ this.asr = null;
5268
5285
  this.vad = null;
5269
5286
  this.lam = null;
5270
5287
  this.pipeline = null;
@@ -5303,7 +5320,7 @@ var AgentCoreAdapter = class extends EventEmitter {
5303
5320
  try {
5304
5321
  const authToken = await this.getAuthToken(config.tenant);
5305
5322
  await Promise.all([
5306
- this.initWhisper(),
5323
+ this.initASR(),
5307
5324
  this.initLAM()
5308
5325
  ]);
5309
5326
  await this.connectWebSocket(authToken, config);
@@ -5333,7 +5350,7 @@ var AgentCoreAdapter = class extends EventEmitter {
5333
5350
  this.ws = null;
5334
5351
  }
5335
5352
  await Promise.all([
5336
- this.whisper?.dispose(),
5353
+ this.asr?.dispose(),
5337
5354
  this.vad?.dispose(),
5338
5355
  this.lam?.dispose()
5339
5356
  ]);
@@ -5465,16 +5482,15 @@ var AgentCoreAdapter = class extends EventEmitter {
5465
5482
  });
5466
5483
  return token;
5467
5484
  }
5468
- async initWhisper() {
5485
+ async initASR() {
5469
5486
  await Promise.all([
5470
- // Whisper ASR
5487
+ // SenseVoice ASR
5471
5488
  (async () => {
5472
- this.whisper = new WhisperInference({
5473
- model: "tiny",
5474
- device: "auto",
5475
- language: "en"
5489
+ this.asr = new SenseVoiceInference({
5490
+ modelUrl: "/models/sensevoice/model.int8.onnx",
5491
+ language: "auto"
5476
5492
  });
5477
- await this.whisper.load();
5493
+ await this.asr.load();
5478
5494
  })(),
5479
5495
  // Silero VAD for accurate voice activity detection
5480
5496
  (async () => {
@@ -5660,17 +5676,17 @@ var AgentCoreAdapter = class extends EventEmitter {
5660
5676
  console.debug("[AgentCore] Skipping silent audio", { rms, samples: audio.length });
5661
5677
  return;
5662
5678
  }
5663
- if (this.whisper) {
5679
+ if (this.asr) {
5664
5680
  this.setState("listening");
5665
5681
  this.emit("user.speech.start", { timestamp: Date.now() });
5666
- this.whisper.transcribe(audio).then((result) => {
5682
+ this.asr.transcribe(audio).then((result) => {
5667
5683
  this.emit("user.transcript.final", {
5668
5684
  text: result.text,
5669
5685
  confidence: 1
5670
5686
  });
5671
5687
  this.emit("user.speech.end", { timestamp: Date.now(), durationMs: result.inferenceTimeMs });
5672
5688
  const cleanText = result.text.trim();
5673
- if (cleanText && !cleanText.includes("[BLANK_AUDIO]")) {
5689
+ if (cleanText) {
5674
5690
  this.sendText(cleanText).catch((error) => {
5675
5691
  console.error("[AgentCore] Send text error:", error);
5676
5692
  });
@@ -6484,228 +6500,6 @@ var InterruptionHandler = class extends EventEmitter {
6484
6500
  }
6485
6501
  };
6486
6502
 
6487
- // src/cache/huggingFaceCDN.ts
6488
- var HF_CDN_TEST_URL = "https://huggingface.co/Xenova/whisper-tiny/resolve/main/config.json";
6489
- function parseHuggingFaceUrl(url) {
6490
- const pattern = /^https:\/\/huggingface\.co\/([^/]+)\/([^/]+)\/resolve\/([^/]+)\/(.+)$/;
6491
- const match = url.match(pattern);
6492
- if (!match) {
6493
- return null;
6494
- }
6495
- return {
6496
- org: match[1],
6497
- model: match[2],
6498
- branch: match[3],
6499
- file: match[4]
6500
- };
6501
- }
6502
- async function isHuggingFaceCDNReachable(testUrl = HF_CDN_TEST_URL) {
6503
- try {
6504
- const response = await fetch(testUrl, {
6505
- method: "HEAD",
6506
- cache: "no-store"
6507
- // Don't use cached response for reachability check
6508
- });
6509
- return response.ok;
6510
- } catch {
6511
- return false;
6512
- }
6513
- }
6514
-
6515
- // src/utils/transformersCacheClear.ts
6516
- var logger12 = createLogger("TransformersCache");
6517
- async function clearTransformersCache(options) {
6518
- const verbose = options?.verbose ?? true;
6519
- const additionalPatterns = options?.additionalPatterns ?? [];
6520
- if (!("caches" in window)) {
6521
- logger12.warn("Cache API not available in this environment");
6522
- return [];
6523
- }
6524
- try {
6525
- const cacheNames = await caches.keys();
6526
- const deletedCaches = [];
6527
- const patterns = [
6528
- "transformers",
6529
- "huggingface",
6530
- "onnx",
6531
- ...additionalPatterns
6532
- ];
6533
- for (const cacheName of cacheNames) {
6534
- const shouldDelete = patterns.some(
6535
- (pattern) => cacheName.toLowerCase().includes(pattern.toLowerCase())
6536
- );
6537
- if (shouldDelete) {
6538
- if (verbose) {
6539
- logger12.info("Deleting cache", { cacheName });
6540
- }
6541
- const deleted = await caches.delete(cacheName);
6542
- if (deleted) {
6543
- deletedCaches.push(cacheName);
6544
- } else if (verbose) {
6545
- logger12.warn("Failed to delete cache", { cacheName });
6546
- }
6547
- }
6548
- }
6549
- if (verbose) {
6550
- logger12.info("Cache clearing complete", {
6551
- totalCaches: cacheNames.length,
6552
- deletedCount: deletedCaches.length,
6553
- deletedCaches
6554
- });
6555
- }
6556
- return deletedCaches;
6557
- } catch (error) {
6558
- logger12.error("Error clearing caches", { error });
6559
- throw error;
6560
- }
6561
- }
6562
- async function clearSpecificCache(cacheName) {
6563
- if (!("caches" in window)) {
6564
- logger12.warn("Cache API not available in this environment");
6565
- return false;
6566
- }
6567
- try {
6568
- const deleted = await caches.delete(cacheName);
6569
- logger12.info("Cache deletion attempt", { cacheName, deleted });
6570
- return deleted;
6571
- } catch (error) {
6572
- logger12.error("Error deleting cache", { cacheName, error });
6573
- return false;
6574
- }
6575
- }
6576
- async function listCaches() {
6577
- if (!("caches" in window)) {
6578
- logger12.warn("Cache API not available in this environment");
6579
- return [];
6580
- }
6581
- try {
6582
- const cacheNames = await caches.keys();
6583
- logger12.debug("Available caches", { cacheNames });
6584
- return cacheNames;
6585
- } catch (error) {
6586
- logger12.error("Error listing caches", { error });
6587
- return [];
6588
- }
6589
- }
6590
- async function validateCachedResponse(cacheName, requestUrl) {
6591
- if (!("caches" in window)) {
6592
- return {
6593
- exists: false,
6594
- valid: false,
6595
- contentType: null,
6596
- isHtml: false,
6597
- reason: "Cache API not available"
6598
- };
6599
- }
6600
- try {
6601
- const cache = await caches.open(cacheName);
6602
- const response = await cache.match(requestUrl);
6603
- if (!response) {
6604
- return {
6605
- exists: false,
6606
- valid: false,
6607
- contentType: null,
6608
- isHtml: false,
6609
- reason: "Not in cache"
6610
- };
6611
- }
6612
- const contentType = response.headers.get("content-type");
6613
- const isHtml = contentType?.includes("text/html") || contentType?.includes("text/plain");
6614
- const clonedResponse = response.clone();
6615
- const text = await clonedResponse.text();
6616
- const looksLikeHtml = text.trim().startsWith("<") || text.includes("<!DOCTYPE");
6617
- const valid = Boolean(
6618
- response.status === 200 && !isHtml && !looksLikeHtml && contentType && (contentType.includes("application/json") || contentType.includes("application/octet-stream") || contentType.includes("binary"))
6619
- );
6620
- return {
6621
- exists: true,
6622
- valid,
6623
- contentType,
6624
- isHtml: isHtml || looksLikeHtml,
6625
- reason: valid ? "Valid response" : `Invalid: status=${response.status}, contentType=${contentType}, isHtml=${isHtml || looksLikeHtml}`
6626
- };
6627
- } catch (error) {
6628
- logger12.error("Error validating cached response", { cacheName, requestUrl, error });
6629
- return {
6630
- exists: false,
6631
- valid: false,
6632
- contentType: null,
6633
- isHtml: false,
6634
- reason: `Error: ${error}`
6635
- };
6636
- }
6637
- }
6638
- async function scanForInvalidCaches() {
6639
- if (!("caches" in window)) {
6640
- return { totalCaches: 0, scannedEntries: 0, invalidEntries: [] };
6641
- }
6642
- const invalidEntries = [];
6643
- let scannedEntries = 0;
6644
- try {
6645
- const cacheNames = await caches.keys();
6646
- for (const cacheName of cacheNames) {
6647
- if (!cacheName.toLowerCase().includes("transformers")) {
6648
- continue;
6649
- }
6650
- const cache = await caches.open(cacheName);
6651
- const requests = await cache.keys();
6652
- for (const request of requests) {
6653
- scannedEntries++;
6654
- const url = request.url;
6655
- const validation = await validateCachedResponse(cacheName, url);
6656
- if (validation.exists && !validation.valid) {
6657
- invalidEntries.push({
6658
- cacheName,
6659
- url,
6660
- reason: validation.reason || "Unknown"
6661
- });
6662
- }
6663
- }
6664
- }
6665
- logger12.info("Cache scan complete", {
6666
- totalCaches: cacheNames.length,
6667
- scannedEntries,
6668
- invalidCount: invalidEntries.length
6669
- });
6670
- return {
6671
- totalCaches: cacheNames.length,
6672
- scannedEntries,
6673
- invalidEntries
6674
- };
6675
- } catch (error) {
6676
- logger12.error("Error scanning caches", { error });
6677
- throw error;
6678
- }
6679
- }
6680
- async function nukeBrowserCaches(preventRecreation = false) {
6681
- if (!("caches" in window)) {
6682
- logger12.warn("Cache API not available in this environment");
6683
- return 0;
6684
- }
6685
- try {
6686
- const cacheNames = await caches.keys();
6687
- let deletedCount = 0;
6688
- for (const cacheName of cacheNames) {
6689
- const deleted = await caches.delete(cacheName);
6690
- if (deleted) {
6691
- deletedCount++;
6692
- }
6693
- }
6694
- logger12.info("All browser caches cleared", {
6695
- totalDeleted: deletedCount
6696
- });
6697
- if (preventRecreation) {
6698
- const { env } = await import("./transformers.web-T5LWC34T.mjs");
6699
- env.useBrowserCache = false;
6700
- logger12.warn("Browser cache creation disabled (env.useBrowserCache = false)");
6701
- }
6702
- return deletedCount;
6703
- } catch (error) {
6704
- logger12.error("Error nuking caches", { error });
6705
- throw error;
6706
- }
6707
- }
6708
-
6709
6503
  // src/animation/types.ts
6710
6504
  var DEFAULT_ANIMATION_CONFIG = {
6711
6505
  initialState: "idle",
@@ -7245,7 +7039,6 @@ export {
7245
7039
  EmotionPresets,
7246
7040
  EmphasisDetector,
7247
7041
  EventEmitter,
7248
- HF_CDN_TEST_URL,
7249
7042
  INFERENCE_LATENCY_BUCKETS,
7250
7043
  InterruptionHandler,
7251
7044
  LAMPipeline,
@@ -7259,6 +7052,7 @@ export {
7259
7052
  OmoteTelemetry,
7260
7053
  RingBuffer,
7261
7054
  SafariSpeechRecognition,
7055
+ SenseVoiceInference,
7262
7056
  SileroVADInference,
7263
7057
  SileroVADWorker,
7264
7058
  SyncedAudioPipeline,
@@ -7266,12 +7060,12 @@ export {
7266
7060
  WAV2ARKIT_BLENDSHAPES,
7267
7061
  Wav2ArkitCpuInference,
7268
7062
  Wav2Vec2Inference,
7269
- WhisperInference,
7063
+ applyCMVN,
7064
+ applyLFR,
7270
7065
  blendEmotions,
7271
7066
  calculatePeak,
7272
7067
  calculateRMS,
7273
- clearSpecificCache,
7274
- clearTransformersCache,
7068
+ computeKaldiFbank,
7275
7069
  configureCacheLimit,
7276
7070
  configureLogging,
7277
7071
  configureTelemetry,
@@ -7280,6 +7074,7 @@ export {
7280
7074
  createLogger,
7281
7075
  createSessionWithFallback,
7282
7076
  createSileroVAD,
7077
+ ctcGreedyDecode,
7283
7078
  fetchWithCache,
7284
7079
  formatBytes,
7285
7080
  getCacheConfig,
@@ -7296,7 +7091,6 @@ export {
7296
7091
  getTelemetry,
7297
7092
  hasWebGPUApi,
7298
7093
  isAndroid,
7299
- isHuggingFaceCDNReachable,
7300
7094
  isIOS,
7301
7095
  isIOSSafari,
7302
7096
  isMobile,
@@ -7305,16 +7099,16 @@ export {
7305
7099
  isSpeechRecognitionAvailable,
7306
7100
  isWebGPUAvailable,
7307
7101
  lerpEmotion,
7308
- listCaches,
7309
7102
  noopLogger,
7310
- nukeBrowserCaches,
7311
- parseHuggingFaceUrl,
7103
+ parseCMVNFromMetadata,
7104
+ parseTokensFile,
7312
7105
  preloadModels,
7313
7106
  preloadOnnxRuntime,
7314
7107
  remapWav2ArkitToLam,
7315
7108
  resetLoggingConfig,
7316
7109
  resolveBackend,
7317
- scanForInvalidCaches,
7110
+ resolveLanguageId,
7111
+ resolveTextNormId,
7318
7112
  setLogLevel,
7319
7113
  setLoggingEnabled,
7320
7114
  shouldEnableWasmProxy,
@@ -7322,7 +7116,6 @@ export {
7322
7116
  shouldUseNativeASR,
7323
7117
  shouldUseServerLipSync,
7324
7118
  supportsVADWorker,
7325
- symmetrizeBlendshapes,
7326
- validateCachedResponse
7119
+ symmetrizeBlendshapes
7327
7120
  };
7328
7121
  //# sourceMappingURL=index.mjs.map