@omote/core 0.3.1 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -12,11 +12,6 @@ import {
12
12
  setLogLevel,
13
13
  setLoggingEnabled
14
14
  } from "./chunk-ESU52TDS.mjs";
15
- import {
16
- __webpack_exports__env,
17
- __webpack_exports__pipeline
18
- } from "./chunk-T465MTDX.mjs";
19
- import "./chunk-6W7G6WE7.mjs";
20
15
 
21
16
  // src/audio/MicrophoneCapture.ts
22
17
  var MicrophoneCapture = class {
@@ -28,6 +23,8 @@ var MicrophoneCapture = class {
28
23
  this.buffer = new Float32Array(0);
29
24
  this._isRecording = false;
30
25
  this._loggedFirstChunk = false;
26
+ /** Actual AudioContext sample rate (may differ from target on Firefox) */
27
+ this._nativeSampleRate = 0;
31
28
  this.config = {
32
29
  sampleRate: config.sampleRate ?? 16e3,
33
30
  chunkSize: config.chunkSize ?? 1600
@@ -62,10 +59,29 @@ var MicrophoneCapture = class {
62
59
  if (this.context.state === "suspended") {
63
60
  await this.context.resume();
64
61
  }
65
- const source = this.context.createMediaStreamSource(this.stream);
62
+ let source;
63
+ try {
64
+ source = this.context.createMediaStreamSource(this.stream);
65
+ this._nativeSampleRate = this.context.sampleRate;
66
+ } catch (sourceErr) {
67
+ console.warn(
68
+ "[MicrophoneCapture] Cannot connect stream at",
69
+ this.config.sampleRate + "Hz, falling back to native rate:",
70
+ sourceErr.message
71
+ );
72
+ await this.context.close();
73
+ this.context = new AudioContext();
74
+ if (this.context.state === "suspended") {
75
+ await this.context.resume();
76
+ }
77
+ source = this.context.createMediaStreamSource(this.stream);
78
+ this._nativeSampleRate = this.context.sampleRate;
79
+ console.log("[MicrophoneCapture] Using native rate:", this._nativeSampleRate, "Hz \u2192 resampling to", this.config.sampleRate, "Hz");
80
+ }
66
81
  this.processor = this.context.createScriptProcessor(4096, 1, 1);
67
82
  this.processor.onaudioprocess = (e) => {
68
- const input = e.inputBuffer.getChannelData(0);
83
+ const raw = e.inputBuffer.getChannelData(0);
84
+ const input = this._nativeSampleRate !== this.config.sampleRate ? this.resample(raw, this._nativeSampleRate, this.config.sampleRate) : raw;
69
85
  let rms = 0;
70
86
  let peak = 0;
71
87
  for (let i = 0; i < input.length; i++) {
@@ -123,6 +139,25 @@ var MicrophoneCapture = class {
123
139
  this.buffer = new Float32Array(0);
124
140
  this._isRecording = false;
125
141
  }
142
+ /**
143
+ * Resample audio using linear interpolation.
144
+ * Used when the AudioContext runs at the device's native rate (e.g. 48kHz)
145
+ * and we need to downsample to the target rate (e.g. 16kHz).
146
+ */
147
+ resample(input, fromRate, toRate) {
148
+ if (fromRate === toRate) return input;
149
+ const ratio = fromRate / toRate;
150
+ const outputLength = Math.floor(input.length / ratio);
151
+ const output = new Float32Array(outputLength);
152
+ for (let i = 0; i < outputLength; i++) {
153
+ const srcIdx = i * ratio;
154
+ const lo = Math.floor(srcIdx);
155
+ const hi = Math.min(lo + 1, input.length - 1);
156
+ const frac = srcIdx - lo;
157
+ output[i] = input[lo] * (1 - frac) + input[hi] * frac;
158
+ }
159
+ return output;
160
+ }
126
161
  floatToPCM16(float32) {
127
162
  const pcm = new Int16Array(float32.length);
128
163
  for (let i = 0; i < float32.length; i++) {
@@ -263,7 +298,8 @@ var AudioScheduler = class {
263
298
  const ctx = await this.ensureContext();
264
299
  const channels = this.options.channels ?? 1;
265
300
  if (!this.isPlaying) {
266
- this.nextPlayTime = ctx.currentTime + (this.options.initialDelayS ?? 0.05);
301
+ const lookahead = this.options.initialLookaheadSec ?? 0.05;
302
+ this.nextPlayTime = ctx.currentTime + lookahead;
267
303
  this.isPlaying = true;
268
304
  }
269
305
  const audioBuffer = ctx.createBuffer(channels, audioData.length, ctx.sampleRate);
@@ -446,8 +482,8 @@ var AudioChunkCoalescer = class {
446
482
  var LAMPipeline = class {
447
483
  constructor(options = {}) {
448
484
  this.options = options;
449
- this.DEFAULT_CHUNK_SAMPLES = 16e3;
450
- // 1.0s at 16kHz (Wav2Vec2 requirement)
485
+ this.REQUIRED_SAMPLES = 16e3;
486
+ // 1.0s at 16kHz (LAM requirement)
451
487
  this.FRAME_RATE = 30;
452
488
  // LAM outputs 30fps
453
489
  this.buffer = new Float32Array(0);
@@ -477,20 +513,22 @@ var LAMPipeline = class {
477
513
  newBuffer.set(this.buffer, 0);
478
514
  newBuffer.set(samples, this.buffer.length);
479
515
  this.buffer = newBuffer;
480
- const chunkSize = lam.chunkSamples ?? this.DEFAULT_CHUNK_SAMPLES;
481
- while (this.buffer.length >= chunkSize) {
482
- await this.processBuffer(lam, chunkSize);
516
+ while (this.buffer.length >= this.REQUIRED_SAMPLES) {
517
+ await this.processBuffer(lam);
518
+ if (this.buffer.length >= this.REQUIRED_SAMPLES) {
519
+ await new Promise((r) => setTimeout(r, 0));
520
+ }
483
521
  }
484
522
  }
485
523
  /**
486
524
  * Process accumulated buffer through LAM inference
487
525
  */
488
- async processBuffer(lam, chunkSize) {
526
+ async processBuffer(lam) {
489
527
  try {
490
- const toProcess = this.buffer.slice(0, chunkSize);
528
+ const toProcess = this.buffer.slice(0, this.REQUIRED_SAMPLES);
491
529
  const processedStartTime = this.bufferStartTime;
492
- this.buffer = this.buffer.slice(chunkSize);
493
- const processedDuration = chunkSize / (this.options.sampleRate ?? 16e3);
530
+ this.buffer = this.buffer.slice(this.REQUIRED_SAMPLES);
531
+ const processedDuration = this.REQUIRED_SAMPLES / (this.options.sampleRate ?? 16e3);
494
532
  this.bufferStartTime = processedStartTime + processedDuration;
495
533
  const result = await lam.infer(toProcess);
496
534
  const frameDuration = 1 / this.FRAME_RATE;
@@ -509,22 +547,35 @@ var LAMPipeline = class {
509
547
  /**
510
548
  * Get the frame that should be displayed at the current time
511
549
  *
512
- * Timestamp-synced playback for all backends. Audio playback is delayed
513
- * for slow backends (WASM gets 1s head start via AudioScheduler) so
514
- * frames are ready by the time their corresponding audio plays.
550
+ * Automatically removes frames that have already been displayed.
551
+ * This prevents memory leaks from accumulating old frames.
515
552
  *
516
- * Discard window is generous for WASM to handle inference jitter.
517
- * Late frames play at RAF rate (~60fps) until caught up, then settle
518
- * to natural 30fps pacing via timestamp gating.
553
+ * Discard Window (prevents premature frame discarding):
554
+ * - WebGPU: 0.5s (LAM inference 20-100ms + RAF jitter + React stalls)
555
+ * - WASM: 1.0s (LAM inference 50-500ms + higher variability)
556
+ *
557
+ * Last-Frame-Hold: Returns last valid frame instead of null to prevent
558
+ * avatar freezing when between frames (RAF at 60fps vs LAM at 30fps).
519
559
  *
520
560
  * @param currentTime - Current AudioContext time
521
561
  * @param lam - LAM inference engine (optional, for backend detection)
522
562
  * @returns Current frame, or last frame as fallback, or null if no frames yet
523
563
  */
524
564
  getFrameForTime(currentTime, lam) {
525
- const discardWindow = lam?.backend === "wasm" ? 10 : 0.5;
565
+ const discardWindow = lam?.backend === "wasm" ? 1 : 0.5;
566
+ let discardedCount = 0;
526
567
  while (this.frameQueue.length > 0 && this.frameQueue[0].timestamp < currentTime - discardWindow) {
527
- this.frameQueue.shift();
568
+ const discarded = this.frameQueue.shift();
569
+ discardedCount++;
570
+ if (discardedCount === 1) {
571
+ const ageMs = ((currentTime - discarded.timestamp) * 1e3).toFixed(0);
572
+ console.warn("[LAM] Frame(s) discarded as too old", {
573
+ ageMs,
574
+ discardWindowMs: discardWindow * 1e3,
575
+ queueLength: this.frameQueue.length,
576
+ backend: lam?.backend ?? "unknown"
577
+ });
578
+ }
528
579
  }
529
580
  if (this.frameQueue.length > 0 && this.frameQueue[0].timestamp <= currentTime) {
530
581
  const { frame } = this.frameQueue.shift();
@@ -543,7 +594,7 @@ var LAMPipeline = class {
543
594
  * Get current buffer fill level (0-1)
544
595
  */
545
596
  get fillLevel() {
546
- return Math.min(1, this.buffer.length / this.DEFAULT_CHUNK_SAMPLES);
597
+ return Math.min(1, this.buffer.length / this.REQUIRED_SAMPLES);
547
598
  }
548
599
  /**
549
600
  * Get number of frames queued
@@ -560,7 +611,7 @@ var LAMPipeline = class {
560
611
  /**
561
612
  * Flush remaining buffered audio
562
613
  *
563
- * Processes any remaining audio in the buffer, even if less than the chunk size.
614
+ * Processes any remaining audio in the buffer, even if less than REQUIRED_SAMPLES.
564
615
  * This ensures the final audio chunk generates blendshape frames.
565
616
  *
566
617
  * Should be called when audio stream ends to prevent losing the last 0-1 seconds.
@@ -571,17 +622,12 @@ var LAMPipeline = class {
571
622
  if (this.buffer.length === 0) {
572
623
  return;
573
624
  }
625
+ const padded = new Float32Array(this.REQUIRED_SAMPLES);
626
+ padded.set(this.buffer, 0);
574
627
  const processedStartTime = this.bufferStartTime;
575
- const sampleRate = this.options.sampleRate ?? 16e3;
576
- const minSize = lam.chunkSamples ?? this.DEFAULT_CHUNK_SAMPLES;
577
- const audioToInfer = this.buffer.length >= minSize ? this.buffer : (() => {
578
- const padded = new Float32Array(minSize);
579
- padded.set(this.buffer, 0);
580
- return padded;
581
- })();
582
628
  try {
583
- const result = await lam.infer(audioToInfer);
584
- const actualDuration = this.buffer.length / sampleRate;
629
+ const result = await lam.infer(padded);
630
+ const actualDuration = this.buffer.length / (this.options.sampleRate ?? 16e3);
585
631
  const frameDuration = 1 / this.FRAME_RATE;
586
632
  const actualFrameCount = Math.ceil(actualDuration * this.FRAME_RATE);
587
633
  for (let i = 0; i < Math.min(actualFrameCount, result.blendshapes.length); i++) {
@@ -640,13 +686,12 @@ var SyncedAudioPipeline = class extends EventEmitter {
640
686
  this.monitorInterval = null;
641
687
  this.frameAnimationId = null;
642
688
  const sampleRate = options.sampleRate ?? 16e3;
643
- if (!options.lam.isLoaded) {
644
- throw new Error(
645
- "LipSyncBackend must be loaded before constructing SyncedAudioPipeline. Call lam.load() first so backend type is known for timing configuration."
646
- );
647
- }
648
- const initialDelayS = options.lam.backend === "wasm" ? 1 : 0.05;
649
- this.scheduler = new AudioScheduler({ sampleRate, initialDelayS });
689
+ const autoDelay = options.lam.modelId === "wav2arkit_cpu" ? 750 : options.lam.backend === "wasm" ? 350 : 50;
690
+ const audioDelayMs = options.audioDelayMs ?? autoDelay;
691
+ this.scheduler = new AudioScheduler({
692
+ sampleRate,
693
+ initialLookaheadSec: audioDelayMs / 1e3
694
+ });
650
695
  this.coalescer = new AudioChunkCoalescer({
651
696
  sampleRate,
652
697
  targetDurationMs: options.chunkTargetMs ?? 200
@@ -2014,9 +2059,7 @@ function formatBytes(bytes) {
2014
2059
  function isIOSSafari() {
2015
2060
  if (typeof navigator === "undefined") return false;
2016
2061
  const ua = navigator.userAgent.toLowerCase();
2017
- return /iphone|ipad|ipod/.test(ua) || // Safari on macOS could also have issues, but less severe
2018
- // Only force WASM on actual iOS devices
2019
- /safari/.test(ua) && /mobile/.test(ua) && !/chrome|crios|fxios/.test(ua);
2062
+ return /iphone|ipad|ipod/.test(ua) && /safari/.test(ua) && !/chrome|crios|fxios|chromium|edg/.test(ua);
2020
2063
  }
2021
2064
  function isIOS() {
2022
2065
  if (typeof navigator === "undefined") return false;
@@ -2074,10 +2117,7 @@ function getOptimalWasmThreads() {
2074
2117
  return 4;
2075
2118
  }
2076
2119
  function shouldEnableWasmProxy() {
2077
- if (isMobile()) {
2078
- return false;
2079
- }
2080
- return true;
2120
+ return false;
2081
2121
  }
2082
2122
  function isSafari() {
2083
2123
  if (typeof navigator === "undefined") return false;
@@ -2092,7 +2132,7 @@ function isSpeechRecognitionAvailable() {
2092
2132
  return "SpeechRecognition" in window || "webkitSpeechRecognition" in window;
2093
2133
  }
2094
2134
  function shouldUseNativeASR() {
2095
- return isIOS() && isSpeechRecognitionAvailable();
2135
+ return (isIOS() || isSafari()) && isSpeechRecognitionAvailable();
2096
2136
  }
2097
2137
  function shouldUseServerLipSync() {
2098
2138
  return isIOS();
@@ -2105,11 +2145,13 @@ var loadedBackend = null;
2105
2145
  var WASM_CDN_PATH = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
2106
2146
  async function isWebGPUAvailable() {
2107
2147
  if (isIOS()) {
2108
- logger.debug("WebGPU check: iOS detected, returning false");
2148
+ logger.debug("WebGPU check: disabled on iOS (asyncify bundle crashes WebKit)");
2109
2149
  return false;
2110
2150
  }
2111
2151
  if (!hasWebGPUApi()) {
2112
- logger.debug("WebGPU check: navigator.gpu not available");
2152
+ logger.debug("WebGPU check: navigator.gpu not available", {
2153
+ isSecureContext: typeof window !== "undefined" ? window.isSecureContext : "N/A"
2154
+ });
2113
2155
  return false;
2114
2156
  }
2115
2157
  try {
@@ -2133,14 +2175,20 @@ async function isWebGPUAvailable() {
2133
2175
  }
2134
2176
  var iosWasmPatched = false;
2135
2177
  function applyIOSWasmMemoryPatch() {
2136
- if (iosWasmPatched || !isIOS()) return;
2178
+ if (iosWasmPatched || !isIOSSafari()) return;
2137
2179
  iosWasmPatched = true;
2138
2180
  const OrigMemory = WebAssembly.Memory;
2139
- const MAX_IOS_PAGES = 16384;
2140
- logger.info("Applying iOS WASM memory patch (max capped to 1GB, shared preserved)");
2181
+ const MAX_IOS_PAGES = 32768;
2182
+ logger.info("Applying iOS WASM memory patch (max\u21922GB, shared preserved)");
2141
2183
  WebAssembly.Memory = function IOSPatchedMemory(descriptor) {
2142
2184
  const patched = { ...descriptor };
2143
2185
  if (patched.maximum !== void 0 && patched.maximum > MAX_IOS_PAGES) {
2186
+ logger.info("iOS memory patch: capping maximum", {
2187
+ original: patched.maximum,
2188
+ capped: MAX_IOS_PAGES,
2189
+ shared: patched.shared,
2190
+ initial: patched.initial
2191
+ });
2144
2192
  patched.maximum = MAX_IOS_PAGES;
2145
2193
  }
2146
2194
  return new OrigMemory(patched);
@@ -2174,7 +2222,10 @@ async function getOnnxRuntime(backend) {
2174
2222
  logger.info(`Loading ONNX Runtime with ${backend} backend...`);
2175
2223
  applyIOSWasmMemoryPatch();
2176
2224
  try {
2177
- if (backend === "wasm") {
2225
+ if (backend === "wasm" && (isIOS() || isSafari())) {
2226
+ const module = await import("onnxruntime-web/wasm");
2227
+ ortInstance = module.default || module;
2228
+ } else if (backend === "wasm") {
2178
2229
  const module = await import("onnxruntime-web");
2179
2230
  ortInstance = module.default || module;
2180
2231
  } else {
@@ -2218,6 +2269,14 @@ function getSessionOptions(backend) {
2218
2269
  graphOptimizationLevel: "all"
2219
2270
  };
2220
2271
  }
2272
+ if (isIOS()) {
2273
+ return {
2274
+ executionProviders: ["wasm"],
2275
+ graphOptimizationLevel: "basic",
2276
+ enableCpuMemArena: false,
2277
+ enableMemPattern: false
2278
+ };
2279
+ }
2221
2280
  return {
2222
2281
  executionProviders: ["wasm"],
2223
2282
  graphOptimizationLevel: "all"
@@ -2249,6 +2308,16 @@ function getLoadedBackend() {
2249
2308
  function isOnnxRuntimeLoaded() {
2250
2309
  return ortInstance !== null;
2251
2310
  }
2311
/**
 * Load the ONNX Runtime ahead of first use.
 *
 * When a runtime instance already exists, the previously loaded backend is
 * returned and `preference` is not consulted. Otherwise the runtime is
 * resolved through getOnnxRuntimeForPreference().
 *
 * @param preference - Backend preference forwarded to getOnnxRuntimeForPreference (default "auto").
 * @returns The backend reported for the loaded runtime.
 */
async function preloadOnnxRuntime(preference = "auto") {
  if (!ortInstance) {
    logger.info("Preloading ONNX Runtime...", { preference });
    const resolved = await getOnnxRuntimeForPreference(preference);
    const backend = resolved.backend;
    logger.info("ONNX Runtime preloaded", { backend });
    return backend;
  }
  logger.info("ONNX Runtime already preloaded", { backend: loadedBackend });
  return loadedBackend;
}
2252
2321
 
2253
2322
  // src/inference/blendshapeUtils.ts
2254
2323
  var LAM_BLENDSHAPES = [
@@ -2444,6 +2513,7 @@ var CTC_VOCAB = [
2444
2513
  ];
2445
2514
  var Wav2Vec2Inference = class {
2446
2515
  constructor(config) {
2516
+ this.modelId = "wav2vec2";
2447
2517
  this.session = null;
2448
2518
  this.ort = null;
2449
2519
  this._backend = "wasm";
@@ -2482,38 +2552,108 @@ var Wav2Vec2Inference = class {
2482
2552
  this.ort = ort;
2483
2553
  this._backend = backend;
2484
2554
  logger2.info("ONNX Runtime loaded", { backend: this._backend });
2485
- const cache = getModelCache();
2486
2555
  const modelUrl = this.config.modelUrl;
2487
- const isCached = await cache.has(modelUrl);
2488
- let modelBuffer;
2489
- if (isCached) {
2490
- logger2.debug("Loading model from cache", { modelUrl });
2491
- modelBuffer = await cache.get(modelUrl);
2492
- if (!modelBuffer) {
2493
- logger2.warn("Cache corruption detected, clearing and retrying", { modelUrl });
2494
- await cache.delete(modelUrl);
2495
- logger2.info("Corrupted cache entry deleted, fetching fresh model", { modelUrl });
2496
- modelBuffer = await fetchWithCache(modelUrl);
2556
+ const dataUrl = this.config.externalDataUrl !== false ? typeof this.config.externalDataUrl === "string" ? this.config.externalDataUrl : `${modelUrl}.data` : null;
2557
+ const sessionOptions = getSessionOptions(this._backend);
2558
+ let isCached = false;
2559
+ if (isIOS()) {
2560
+ logger2.info("iOS: passing model URLs directly to ORT (low-memory path)", {
2561
+ modelUrl,
2562
+ dataUrl
2563
+ });
2564
+ if (dataUrl) {
2565
+ const dataFilename = dataUrl.split("/").pop();
2566
+ logger2.info("iOS: setting externalData", { dataFilename, dataUrl });
2567
+ sessionOptions.externalData = [{
2568
+ path: dataFilename,
2569
+ data: dataUrl
2570
+ // URL string — ORT fetches directly into WASM
2571
+ }];
2497
2572
  }
2573
+ logger2.info("iOS: calling InferenceSession.create() with URL string", {
2574
+ modelUrl,
2575
+ sessionOptions: JSON.stringify(
2576
+ sessionOptions,
2577
+ (_, v) => typeof v === "string" && v.length > 100 ? v.slice(0, 100) + "..." : v
2578
+ )
2579
+ });
2580
+ try {
2581
+ this.session = await this.ort.InferenceSession.create(modelUrl, sessionOptions);
2582
+ } catch (sessionErr) {
2583
+ logger2.error("iOS: InferenceSession.create() failed", {
2584
+ error: sessionErr instanceof Error ? sessionErr.message : String(sessionErr),
2585
+ errorType: sessionErr?.constructor?.name,
2586
+ stack: sessionErr instanceof Error ? sessionErr.stack : void 0
2587
+ });
2588
+ throw sessionErr;
2589
+ }
2590
+ logger2.info("iOS: session created successfully", {
2591
+ inputNames: this.session.inputNames,
2592
+ outputNames: this.session.outputNames
2593
+ });
2498
2594
  } else {
2499
- logger2.debug("Fetching and caching model", { modelUrl });
2500
- modelBuffer = await fetchWithCache(modelUrl);
2501
- }
2502
- if (!modelBuffer) {
2503
- const errorMsg = `Failed to load model: ${modelUrl}. Model buffer is null or undefined even after retry.`;
2504
- logger2.error(errorMsg, { modelUrl, isCached });
2505
- throw new Error(errorMsg);
2595
+ const cache = getModelCache();
2596
+ isCached = await cache.has(modelUrl);
2597
+ let modelBuffer;
2598
+ if (isCached) {
2599
+ logger2.debug("Loading model from cache", { modelUrl });
2600
+ modelBuffer = await cache.get(modelUrl);
2601
+ if (!modelBuffer) {
2602
+ logger2.warn("Cache corruption detected, clearing and retrying", { modelUrl });
2603
+ await cache.delete(modelUrl);
2604
+ modelBuffer = await fetchWithCache(modelUrl);
2605
+ }
2606
+ } else {
2607
+ logger2.debug("Fetching and caching model", { modelUrl });
2608
+ modelBuffer = await fetchWithCache(modelUrl);
2609
+ }
2610
+ if (!modelBuffer) {
2611
+ throw new Error(`Failed to load model: ${modelUrl}`);
2612
+ }
2613
+ let externalDataBuffer = null;
2614
+ if (dataUrl) {
2615
+ try {
2616
+ const isDataCached = await cache.has(dataUrl);
2617
+ if (isDataCached) {
2618
+ logger2.debug("Loading external data from cache", { dataUrl });
2619
+ externalDataBuffer = await cache.get(dataUrl);
2620
+ if (!externalDataBuffer) {
2621
+ logger2.warn("Cache corruption for external data, retrying", { dataUrl });
2622
+ await cache.delete(dataUrl);
2623
+ externalDataBuffer = await fetchWithCache(dataUrl);
2624
+ }
2625
+ } else {
2626
+ logger2.info("Fetching external model data", {
2627
+ dataUrl,
2628
+ note: "This may be a large download (383MB+)"
2629
+ });
2630
+ externalDataBuffer = await fetchWithCache(dataUrl);
2631
+ }
2632
+ logger2.info("External data loaded", {
2633
+ size: formatBytes(externalDataBuffer.byteLength)
2634
+ });
2635
+ } catch (err) {
2636
+ logger2.debug("No external data file found (single-file model)", {
2637
+ dataUrl,
2638
+ error: err.message
2639
+ });
2640
+ }
2641
+ }
2642
+ logger2.debug("Creating ONNX session", {
2643
+ graphSize: formatBytes(modelBuffer.byteLength),
2644
+ externalDataSize: externalDataBuffer ? formatBytes(externalDataBuffer.byteLength) : "none",
2645
+ backend: this._backend
2646
+ });
2647
+ if (externalDataBuffer) {
2648
+ const dataFilename = dataUrl.split("/").pop();
2649
+ sessionOptions.externalData = [{
2650
+ path: dataFilename,
2651
+ data: new Uint8Array(externalDataBuffer)
2652
+ }];
2653
+ }
2654
+ const modelData = new Uint8Array(modelBuffer);
2655
+ this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
2506
2656
  }
2507
- logger2.debug("Creating ONNX session", {
2508
- size: formatBytes(modelBuffer.byteLength),
2509
- backend: this._backend
2510
- });
2511
- const sessionOptions = getSessionOptions(this._backend);
2512
- logger2.info("Creating session with execution provider", {
2513
- executionProvider: this._backend
2514
- });
2515
- const modelData = new Uint8Array(modelBuffer);
2516
- this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
2517
2657
  logger2.info("ONNX session created successfully", {
2518
2658
  executionProvider: this._backend,
2519
2659
  backend: this._backend
@@ -2528,7 +2668,7 @@ var Wav2Vec2Inference = class {
2528
2668
  span?.setAttributes({
2529
2669
  "model.backend": this._backend,
2530
2670
  "model.load_time_ms": loadTimeMs,
2531
- "model.cached": isCached
2671
+ "model.cached": !isIOS() && isCached
2532
2672
  });
2533
2673
  span?.end();
2534
2674
  telemetry?.recordHistogram("omote.model.load_time", loadTimeMs, {
@@ -2731,319 +2871,550 @@ LAM_BLENDSHAPES.forEach((name, index) => {
2731
2871
  });
2732
2872
  var UPPER_FACE_SET = new Set(UPPER_FACE_BLENDSHAPES);
2733
2873
 
2734
- // src/inference/WhisperInference.ts
2735
- var logger4 = createLogger("Whisper");
2736
- var WhisperInference = class _WhisperInference {
2737
- constructor(config = {}) {
2738
- this.pipeline = null;
2739
- this.currentModel = null;
2874
+ // src/inference/kaldiFbank.ts
2875
/**
 * In-place iterative radix-2 FFT.
 *
 * Applies a bit-reversal permutation and then butterfly passes of doubling
 * length, using the twiddle factor exp(-2*pi*i / len). `re` and `im` are
 * overwritten with the real and imaginary parts of the transform; no scaling
 * is applied.
 *
 * @param re - Real parts (overwritten with the result).
 * @param im - Imaginary parts (overwritten with the result).
 * @throws {RangeError} If the arrays differ in length or the length is not a
 *   power of two (the radix-2 passes would otherwise read out of bounds and
 *   silently produce NaN/garbage).
 */
function fft(re, im) {
  const n = re.length;
  if (im.length !== n) {
    throw new RangeError(`fft: re and im must have the same length (got ${n} and ${im.length})`);
  }
  if ((n & (n - 1)) !== 0) {
    throw new RangeError(`fft: length must be a power of two (got ${n})`);
  }
  // Bit-reversal permutation: j tracks the bit-reversed counterpart of i.
  for (let i = 1, j = 0; i < n; i++) {
    let bit = n >> 1;
    while (j & bit) {
      j ^= bit;
      bit >>= 1;
    }
    j ^= bit;
    if (i < j) {
      let tmp = re[i];
      re[i] = re[j];
      re[j] = tmp;
      tmp = im[i];
      im[i] = im[j];
      im[j] = tmp;
    }
  }
  // Butterfly passes over sub-transforms of length 2, 4, ..., n.
  for (let len = 2; len <= n; len *= 2) {
    const halfLen = len / 2;
    const angle = -2 * Math.PI / len;
    const wRe = Math.cos(angle);
    const wIm = Math.sin(angle);
    for (let i = 0; i < n; i += len) {
      // (curRe, curIm) is the running twiddle factor w^j.
      let curRe = 1;
      let curIm = 0;
      for (let j = 0; j < halfLen; j++) {
        const a = i + j;
        const b = a + halfLen;
        const tRe = curRe * re[b] - curIm * im[b];
        const tIm = curRe * im[b] + curIm * re[b];
        re[b] = re[a] - tRe;
        im[b] = im[a] - tIm;
        re[a] += tRe;
        im[a] += tIm;
        const nextRe = curRe * wRe - curIm * wIm;
        curIm = curRe * wIm + curIm * wRe;
        curRe = nextRe;
      }
    }
  }
}
2917
/**
 * Convert a frequency in Hz to the mel scale using the formula
 * mel = 1127 * ln(1 + freq / 700).
 *
 * Math.log1p is used so that very small frequencies are not rounded away
 * when added to 1.
 *
 * @param freq - Frequency in Hz.
 * @returns The corresponding mel value.
 */
function htkMel(freq) {
  return 1127 * Math.log1p(freq / 700);
}
2920
/**
 * Convert a mel value back to a frequency in Hz using the formula
 * freq = 700 * (exp(mel / 1127) - 1).
 *
 * Math.expm1 is used so that very small mel values do not cancel to zero.
 *
 * @param mel - Value on the mel scale.
 * @returns The corresponding frequency in Hz.
 */
function htkMelInverse(mel) {
  return 700 * Math.expm1(mel / 1127);
}
2923
/**
 * Build triangular mel filters over the FFT bin axis.
 *
 * numBins + 2 edge points are spaced evenly on the mel scale between
 * `lowFreq` and `highFreq` and converted to fractional FFT bin positions.
 * Filter m rises from edge m to edge m + 1 and falls to edge m + 2.
 *
 * @param numBins - Number of mel filters to build.
 * @param fftSize - FFT length used for the spectrum.
 * @param sampleRate - Sample rate in Hz.
 * @param lowFreq - Lower edge of the filterbank in Hz.
 * @param highFreq - Upper edge of the filterbank in Hz.
 * @returns Array of `{ startBin, weights }`; `weights[k]` applies to FFT bin `startBin + k`.
 */
function buildMelFilterbank(numBins, fftSize, sampleRate, lowFreq, highFreq) {
  const spectrumBins = fftSize / 2 + 1;
  const melLow = htkMel(lowFreq);
  const melHigh = htkMel(highFreq);
  // Filter edges expressed as (fractional) FFT bin indices.
  const edges = new Float64Array(numBins + 2);
  for (let p = 0; p < numBins + 2; p++) {
    const mel = melLow + (melHigh - melLow) * p / (numBins + 1);
    edges[p] = htkMelInverse(mel) * fftSize / sampleRate;
  }
  const bank = [];
  for (let m = 0; m < numBins; m++) {
    const lower = edges[m];
    const peak = edges[m + 1];
    const upper = edges[m + 2];
    const rise = peak - lower;
    const fall = upper - peak;
    const startBin = Math.max(0, Math.ceil(lower));
    const lastBin = Math.min(spectrumBins - 1, Math.floor(upper));
    const weights = new Float32Array(lastBin - startBin + 1);
    for (let bin = startBin; bin <= lastBin; bin++) {
      let w;
      if (bin <= peak) {
        // Rising slope; a zero-width slope contributes nothing.
        w = rise > 0 ? (bin - lower) / rise : 0;
      } else {
        // Falling slope.
        w = fall > 0 ? (upper - bin) / fall : 0;
      }
      weights[bin - startBin] = w;
    }
    bank.push({ startBin, weights });
  }
  return bank;
}
2954
/**
 * Create a Hamming window: w[i] = 0.54 - 0.46 * cos(2*pi*i / (length - 1)).
 *
 * A one-sample window is returned as [1]; the formula would otherwise divide
 * zero by zero and yield NaN.
 *
 * @param length - Number of window coefficients.
 * @returns Float32Array of `length` coefficients.
 */
function createHammingWindow(length) {
  const coefficients = new Float32Array(length);
  if (length === 1) {
    coefficients[0] = 1;
    return coefficients;
  }
  for (let i = 0; i < length; i++) {
    coefficients[i] = 0.54 - 0.46 * Math.cos(2 * Math.PI * i / (length - 1));
  }
  return coefficients;
}
2961
/**
 * Compute log mel filterbank features for an audio signal.
 *
 * Steps, in order: scale samples by 32768, optionally add Gaussian dither,
 * cut the signal into overlapping frames, apply pre-emphasis and a Hamming
 * window, take the FFT power spectrum, apply the mel filters and take the
 * natural log (energies are floored at 1e-10).
 *
 * Pre-emphasis on the first sample of a frame uses the sample that precedes
 * the frame in the signal; the very first sample of the signal is left as is.
 *
 * @param audio - Input samples.
 * @param sampleRate - Sample rate in Hz.
 * @param numMelBins - Number of mel bins per frame.
 * @param opts - Optional overrides: frameLengthMs (25), frameShiftMs (10),
 *   lowFreq (20), highFreq (sampleRate / 2), dither (0), preemphasis (0.97).
 * @returns Float32Array of numFrames * numMelBins values, frame after frame;
 *   empty when the signal is shorter than one frame.
 */
function computeKaldiFbank(audio, sampleRate, numMelBins, opts) {
  const options = opts ?? {};
  const frameLengthMs = options.frameLengthMs ?? 25;
  const frameShiftMs = options.frameShiftMs ?? 10;
  const lowFreq = options.lowFreq ?? 20;
  const highFreq = options.highFreq ?? sampleRate / 2;
  const dither = options.dither ?? 0;
  const preemphasis = options.preemphasis ?? 0.97;
  const frameLen = Math.round(sampleRate * frameLengthMs / 1e3);
  const frameShift = Math.round(sampleRate * frameShiftMs / 1e3);

  // Work on a copy scaled by 32768.
  const signal = new Float32Array(audio.length);
  for (let s = 0; s < audio.length; s++) {
    signal[s] = audio[s] * 32768;
  }
  if (dither > 0) {
    // Gaussian noise built from two uniform random draws per sample.
    for (let s = 0; s < signal.length; s++) {
      const u1 = Math.random();
      const u2 = Math.random();
      signal[s] += dither * Math.sqrt(-2 * Math.log(u1 + 1e-10)) * Math.cos(2 * Math.PI * u2);
    }
  }

  const frameCount = Math.max(0, Math.floor((signal.length - frameLen) / frameShift) + 1);
  if (frameCount === 0) {
    return new Float32Array(0);
  }

  // Smallest power of two that holds one frame.
  let fftSize = 1;
  while (fftSize < frameLen) {
    fftSize *= 2;
  }
  const spectrumBins = fftSize / 2 + 1;
  const taper = createHammingWindow(frameLen);
  const melFilters = buildMelFilterbank(numMelBins, fftSize, sampleRate, lowFreq, highFreq);
  const features = new Float32Array(frameCount * numMelBins);
  const re = new Float64Array(fftSize);
  const im = new Float64Array(fftSize);
  const usePreemphasis = preemphasis > 0;

  for (let frame = 0; frame < frameCount; frame++) {
    const start = frame * frameShift;
    re.fill(0);
    im.fill(0);
    for (let i = 0; i < frameLen; i++) {
      let value = signal[start + i];
      // Subtract the scaled previous sample whenever one exists in the signal.
      if (usePreemphasis && (i > 0 || start > 0)) {
        value -= preemphasis * signal[start + i - 1];
      }
      re[i] = value * taper[i];
    }
    fft(re, im);
    const row = frame * numMelBins;
    for (let m = 0; m < numMelBins; m++) {
      const { startBin, weights } = melFilters[m];
      let energy = 0;
      for (let k = 0; k < weights.length; k++) {
        const bin = startBin + k;
        if (bin < spectrumBins) {
          energy += weights[k] * (re[bin] * re[bin] + im[bin] * im[bin]);
        }
      }
      features[row + m] = Math.log(Math.max(energy, 1e-10));
    }
  }
  return features;
}
3023
/**
 * Stack neighbouring feature frames into wider, less frequent frames.
 *
 * Output frame i is the concatenation of `lfrM` input frames starting at
 * input frame `i * lfrN - floor((lfrM - 1) / 2)`; indices that fall before
 * the first or after the last input frame are clamped to that frame.
 *
 * Only complete input frames are used: trailing values that do not fill a
 * whole frame are ignored, and an empty Float32Array is returned when there
 * is no complete frame (including when `featureDim` is not a positive number).
 *
 * @param features - Flat feature buffer, one frame of `featureDim` values after another.
 * @param featureDim - Number of values per input frame.
 * @param lfrM - Number of input frames stacked into one output frame (default 7).
 * @param lfrN - Input-frame stride between output frames (default 6).
 * @returns Flat Float32Array of output frames, each `featureDim * lfrM` values long.
 */
function applyLFR(features, featureDim, lfrM = 7, lfrN = 6) {
  // Floor so a partial trailing frame cannot produce fractional offsets.
  const numFrames = Math.floor(features.length / featureDim);
  if (!Number.isFinite(numFrames) || numFrames <= 0) {
    return new Float32Array(0);
  }
  const leftPad = Math.floor((lfrM - 1) / 2);
  const paddedLen = numFrames + leftPad;
  const numOutputFrames = Math.ceil(paddedLen / lfrN);
  const outputDim = featureDim * lfrM;
  const output = new Float32Array(numOutputFrames * outputDim);
  for (let i = 0; i < numOutputFrames; i++) {
    const startFrame = i * lfrN - leftPad;
    for (let j = 0; j < lfrM; j++) {
      // Clamp to the valid frame range (repeats the first/last frame at the edges).
      const srcFrame = Math.min(Math.max(startFrame + j, 0), numFrames - 1);
      const srcOffset = srcFrame * featureDim;
      const dstOffset = i * outputDim + j * featureDim;
      for (let k = 0; k < featureDim; k++) {
        output[dstOffset + k] = features[srcOffset + k];
      }
    }
  }
  return output;
}
3046
/**
 * Normalise features in place: each value becomes
 * (value + negMean[d]) * invStddev[d], where d is the value's position
 * within its frame (index modulo `dim`).
 *
 * @param features - Flat feature buffer; modified in place.
 * @param dim - Number of values per frame.
 * @param negMean - Per-dimension offset that is added to each value.
 * @param invStddev - Per-dimension scale that each shifted value is multiplied by.
 * @returns The same `features` buffer that was passed in.
 */
function applyCMVN(features, dim, negMean, invStddev) {
  let idx = 0;
  while (idx < features.length) {
    const column = idx % dim;
    const shifted = features[idx] + negMean[column];
    features[idx] = shifted * invStddev[column];
    idx += 1;
  }
  return features;
}
3053
/**
 * Parse two comma-separated lists of numbers into Float32Arrays.
 *
 * Each entry is trimmed and read with parseFloat, so an entry that is not a
 * number becomes NaN.
 *
 * @param negMeanStr - Comma-separated negative-mean values.
 * @param invStddevStr - Comma-separated inverse-standard-deviation values.
 * @returns `{ negMean, invStddev }` as Float32Arrays.
 */
function parseCMVNFromMetadata(negMeanStr, invStddevStr) {
  const toVector = (csv) => Float32Array.from(csv.split(","), (field) => parseFloat(field.trim()));
  const negMean = toVector(negMeanStr);
  const invStddev = toVector(invStddevStr);
  return { negMean, invStddev };
}
3062
+
3063
+ // src/inference/ctcDecoder.ts
3064
/**
 * Map a language code to its numeric id.
 *
 * Known codes: auto → 0, zh → 3, en → 4, yue → 7, ja → 11, ko → 12.
 * Any other value yields 0. A Map is used for the lookup so that strings
 * naming inherited object members (e.g. "constructor", "toString") also
 * yield 0 instead of leaking a non-numeric value.
 *
 * @param language - Language code.
 * @returns The numeric language id, or 0 when the code is not known.
 */
function resolveLanguageId(language) {
  const ids = new Map([
    ["auto", 0],
    ["zh", 3],
    ["en", 4],
    ["yue", 7],
    ["ja", 11],
    ["ko", 12]
  ]);
  return ids.get(language) ?? 0;
}
3075
/**
 * Map a text-normalisation mode to its numeric id.
 *
 * @param textNorm - Normalisation mode.
 * @returns 15 for "without_itn", 14 for every other value.
 */
function resolveTextNormId(textNorm) {
  if (textNorm === "without_itn") {
    return 15;
  }
  return 14;
}
3078
/**
 * Parse a tokens file into an id → token map.
 *
 * Each non-empty line is expected to be `<token><whitespace><id>`. The line
 * is split at its last whitespace character, so the token itself may contain
 * whitespace and the separator may be a space or a tab. Lines without a
 * separator, or whose id does not parse as an integer, are skipped.
 *
 * @param content - Full text of the tokens file.
 * @returns Map from numeric id to token string.
 */
function parseTokensFile(content) {
  const map = /* @__PURE__ */ new Map();
  // Matches the last whitespace character of a trimmed line plus the id after it.
  const lastSeparator = /\s\S*$/;
  for (const line of content.split("\n")) {
    const trimmed = line.trim();
    if (!trimmed) continue;
    const sep = trimmed.search(lastSeparator);
    if (sep === -1) continue;
    const id = parseInt(trimmed.substring(sep + 1), 10);
    if (!Number.isNaN(id)) {
      map.set(id, trimmed.substring(0, sep));
    }
  }
  return map;
}
3094
/**
 * Classify a special token of the form `<|value|>`.
 *
 * @param token - Token text.
 * @returns `{ type, value }` where type is "language", "emotion", "event" or
 *   "textnorm"; null when the token is not wrapped in `<|…|>` or the wrapped
 *   value is not one of the recognised names.
 */
function parseStructuredToken(token) {
  const parts = token.match(/^<\|(.+)\|>$/);
  if (!parts) {
    return null;
  }
  const value = parts[1];
  switch (value) {
    case "zh":
    case "en":
    case "ja":
    case "ko":
    case "yue":
    case "nospeech":
      return { type: "language", value };
    case "HAPPY":
    case "SAD":
    case "ANGRY":
    case "NEUTRAL":
    case "FEARFUL":
    case "DISGUSTED":
    case "SURPRISED":
    case "EMO_UNKNOWN":
      return { type: "emotion", value };
    case "Speech":
    case "BGM":
    case "Applause":
    case "Laughter":
    case "Crying":
    case "Coughing":
    case "Sneezing":
    case "EVENT_UNKNOWN":
      return { type: "event", value };
    case "withitn":
    case "woitn":
    case "with_itn":
    case "without_itn":
      return { type: "textnorm", value };
    default:
      return null;
  }
}
3114
/**
 * Greedy CTC decoding of a logits matrix.
 *
 * For every time step the highest-scoring token id is taken, consecutive
 * repeats are collapsed, and ids 0, 1 and 2 are dropped. Remaining ids are
 * looked up in `tokenMap` (ids without a token are skipped). Tokens that
 * parseStructuredToken recognises are not added to the text: language,
 * emotion and event tokens set the corresponding result field (the last one
 * seen wins) and other recognised tokens are ignored. All other tokens are
 * concatenated; U+2581 is replaced by a space and the text is trimmed.
 *
 * @param logits - Flat scores, `vocabSize` values per time step.
 * @param seqLen - Number of time steps to decode.
 * @param vocabSize - Number of scores per time step.
 * @param tokenMap - Map from token id to token string.
 * @returns `{ text, language, emotion, event }`; the last three are undefined when absent.
 */
function ctcGreedyDecode(logits, seqLen, vocabSize, tokenMap) {
  // Arg-max per step, collapsing runs of the same id as we go.
  const collapsed = [];
  let previous = -1;
  for (let step = 0; step < seqLen; step++) {
    const base = step * vocabSize;
    let best = 0;
    let bestScore = logits[base];
    for (let v = 1; v < vocabSize; v++) {
      const score = logits[base + v];
      if (score > bestScore) {
        bestScore = score;
        best = v;
      }
    }
    if (best !== previous) {
      collapsed.push(best);
      previous = best;
    }
  }

  let language;
  let emotion;
  let event;
  const pieces = [];
  for (const id of collapsed) {
    // Ids 0, 1 and 2 never contribute to the output.
    if (id === 0 || id === 1 || id === 2) continue;
    const token = tokenMap.get(id);
    if (!token) continue;
    const structured = parseStructuredToken(token);
    if (!structured) {
      pieces.push(token);
      continue;
    }
    switch (structured.type) {
      case "language":
        language = structured.value;
        break;
      case "emotion":
        emotion = structured.value;
        break;
      case "event":
        event = structured.value;
        break;
      default:
        // Other recognised token types are dropped.
        break;
    }
  }
  const text = pieces.join("").replace(/\u2581/g, " ").trim();
  return { text, language, emotion, event };
}
3157
+
3158
+ // src/inference/SenseVoiceInference.ts
3159
+ var logger4 = createLogger("SenseVoice");
3160
/**
 * SenseVoice speech recognition running on ONNX Runtime.
 *
 * Lifecycle: construct -> `load()` -> `transcribe()` (any number of times)
 * -> `dispose()`. Concurrent `transcribe()` calls are serialized through an
 * internal promise chain so only one `session.run` is in flight at a time.
 */
var SenseVoiceInference = class {
  /**
   * @param config.modelUrl  URL of the ONNX model (required)
   * @param config.tokensUrl URL of the tokens file; defaults to `tokens.txt`
   *                         in the same directory as `modelUrl`
   * @param config.language  Language code, default "auto"
   * @param config.textNorm  Text normalization mode, default "with_itn"
   * @param config.backend   Backend preference, default "auto"
   */
  constructor(config) {
    this.session = null;
    this.ort = null;
    this._backend = "wasm";
    this.isLoading = false;
    // Tail of the serialized inference chain (see queueInference)
    this.inferenceQueue = Promise.resolve();
    // Preprocessing state (loaded once)
    this.tokenMap = null;
    this.negMean = null;
    this.invStddev = null;
    this.languageId = 0;
    this.textNormId = 14;
    const modelDir = config.modelUrl.substring(0, config.modelUrl.lastIndexOf("/"));
    const tokensUrl = config.tokensUrl ?? `${modelDir}/tokens.txt`;
    this.config = {
      modelUrl: config.modelUrl,
      tokensUrl,
      language: config.language ?? "auto",
      textNorm: config.textNorm ?? "with_itn",
      backend: config.backend ?? "auto"
    };
    this.languageId = resolveLanguageId(this.config.language);
    this.textNormId = resolveTextNormId(this.config.textNorm);
  }
  /** Backend in use, or `null` while no session is loaded. */
  get backend() {
    return this.session ? this._backend : null;
  }
  /** Whether an inference session currently exists. */
  get isLoaded() {
    return this.session !== null;
  }
  // ─── Load ───────────────────────────────────────────────────────────────
  /**
   * Load ONNX Runtime, the tokens vocabulary and the model, then create the
   * inference session.
   *
   * @param onProgress Optional `(loaded, total)` callback; forwarded to
   *                   `fetchWithCache`, or invoked once with the full size on
   *                   a cache hit. Not invoked on the iOS path.
   * @returns `{ backend, loadTimeMs, inputNames, outputNames, vocabSize }`
   * @throws Error if a load is already in progress, if a session already
   *         exists, if the tokens file cannot be fetched, or if the model
   *         cannot be obtained / the session cannot be created
   */
  async load(onProgress) {
    if (this.isLoading) {
      throw new Error("Model is already loading");
    }
    if (this.session) {
      throw new Error("Model already loaded. Call dispose() first.");
    }
    this.isLoading = true;
    const startTime = performance.now();
    const telemetry = getTelemetry();
    const span = telemetry?.startSpan("SenseVoice.load", {
      "model.url": this.config.modelUrl,
      "model.backend_requested": this.config.backend
    });
    try {
      logger4.info("Loading ONNX Runtime...", { preference: this.config.backend });
      const { ort, backend } = await getOnnxRuntimeForPreference(this.config.backend);
      this.ort = ort;
      this._backend = backend;
      logger4.info("ONNX Runtime loaded", { backend: this._backend });
      logger4.debug("Fetching tokens vocabulary", { tokensUrl: this.config.tokensUrl });
      const tokensResponse = await fetch(this.config.tokensUrl);
      if (!tokensResponse.ok) {
        throw new Error(`Failed to fetch tokens.txt: ${tokensResponse.status} ${tokensResponse.statusText}`);
      }
      const tokensText = await tokensResponse.text();
      this.tokenMap = parseTokensFile(tokensText);
      logger4.debug("Tokens loaded", { vocabSize: this.tokenMap.size });
      const sessionOptions = getSessionOptions(this._backend);
      if (this._backend === "webgpu") {
        sessionOptions.graphOptimizationLevel = "basic";
      }
      let isCached = false;
      if (isIOS()) {
        // On iOS the URL is handed to ORT instead of a downloaded buffer.
        logger4.info("iOS: passing model URL directly to ORT (low-memory path)", {
          modelUrl: this.config.modelUrl
        });
        this.session = await this.ort.InferenceSession.create(
          this.config.modelUrl,
          sessionOptions
        );
      } else {
        const cache = getModelCache();
        isCached = await cache.has(this.config.modelUrl);
        let modelBuffer;
        if (isCached) {
          logger4.debug("Loading model from cache", { modelUrl: this.config.modelUrl });
          modelBuffer = await cache.get(this.config.modelUrl);
          if (!modelBuffer) {
            // The cache reported the entry but returned nothing: drop the
            // entry and download again instead of crashing on `.byteLength`.
            logger4.warn("Cache corruption detected, clearing and retrying", {
              modelUrl: this.config.modelUrl
            });
            await cache.delete(this.config.modelUrl);
            modelBuffer = await fetchWithCache(this.config.modelUrl, onProgress);
          } else {
            onProgress?.(modelBuffer.byteLength, modelBuffer.byteLength);
          }
        } else {
          logger4.debug("Fetching and caching model", { modelUrl: this.config.modelUrl });
          modelBuffer = await fetchWithCache(this.config.modelUrl, onProgress);
        }
        if (!modelBuffer) {
          throw new Error(`Failed to load model: ${this.config.modelUrl}`);
        }
        logger4.debug("Creating ONNX session", {
          size: formatBytes(modelBuffer.byteLength),
          backend: this._backend
        });
        const modelData = new Uint8Array(modelBuffer);
        this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
      }
      // CMVN statistics are optional: when they cannot be read, loading still
      // succeeds and features are simply left un-normalized.
      try {
        const metadata = this.session.handler?.metadata;
        if (metadata?.neg_mean && metadata?.inv_stddev) {
          const cmvn = parseCMVNFromMetadata(metadata.neg_mean, metadata.inv_stddev);
          this.negMean = cmvn.negMean;
          this.invStddev = cmvn.invStddev;
          logger4.debug("CMVN loaded from model metadata", { dim: this.negMean.length });
        } else {
          logger4.warn("CMVN not found in model metadata \u2014 features will not be normalized");
        }
      } catch (cmvnErr) {
        logger4.warn("Failed to read CMVN from model metadata", { error: cmvnErr });
      }
      const loadTimeMs = performance.now() - startTime;
      logger4.info("SenseVoice model loaded", {
        backend: this._backend,
        loadTimeMs: Math.round(loadTimeMs),
        vocabSize: this.tokenMap.size,
        inputs: this.session.inputNames,
        outputs: this.session.outputNames,
        hasCMVN: this.negMean !== null
      });
      span?.setAttributes({
        "model.backend": this._backend,
        "model.load_time_ms": loadTimeMs,
        "model.cached": !isIOS() && isCached,
        "model.vocab_size": this.tokenMap.size
      });
      span?.end();
      telemetry?.recordHistogram("omote.model.load_time", loadTimeMs, {
        model: "sensevoice",
        backend: this._backend
      });
      return {
        backend: this._backend,
        loadTimeMs,
        inputNames: [...this.session.inputNames],
        outputNames: [...this.session.outputNames],
        vocabSize: this.tokenMap.size
      };
    } catch (error) {
      span?.endWithError(error instanceof Error ? error : new Error(String(error)));
      telemetry?.incrementCounter("omote.errors.total", 1, {
        model: "sensevoice",
        error_type: "load_failed"
      });
      throw error;
    } finally {
      this.isLoading = false;
    }
  }
  // ─── Transcribe ─────────────────────────────────────────────────────────
  /**
   * Transcribe audio samples to text
   *
   * @param audioSamples Float32Array of audio samples at 16kHz, [-1, 1] range
   * @returns Transcription result with text, emotion, language, and event
   * @throws Error if the model has not been loaded
   */
  async transcribe(audioSamples) {
    if (!this.session || !this.ort || !this.tokenMap) {
      throw new Error("Model not loaded. Call load() first.");
    }
    // Copy the input so the queued inference works on its own buffer.
    const audio = new Float32Array(audioSamples);
    return this.queueInference(audio);
  }
  /**
   * Append one inference job to the serialized queue.
   *
   * The job catches its own errors and reports them through the returned
   * promise, so one failed job does not reject the shared chain.
   *
   * @param audio Private copy of the audio samples
   * @returns Promise resolving to `{ text, language, emotion, event,
   *          inferenceTimeMs, preprocessTimeMs }`; for audio that yields no
   *          feature frames only `text` ("") and the timings are present
   */
  queueInference(audio) {
    return new Promise((resolve, reject) => {
      this.inferenceQueue = this.inferenceQueue.then(async () => {
        const telemetry = getTelemetry();
        const span = telemetry?.startSpan("SenseVoice.transcribe", {
          "inference.backend": this._backend,
          "inference.input_samples": audio.length
        });
        try {
          const startTime = performance.now();
          const preprocessStart = performance.now();
          const fbank = computeKaldiFbank(audio, 16e3, 80);
          const numFrames = fbank.length / 80;
          if (numFrames === 0) {
            // Close the span on this early exit too; otherwise it is never
            // ended for too-short audio.
            span?.end();
            resolve({
              text: "",
              inferenceTimeMs: performance.now() - startTime,
              preprocessTimeMs: performance.now() - preprocessStart
            });
            return;
          }
          const lfrFeatures = applyLFR(fbank, 80, 7, 6);
          // 560 values per LFR frame (80 * 7)
          const numLfrFrames = lfrFeatures.length / 560;
          if (this.negMean && this.invStddev) {
            applyCMVN(lfrFeatures, 560, this.negMean, this.invStddev);
          }
          const preprocessTimeMs = performance.now() - preprocessStart;
          const ort = this.ort;
          const feeds = {
            x: new ort.Tensor("float32", lfrFeatures, [1, numLfrFrames, 560]),
            x_length: new ort.Tensor("int32", new Int32Array([numLfrFrames]), [1]),
            language: new ort.Tensor("int32", new Int32Array([this.languageId]), [1]),
            text_norm: new ort.Tensor("int32", new Int32Array([this.textNormId]), [1])
          };
          const results = await this.session.run(feeds);
          const logitsOutput = results["logits"];
          if (!logitsOutput) {
            throw new Error('Model output missing "logits" tensor');
          }
          const logitsData = logitsOutput.data;
          const logitsDims = logitsOutput.dims;
          // dims[1] and dims[2] are used as sequence length and vocab size
          const seqLen = logitsDims[1];
          const vocabSize = logitsDims[2];
          const decoded = ctcGreedyDecode(logitsData, seqLen, vocabSize, this.tokenMap);
          const inferenceTimeMs = performance.now() - startTime;
          logger4.trace("Transcription complete", {
            text: decoded.text.substring(0, 50),
            language: decoded.language,
            emotion: decoded.emotion,
            event: decoded.event,
            preprocessTimeMs: Math.round(preprocessTimeMs * 100) / 100,
            inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
            numFrames,
            numLfrFrames
          });
          span?.setAttributes({
            "inference.duration_ms": inferenceTimeMs,
            "inference.preprocess_ms": preprocessTimeMs,
            "inference.num_frames": numFrames,
            "inference.text_length": decoded.text.length
          });
          span?.end();
          telemetry?.recordHistogram("omote.inference.latency", inferenceTimeMs, {
            model: "sensevoice",
            backend: this._backend
          });
          telemetry?.incrementCounter("omote.inference.total", 1, {
            model: "sensevoice",
            backend: this._backend,
            status: "success"
          });
          resolve({
            text: decoded.text,
            language: decoded.language,
            emotion: decoded.emotion,
            event: decoded.event,
            inferenceTimeMs,
            preprocessTimeMs
          });
        } catch (err) {
          span?.endWithError(err instanceof Error ? err : new Error(String(err)));
          telemetry?.incrementCounter("omote.inference.total", 1, {
            model: "sensevoice",
            backend: this._backend,
            status: "error"
          });
          reject(err);
        }
      });
    });
  }
  // ─── Dispose ──────────────────────────────────────────────────────────
  /**
   * Release the inference session (if any) and clear the loaded runtime,
   * vocabulary and CMVN state. `load()` may be called again afterwards.
   */
  async dispose() {
    if (this.session) {
      await this.session.release();
      this.session = null;
    }
    this.ort = null;
    this.tokenMap = null;
    this.negMean = null;
    this.invStddev = null;
  }
};
3049
3420
 
@@ -3051,18 +3422,13 @@ var WhisperInference = class _WhisperInference {
3051
3422
  var logger5 = createLogger("Wav2ArkitCpu");
3052
3423
  var Wav2ArkitCpuInference = class {
3053
3424
  constructor(config) {
3425
+ this.modelId = "wav2arkit_cpu";
3054
3426
  this.session = null;
3055
3427
  this.ort = null;
3056
3428
  this._backend = "wasm";
3057
3429
  this.isLoading = false;
3058
3430
  // Inference queue for handling concurrent calls
3059
3431
  this.inferenceQueue = Promise.resolve();
3060
- /**
3061
- * Preferred chunk size: 4000 samples (250ms at 16kHz).
3062
- * wav2arkit_cpu accepts variable-length input, so we use smaller chunks
3063
- * for lower latency on WASM (vs 16000 for Wav2Vec2's fixed requirement).
3064
- */
3065
- this.chunkSamples = 4e3;
3066
3432
  this.config = config;
3067
3433
  }
3068
3434
  get backend() {
@@ -3096,23 +3462,25 @@ var Wav2ArkitCpuInference = class {
3096
3462
  this._backend = backend;
3097
3463
  logger5.info("ONNX Runtime loaded", { backend: this._backend });
3098
3464
  const modelUrl = this.config.modelUrl;
3099
- const sessionOptions = { ...getSessionOptions(this._backend) };
3100
- let isCached = false;
3101
- if (isIOS() && this.config.modelDataUrl) {
3102
- const dataFilename = this.config.modelDataUrl.split("/").pop();
3103
- sessionOptions.externalData = [{
3104
- path: dataFilename,
3105
- data: this.config.modelDataUrl
3106
- }];
3107
- logger5.info("iOS: URL-based session creation (ORT handles fetch internally)", {
3465
+ const dataUrl = this.config.externalDataUrl !== false ? this.config.externalDataUrl || `${modelUrl}.data` : null;
3466
+ const sessionOptions = getSessionOptions(this._backend);
3467
+ if (isIOS()) {
3468
+ logger5.info("iOS: passing model URLs directly to ORT (low-memory path)", {
3108
3469
  modelUrl,
3109
- dataFile: dataFilename,
3110
- dataUrl: this.config.modelDataUrl
3470
+ dataUrl
3111
3471
  });
3472
+ if (dataUrl) {
3473
+ const dataFilename = dataUrl.split("/").pop();
3474
+ sessionOptions.externalData = [{
3475
+ path: dataFilename,
3476
+ data: dataUrl
3477
+ // URL string — ORT fetches directly into WASM
3478
+ }];
3479
+ }
3112
3480
  this.session = await this.ort.InferenceSession.create(modelUrl, sessionOptions);
3113
3481
  } else {
3114
3482
  const cache = getModelCache();
3115
- isCached = await cache.has(modelUrl);
3483
+ const isCached = await cache.has(modelUrl);
3116
3484
  let modelBuffer;
3117
3485
  if (isCached) {
3118
3486
  logger5.debug("Loading model from cache", { modelUrl });
@@ -3123,42 +3491,48 @@ var Wav2ArkitCpuInference = class {
3123
3491
  modelBuffer = await fetchWithCache(modelUrl);
3124
3492
  }
3125
3493
  } else {
3126
- logger5.debug("Fetching and caching model", { modelUrl });
3494
+ logger5.debug("Fetching and caching model graph", { modelUrl });
3127
3495
  modelBuffer = await fetchWithCache(modelUrl);
3128
3496
  }
3129
3497
  if (!modelBuffer) {
3130
3498
  throw new Error(`Failed to load model: ${modelUrl}`);
3131
3499
  }
3132
- let externalDataBuffer;
3133
- if (this.config.modelDataUrl) {
3134
- const dataUrl = this.config.modelDataUrl;
3135
- const isDataCached = await cache.has(dataUrl);
3136
- if (isDataCached) {
3137
- logger5.debug("Loading external data from cache", { dataUrl });
3138
- externalDataBuffer = await cache.get(dataUrl);
3139
- if (!externalDataBuffer) {
3140
- logger5.warn("External data cache corruption, re-fetching", { dataUrl });
3141
- await cache.delete(dataUrl);
3500
+ let externalDataBuffer = null;
3501
+ if (dataUrl) {
3502
+ try {
3503
+ const isDataCached = await cache.has(dataUrl);
3504
+ if (isDataCached) {
3505
+ logger5.debug("Loading external data from cache", { dataUrl });
3506
+ externalDataBuffer = await cache.get(dataUrl);
3507
+ if (!externalDataBuffer) {
3508
+ logger5.warn("Cache corruption for external data, retrying", { dataUrl });
3509
+ await cache.delete(dataUrl);
3510
+ externalDataBuffer = await fetchWithCache(dataUrl);
3511
+ }
3512
+ } else {
3513
+ logger5.info("Fetching external model data", {
3514
+ dataUrl,
3515
+ note: "This may be a large download (400MB+)"
3516
+ });
3142
3517
  externalDataBuffer = await fetchWithCache(dataUrl);
3143
3518
  }
3144
- } else {
3145
- logger5.info("Fetching external data (this may take a while on first load)", {
3146
- dataUrl
3519
+ logger5.info("External data loaded", {
3520
+ size: formatBytes(externalDataBuffer.byteLength)
3521
+ });
3522
+ } catch (err) {
3523
+ logger5.debug("No external data file found (single-file model)", {
3524
+ dataUrl,
3525
+ error: err.message
3147
3526
  });
3148
- externalDataBuffer = await fetchWithCache(dataUrl);
3149
3527
  }
3150
- logger5.debug("External data loaded", {
3151
- size: formatBytes(externalDataBuffer.byteLength)
3152
- });
3153
3528
  }
3154
3529
  logger5.debug("Creating ONNX session", {
3155
- size: formatBytes(modelBuffer.byteLength),
3156
- hasExternalData: !!externalDataBuffer,
3157
- externalDataSize: externalDataBuffer ? formatBytes(externalDataBuffer.byteLength) : void 0,
3530
+ graphSize: formatBytes(modelBuffer.byteLength),
3531
+ externalDataSize: externalDataBuffer ? formatBytes(externalDataBuffer.byteLength) : "none",
3158
3532
  backend: this._backend
3159
3533
  });
3160
3534
  if (externalDataBuffer) {
3161
- const dataFilename = this.config.modelDataUrl.split("/").pop();
3535
+ const dataFilename = dataUrl.split("/").pop();
3162
3536
  sessionOptions.externalData = [{
3163
3537
  path: dataFilename,
3164
3538
  data: new Uint8Array(externalDataBuffer)
@@ -3177,7 +3551,7 @@ var Wav2ArkitCpuInference = class {
3177
3551
  span?.setAttributes({
3178
3552
  "model.backend": this._backend,
3179
3553
  "model.load_time_ms": loadTimeMs,
3180
- "model.cached": isCached
3554
+ "model.cached": !isIOS()
3181
3555
  });
3182
3556
  span?.end();
3183
3557
  telemetry?.recordHistogram("omote.model.load_time", loadTimeMs, {
@@ -3258,11 +3632,11 @@ var Wav2ArkitCpuInference = class {
3258
3632
  const blendshapes = [];
3259
3633
  for (let f = 0; f < numFrames; f++) {
3260
3634
  const rawFrame = blendshapeData.slice(f * numBlendshapes, (f + 1) * numBlendshapes);
3261
- const remapped = remapWav2ArkitToLam(rawFrame);
3262
- blendshapes.push(symmetrizeBlendshapes(remapped));
3635
+ const symmetrized = symmetrizeBlendshapes(rawFrame);
3636
+ blendshapes.push(symmetrized);
3263
3637
  }
3264
3638
  logger5.trace("Inference completed", {
3265
- inferenceTimeMs: Math.round(inferenceTimeMs),
3639
+ inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
3266
3640
  numFrames,
3267
3641
  inputSamples
3268
3642
  });
@@ -3328,14 +3702,14 @@ function createLipSync(config) {
3328
3702
  });
3329
3703
  }
3330
3704
  if (useCpu) {
3331
- logger6.info("Creating Wav2ArkitCpuInference (WASM)");
3705
+ logger6.info("Creating Wav2ArkitCpuInference (404MB, WASM)");
3332
3706
  return new Wav2ArkitCpuInference({
3333
- modelUrl: config.cpuModelUrl,
3334
- modelDataUrl: config.cpuModelDataUrl
3707
+ modelUrl: config.cpuModelUrl
3335
3708
  });
3336
3709
  }
3337
3710
  const gpuInstance = new Wav2Vec2Inference({
3338
3711
  modelUrl: config.gpuModelUrl,
3712
+ externalDataUrl: config.gpuExternalDataUrl,
3339
3713
  backend: config.gpuBackend ?? "auto",
3340
3714
  numIdentityClasses: config.numIdentityClasses
3341
3715
  });
@@ -3352,15 +3726,15 @@ var LipSyncWithFallback = class {
3352
3726
  this.implementation = gpuInstance;
3353
3727
  this.config = config;
3354
3728
  }
3729
+ get modelId() {
3730
+ return this.implementation.modelId;
3731
+ }
3355
3732
  get backend() {
3356
3733
  return this.implementation.backend;
3357
3734
  }
3358
3735
  get isLoaded() {
3359
3736
  return this.implementation.isLoaded;
3360
3737
  }
3361
- get chunkSamples() {
3362
- return this.implementation.chunkSamples;
3363
- }
3364
3738
  async load() {
3365
3739
  try {
3366
3740
  return await this.implementation.load();
@@ -3373,8 +3747,7 @@ var LipSyncWithFallback = class {
3373
3747
  } catch {
3374
3748
  }
3375
3749
  this.implementation = new Wav2ArkitCpuInference({
3376
- modelUrl: this.config.cpuModelUrl,
3377
- modelDataUrl: this.config.cpuModelDataUrl
3750
+ modelUrl: this.config.cpuModelUrl
3378
3751
  });
3379
3752
  this.hasFallenBack = true;
3380
3753
  logger6.info("Fallback to Wav2ArkitCpuInference successful");
@@ -3404,6 +3777,8 @@ var SileroVADInference = class {
3404
3777
  // Pre-speech buffer for capturing beginning of speech
3405
3778
  this.preSpeechBuffer = [];
3406
3779
  this.wasSpeaking = false;
3780
+ // Cached sample rate tensor (int64 scalar, never changes per instance)
3781
+ this.srTensor = null;
3407
3782
  const sampleRate = config.sampleRate ?? 16e3;
3408
3783
  if (sampleRate !== 8e3 && sampleRate !== 16e3) {
3409
3784
  throw new Error("Silero VAD only supports 8000 or 16000 Hz sample rates");
@@ -3534,6 +3909,24 @@ var SileroVADInference = class {
3534
3909
  this.context = new Float32Array(this.contextSize);
3535
3910
  this.preSpeechBuffer = [];
3536
3911
  this.wasSpeaking = false;
3912
+ if (!this.srTensor) {
3913
+ try {
3914
+ this.srTensor = new this.ort.Tensor(
3915
+ "int64",
3916
+ new BigInt64Array([BigInt(this.config.sampleRate)]),
3917
+ []
3918
+ );
3919
+ } catch (e) {
3920
+ logger7.warn("BigInt64Array not available, using bigint array fallback", {
3921
+ error: e instanceof Error ? e.message : String(e)
3922
+ });
3923
+ this.srTensor = new this.ort.Tensor(
3924
+ "int64",
3925
+ [BigInt(this.config.sampleRate)],
3926
+ []
3927
+ );
3928
+ }
3929
+ }
3537
3930
  }
3538
3931
  /**
3539
3932
  * Process a single audio chunk
@@ -3665,20 +4058,7 @@ var SileroVADInference = class {
3665
4058
  inputBuffer.set(audioChunkCopy, this.contextSize);
3666
4059
  const inputBufferCopy = new Float32Array(inputBuffer);
3667
4060
  const inputTensor = new this.ort.Tensor("float32", inputBufferCopy, [1, inputSize]);
3668
- let srTensor;
3669
- try {
3670
- srTensor = new this.ort.Tensor(
3671
- "int64",
3672
- new BigInt64Array([BigInt(this.config.sampleRate)]),
3673
- []
3674
- );
3675
- } catch {
3676
- srTensor = new this.ort.Tensor(
3677
- "int64",
3678
- [BigInt(this.config.sampleRate)],
3679
- []
3680
- );
3681
- }
4061
+ const srTensor = this.srTensor;
3682
4062
  const stateCopy = new Float32Array(this.state.data);
3683
4063
  const stateTensor = new this.ort.Tensor("float32", stateCopy, this.state.dims);
3684
4064
  const feeds = {
@@ -3767,6 +4147,7 @@ var SileroVADInference = class {
3767
4147
  this.session = null;
3768
4148
  }
3769
4149
  this.state = null;
4150
+ this.srTensor = null;
3770
4151
  }
3771
4152
  };
3772
4153
  /**
@@ -4429,268 +4810,8 @@ var VADWorkerWithFallback = class {
4429
4810
  }
4430
4811
  };
4431
4812
 
4432
- // src/inference/Emotion2VecInference.ts
4433
- var logger10 = createLogger("Emotion2Vec");
4434
- var EMOTION2VEC_LABELS = ["neutral", "happy", "angry", "sad"];
4435
- var Emotion2VecInference = class {
4436
- constructor(config) {
4437
- this.session = null;
4438
- this.ort = null;
4439
- this._backend = "wasm";
4440
- this.isLoading = false;
4441
- this.inferenceQueue = Promise.resolve();
4442
- this.config = {
4443
- modelUrl: config.modelUrl,
4444
- backend: config.backend ?? "auto",
4445
- sampleRate: config.sampleRate ?? 16e3
4446
- };
4447
- }
4448
- get backend() {
4449
- return this.session ? this._backend : null;
4450
- }
4451
- get isLoaded() {
4452
- return this.session !== null;
4453
- }
4454
- get sampleRate() {
4455
- return this.config.sampleRate;
4456
- }
4457
- /**
4458
- * Load the ONNX model
4459
- */
4460
- async load() {
4461
- if (this.isLoading) {
4462
- throw new Error("Model is already loading");
4463
- }
4464
- if (this.session) {
4465
- throw new Error("Model already loaded. Call dispose() first.");
4466
- }
4467
- this.isLoading = true;
4468
- const startTime = performance.now();
4469
- const telemetry = getTelemetry();
4470
- const span = telemetry?.startSpan("Emotion2Vec.load", {
4471
- "model.url": this.config.modelUrl,
4472
- "model.backend_requested": this.config.backend
4473
- });
4474
- try {
4475
- logger10.info("Loading ONNX Runtime...", { preference: this.config.backend });
4476
- const { ort, backend } = await getOnnxRuntimeForPreference(this.config.backend);
4477
- this.ort = ort;
4478
- this._backend = backend;
4479
- logger10.info("ONNX Runtime loaded", { backend: this._backend });
4480
- logger10.info("Checking model cache...");
4481
- const cache = getModelCache();
4482
- const modelUrl = this.config.modelUrl;
4483
- const isCached = await cache.has(modelUrl);
4484
- logger10.info("Cache check complete", { modelUrl, isCached });
4485
- let modelBuffer;
4486
- if (isCached) {
4487
- logger10.info("Loading model from cache...", { modelUrl });
4488
- modelBuffer = await cache.get(modelUrl);
4489
- logger10.info("Model loaded from cache", { size: formatBytes(modelBuffer.byteLength) });
4490
- } else {
4491
- logger10.info("Fetching model (not cached)...", { modelUrl });
4492
- modelBuffer = await fetchWithCache(modelUrl);
4493
- logger10.info("Model fetched and cached", { size: formatBytes(modelBuffer.byteLength) });
4494
- }
4495
- logger10.info("Creating ONNX session (this may take a while for large models)...");
4496
- logger10.debug("Creating ONNX session", {
4497
- size: formatBytes(modelBuffer.byteLength),
4498
- backend: this._backend
4499
- });
4500
- const sessionOptions = getSessionOptions(this._backend);
4501
- const modelData = new Uint8Array(modelBuffer);
4502
- this.session = await ort.InferenceSession.create(modelData, sessionOptions);
4503
- const loadTimeMs = performance.now() - startTime;
4504
- logger10.info("Model loaded successfully", {
4505
- backend: this._backend,
4506
- loadTimeMs: Math.round(loadTimeMs),
4507
- sampleRate: this.config.sampleRate,
4508
- inputNames: [...this.session.inputNames],
4509
- outputNames: [...this.session.outputNames]
4510
- });
4511
- span?.setAttributes({
4512
- "model.backend": this._backend,
4513
- "model.load_time_ms": loadTimeMs,
4514
- "model.cached": isCached
4515
- });
4516
- span?.end();
4517
- telemetry?.recordHistogram("omote.model.load_time", loadTimeMs, {
4518
- model: "emotion2vec",
4519
- backend: this._backend
4520
- });
4521
- return {
4522
- backend: this._backend,
4523
- loadTimeMs,
4524
- inputNames: [...this.session.inputNames],
4525
- outputNames: [...this.session.outputNames],
4526
- sampleRate: this.config.sampleRate
4527
- };
4528
- } catch (error) {
4529
- span?.endWithError(error instanceof Error ? error : new Error(String(error)));
4530
- telemetry?.incrementCounter("omote.errors.total", 1, {
4531
- model: "emotion2vec",
4532
- error_type: "load_failed"
4533
- });
4534
- throw error;
4535
- } finally {
4536
- this.isLoading = false;
4537
- }
4538
- }
4539
- /**
4540
- * Run emotion inference on audio samples
4541
- *
4542
- * @param audio - Float32Array of 16kHz audio samples
4543
- * @returns Frame-level emotion results at 50Hz
4544
- */
4545
- async infer(audio) {
4546
- if (!this.session) {
4547
- throw new Error("Model not loaded. Call load() first.");
4548
- }
4549
- return this.queueInference(audio);
4550
- }
4551
- queueInference(audio) {
4552
- const audioCopy = new Float32Array(audio);
4553
- return new Promise((resolve, reject) => {
4554
- this.inferenceQueue = this.inferenceQueue.then(async () => {
4555
- const telemetry = getTelemetry();
4556
- const span = telemetry?.startSpan("Emotion2Vec.infer", {
4557
- "inference.backend": this._backend,
4558
- "inference.audio_samples": audioCopy.length
4559
- });
4560
- try {
4561
- const startTime = performance.now();
4562
- const inputTensor = new this.ort.Tensor("float32", audioCopy, [1, audioCopy.length]);
4563
- const results = await this.session.run({ audio: inputTensor });
4564
- const logitsTensor = results["logits"];
4565
- const embeddingsTensor = results["layer_norm_25"];
4566
- if (!logitsTensor) {
4567
- throw new Error(
4568
- `Missing logits tensor from SUPERB model. Got outputs: ${Object.keys(results).join(", ")}`
4569
- );
4570
- }
4571
- const logitsData = logitsTensor.data;
4572
- const logits = new Float32Array(logitsData);
4573
- const probs = this.softmax(logits);
4574
- const probabilities = {
4575
- neutral: probs[0],
4576
- happy: probs[1],
4577
- angry: probs[2],
4578
- sad: probs[3]
4579
- };
4580
- let maxIdx = 0;
4581
- let maxProb = probs[0];
4582
- for (let i = 1; i < probs.length; i++) {
4583
- if (probs[i] > maxProb) {
4584
- maxProb = probs[i];
4585
- maxIdx = i;
4586
- }
4587
- }
4588
- const dominant = {
4589
- emotion: EMOTION2VEC_LABELS[maxIdx],
4590
- confidence: maxProb,
4591
- probabilities
4592
- };
4593
- let embeddings = [];
4594
- let numFrames = 1;
4595
- if (embeddingsTensor) {
4596
- const embeddingData = embeddingsTensor.data;
4597
- const dims = embeddingsTensor.dims;
4598
- if (dims.length === 3) {
4599
- numFrames = dims[1];
4600
- const embeddingDim = dims[2];
4601
- for (let i = 0; i < numFrames; i++) {
4602
- const start = i * embeddingDim;
4603
- embeddings.push(new Float32Array(embeddingData.slice(start, start + embeddingDim)));
4604
- }
4605
- }
4606
- }
4607
- const frames = [];
4608
- for (let i = 0; i < numFrames; i++) {
4609
- frames.push({
4610
- emotion: dominant.emotion,
4611
- confidence: dominant.confidence,
4612
- probabilities: { ...probabilities }
4613
- });
4614
- }
4615
- const inferenceTimeMs = performance.now() - startTime;
4616
- logger10.debug("Emotion inference completed", {
4617
- numFrames,
4618
- dominant: dominant.emotion,
4619
- confidence: Math.round(dominant.confidence * 100),
4620
- inferenceTimeMs: Math.round(inferenceTimeMs)
4621
- });
4622
- span?.setAttributes({
4623
- "inference.duration_ms": inferenceTimeMs,
4624
- "inference.num_frames": numFrames,
4625
- "inference.dominant_emotion": dominant.emotion
4626
- });
4627
- span?.end();
4628
- telemetry?.recordHistogram("omote.inference.latency", inferenceTimeMs, {
4629
- model: "emotion2vec",
4630
- backend: this._backend
4631
- });
4632
- telemetry?.incrementCounter("omote.inference.total", 1, {
4633
- model: "emotion2vec",
4634
- backend: this._backend,
4635
- status: "success"
4636
- });
4637
- resolve({
4638
- frames,
4639
- dominant,
4640
- embeddings,
4641
- logits,
4642
- inferenceTimeMs
4643
- });
4644
- } catch (err) {
4645
- span?.endWithError(err instanceof Error ? err : new Error(String(err)));
4646
- telemetry?.incrementCounter("omote.inference.total", 1, {
4647
- model: "emotion2vec",
4648
- backend: this._backend,
4649
- status: "error"
4650
- });
4651
- reject(err);
4652
- }
4653
- });
4654
- });
4655
- }
4656
- /**
4657
- * Apply softmax to convert logits to probabilities
4658
- */
4659
- softmax(logits) {
4660
- let max = logits[0];
4661
- for (let i = 1; i < logits.length; i++) {
4662
- if (logits[i] > max) max = logits[i];
4663
- }
4664
- const exp = new Float32Array(logits.length);
4665
- let sum = 0;
4666
- for (let i = 0; i < logits.length; i++) {
4667
- exp[i] = Math.exp(logits[i] - max);
4668
- sum += exp[i];
4669
- }
4670
- const probs = new Float32Array(logits.length);
4671
- for (let i = 0; i < logits.length; i++) {
4672
- probs[i] = exp[i] / sum;
4673
- }
4674
- return probs;
4675
- }
4676
- /**
4677
- * Dispose of the model and free resources
4678
- */
4679
- async dispose() {
4680
- if (this.session) {
4681
- await this.session.release();
4682
- this.session = null;
4683
- }
4684
- }
4685
- };
4686
- /**
4687
- * Check if WebGPU is available and working
4688
- * (iOS returns false even if navigator.gpu exists due to ONNX Runtime bugs)
4689
- */
4690
- Emotion2VecInference.isWebGPUAvailable = isWebGPUAvailable;
4691
-
4692
4813
  // src/inference/SafariSpeechRecognition.ts
4693
- var logger11 = createLogger("SafariSpeech");
4814
+ var logger10 = createLogger("SafariSpeech");
4694
4815
  var SafariSpeechRecognition = class _SafariSpeechRecognition {
4695
4816
  constructor(config = {}) {
4696
4817
  this.recognition = null;
@@ -4709,7 +4830,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
4709
4830
  interimResults: config.interimResults ?? true,
4710
4831
  maxAlternatives: config.maxAlternatives ?? 1
4711
4832
  };
4712
- logger11.debug("SafariSpeechRecognition created", {
4833
+ logger10.debug("SafariSpeechRecognition created", {
4713
4834
  language: this.config.language,
4714
4835
  continuous: this.config.continuous
4715
4836
  });
@@ -4770,7 +4891,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
4770
4891
  */
4771
4892
  async start() {
4772
4893
  if (this.isListening) {
4773
- logger11.warn("Already listening");
4894
+ logger10.warn("Already listening");
4774
4895
  return;
4775
4896
  }
4776
4897
  if (!_SafariSpeechRecognition.isAvailable()) {
@@ -4800,7 +4921,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
4800
4921
  this.isListening = true;
4801
4922
  this.startTime = performance.now();
4802
4923
  this.accumulatedText = "";
4803
- logger11.info("Speech recognition started", {
4924
+ logger10.info("Speech recognition started", {
4804
4925
  language: this.config.language
4805
4926
  });
4806
4927
  span?.end();
@@ -4815,7 +4936,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
4815
4936
  */
4816
4937
  async stop() {
4817
4938
  if (!this.isListening || !this.recognition) {
4818
- logger11.warn("Not currently listening");
4939
+ logger10.warn("Not currently listening");
4819
4940
  return {
4820
4941
  text: this.accumulatedText,
4821
4942
  language: this.config.language,
@@ -4844,7 +4965,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
4844
4965
  if (this.recognition && this.isListening) {
4845
4966
  this.recognition.abort();
4846
4967
  this.isListening = false;
4847
- logger11.info("Speech recognition aborted");
4968
+ logger10.info("Speech recognition aborted");
4848
4969
  }
4849
4970
  }
4850
4971
  /**
@@ -4875,7 +4996,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
4875
4996
  this.isListening = false;
4876
4997
  this.resultCallbacks = [];
4877
4998
  this.errorCallbacks = [];
4878
- logger11.debug("SafariSpeechRecognition disposed");
4999
+ logger10.debug("SafariSpeechRecognition disposed");
4879
5000
  }
4880
5001
  /**
4881
5002
  * Set up event handlers for the recognition instance
@@ -4903,7 +5024,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
4903
5024
  confidence: alternative.confidence
4904
5025
  };
4905
5026
  this.emitResult(speechResult);
4906
- logger11.trace("Speech result", {
5027
+ logger10.trace("Speech result", {
4907
5028
  text: text.substring(0, 50),
4908
5029
  isFinal,
4909
5030
  confidence: alternative.confidence
@@ -4913,12 +5034,12 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
4913
5034
  span?.end();
4914
5035
  } catch (error) {
4915
5036
  span?.endWithError(error instanceof Error ? error : new Error(String(error)));
4916
- logger11.error("Error processing speech result", { error });
5037
+ logger10.error("Error processing speech result", { error });
4917
5038
  }
4918
5039
  };
4919
5040
  this.recognition.onerror = (event) => {
4920
5041
  const error = new Error(`Speech recognition error: ${event.error} - ${event.message}`);
4921
- logger11.error("Speech recognition error", { error: event.error, message: event.message });
5042
+ logger10.error("Speech recognition error", { error: event.error, message: event.message });
4922
5043
  this.emitError(error);
4923
5044
  if (this.stopRejecter) {
4924
5045
  this.stopRejecter(error);
@@ -4928,7 +5049,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
4928
5049
  };
4929
5050
  this.recognition.onend = () => {
4930
5051
  this.isListening = false;
4931
- logger11.info("Speech recognition ended", {
5052
+ logger10.info("Speech recognition ended", {
4932
5053
  totalText: this.accumulatedText.length,
4933
5054
  durationMs: performance.now() - this.startTime
4934
5055
  });
@@ -4945,13 +5066,13 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
4945
5066
  }
4946
5067
  };
4947
5068
  this.recognition.onstart = () => {
4948
- logger11.debug("Speech recognition started by browser");
5069
+ logger10.debug("Speech recognition started by browser");
4949
5070
  };
4950
5071
  this.recognition.onspeechstart = () => {
4951
- logger11.debug("Speech detected");
5072
+ logger10.debug("Speech detected");
4952
5073
  };
4953
5074
  this.recognition.onspeechend = () => {
4954
- logger11.debug("Speech ended");
5075
+ logger10.debug("Speech ended");
4955
5076
  };
4956
5077
  }
4957
5078
  /**
@@ -4962,7 +5083,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
4962
5083
  try {
4963
5084
  callback(result);
4964
5085
  } catch (error) {
4965
- logger11.error("Error in result callback", { error });
5086
+ logger10.error("Error in result callback", { error });
4966
5087
  }
4967
5088
  }
4968
5089
  }
@@ -4974,7 +5095,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
4974
5095
  try {
4975
5096
  callback(error);
4976
5097
  } catch (callbackError) {
4977
- logger11.error("Error in error callback", { error: callbackError });
5098
+ logger10.error("Error in error callback", { error: callbackError });
4978
5099
  }
4979
5100
  }
4980
5101
  }
@@ -5148,7 +5269,7 @@ var AgentCoreAdapter = class extends EventEmitter {
5148
5269
  this._sessionId = null;
5149
5270
  this._isConnected = false;
5150
5271
  // Sub-components
5151
- this.whisper = null;
5272
+ this.asr = null;
5152
5273
  this.vad = null;
5153
5274
  this.lam = null;
5154
5275
  this.pipeline = null;
@@ -5187,7 +5308,7 @@ var AgentCoreAdapter = class extends EventEmitter {
5187
5308
  try {
5188
5309
  const authToken = await this.getAuthToken(config.tenant);
5189
5310
  await Promise.all([
5190
- this.initWhisper(),
5311
+ this.initASR(),
5191
5312
  this.initLAM()
5192
5313
  ]);
5193
5314
  await this.connectWebSocket(authToken, config);
@@ -5217,7 +5338,7 @@ var AgentCoreAdapter = class extends EventEmitter {
5217
5338
  this.ws = null;
5218
5339
  }
5219
5340
  await Promise.all([
5220
- this.whisper?.dispose(),
5341
+ this.asr?.dispose(),
5221
5342
  this.vad?.dispose(),
5222
5343
  this.lam?.dispose()
5223
5344
  ]);
@@ -5349,16 +5470,15 @@ var AgentCoreAdapter = class extends EventEmitter {
5349
5470
  });
5350
5471
  return token;
5351
5472
  }
5352
- async initWhisper() {
5473
+ async initASR() {
5353
5474
  await Promise.all([
5354
- // Whisper ASR
5475
+ // SenseVoice ASR
5355
5476
  (async () => {
5356
- this.whisper = new WhisperInference({
5357
- model: "tiny",
5358
- device: "auto",
5359
- language: "en"
5477
+ this.asr = new SenseVoiceInference({
5478
+ modelUrl: "/models/sensevoice/model.int8.onnx",
5479
+ language: "auto"
5360
5480
  });
5361
- await this.whisper.load();
5481
+ await this.asr.load();
5362
5482
  })(),
5363
5483
  // Silero VAD for accurate voice activity detection
5364
5484
  (async () => {
@@ -5544,17 +5664,17 @@ var AgentCoreAdapter = class extends EventEmitter {
5544
5664
  console.debug("[AgentCore] Skipping silent audio", { rms, samples: audio.length });
5545
5665
  return;
5546
5666
  }
5547
- if (this.whisper) {
5667
+ if (this.asr) {
5548
5668
  this.setState("listening");
5549
5669
  this.emit("user.speech.start", { timestamp: Date.now() });
5550
- this.whisper.transcribe(audio).then((result) => {
5670
+ this.asr.transcribe(audio).then((result) => {
5551
5671
  this.emit("user.transcript.final", {
5552
5672
  text: result.text,
5553
5673
  confidence: 1
5554
5674
  });
5555
5675
  this.emit("user.speech.end", { timestamp: Date.now(), durationMs: result.inferenceTimeMs });
5556
5676
  const cleanText = result.text.trim();
5557
- if (cleanText && !cleanText.includes("[BLANK_AUDIO]")) {
5677
+ if (cleanText) {
5558
5678
  this.sendText(cleanText).catch((error) => {
5559
5679
  console.error("[AgentCore] Send text error:", error);
5560
5680
  });
@@ -6368,228 +6488,6 @@ var InterruptionHandler = class extends EventEmitter {
6368
6488
  }
6369
6489
  };
6370
6490
 
6371
- // src/cache/huggingFaceCDN.ts
6372
- var HF_CDN_TEST_URL = "https://huggingface.co/Xenova/whisper-tiny/resolve/main/config.json";
6373
- function parseHuggingFaceUrl(url) {
6374
- const pattern = /^https:\/\/huggingface\.co\/([^/]+)\/([^/]+)\/resolve\/([^/]+)\/(.+)$/;
6375
- const match = url.match(pattern);
6376
- if (!match) {
6377
- return null;
6378
- }
6379
- return {
6380
- org: match[1],
6381
- model: match[2],
6382
- branch: match[3],
6383
- file: match[4]
6384
- };
6385
- }
6386
- async function isHuggingFaceCDNReachable(testUrl = HF_CDN_TEST_URL) {
6387
- try {
6388
- const response = await fetch(testUrl, {
6389
- method: "HEAD",
6390
- cache: "no-store"
6391
- // Don't use cached response for reachability check
6392
- });
6393
- return response.ok;
6394
- } catch {
6395
- return false;
6396
- }
6397
- }
6398
-
6399
- // src/utils/transformersCacheClear.ts
6400
- var logger12 = createLogger("TransformersCache");
6401
- async function clearTransformersCache(options) {
6402
- const verbose = options?.verbose ?? true;
6403
- const additionalPatterns = options?.additionalPatterns ?? [];
6404
- if (!("caches" in window)) {
6405
- logger12.warn("Cache API not available in this environment");
6406
- return [];
6407
- }
6408
- try {
6409
- const cacheNames = await caches.keys();
6410
- const deletedCaches = [];
6411
- const patterns = [
6412
- "transformers",
6413
- "huggingface",
6414
- "onnx",
6415
- ...additionalPatterns
6416
- ];
6417
- for (const cacheName of cacheNames) {
6418
- const shouldDelete = patterns.some(
6419
- (pattern) => cacheName.toLowerCase().includes(pattern.toLowerCase())
6420
- );
6421
- if (shouldDelete) {
6422
- if (verbose) {
6423
- logger12.info("Deleting cache", { cacheName });
6424
- }
6425
- const deleted = await caches.delete(cacheName);
6426
- if (deleted) {
6427
- deletedCaches.push(cacheName);
6428
- } else if (verbose) {
6429
- logger12.warn("Failed to delete cache", { cacheName });
6430
- }
6431
- }
6432
- }
6433
- if (verbose) {
6434
- logger12.info("Cache clearing complete", {
6435
- totalCaches: cacheNames.length,
6436
- deletedCount: deletedCaches.length,
6437
- deletedCaches
6438
- });
6439
- }
6440
- return deletedCaches;
6441
- } catch (error) {
6442
- logger12.error("Error clearing caches", { error });
6443
- throw error;
6444
- }
6445
- }
6446
- async function clearSpecificCache(cacheName) {
6447
- if (!("caches" in window)) {
6448
- logger12.warn("Cache API not available in this environment");
6449
- return false;
6450
- }
6451
- try {
6452
- const deleted = await caches.delete(cacheName);
6453
- logger12.info("Cache deletion attempt", { cacheName, deleted });
6454
- return deleted;
6455
- } catch (error) {
6456
- logger12.error("Error deleting cache", { cacheName, error });
6457
- return false;
6458
- }
6459
- }
6460
- async function listCaches() {
6461
- if (!("caches" in window)) {
6462
- logger12.warn("Cache API not available in this environment");
6463
- return [];
6464
- }
6465
- try {
6466
- const cacheNames = await caches.keys();
6467
- logger12.debug("Available caches", { cacheNames });
6468
- return cacheNames;
6469
- } catch (error) {
6470
- logger12.error("Error listing caches", { error });
6471
- return [];
6472
- }
6473
- }
6474
- async function validateCachedResponse(cacheName, requestUrl) {
6475
- if (!("caches" in window)) {
6476
- return {
6477
- exists: false,
6478
- valid: false,
6479
- contentType: null,
6480
- isHtml: false,
6481
- reason: "Cache API not available"
6482
- };
6483
- }
6484
- try {
6485
- const cache = await caches.open(cacheName);
6486
- const response = await cache.match(requestUrl);
6487
- if (!response) {
6488
- return {
6489
- exists: false,
6490
- valid: false,
6491
- contentType: null,
6492
- isHtml: false,
6493
- reason: "Not in cache"
6494
- };
6495
- }
6496
- const contentType = response.headers.get("content-type");
6497
- const isHtml = contentType?.includes("text/html") || contentType?.includes("text/plain");
6498
- const clonedResponse = response.clone();
6499
- const text = await clonedResponse.text();
6500
- const looksLikeHtml = text.trim().startsWith("<") || text.includes("<!DOCTYPE");
6501
- const valid = Boolean(
6502
- response.status === 200 && !isHtml && !looksLikeHtml && contentType && (contentType.includes("application/json") || contentType.includes("application/octet-stream") || contentType.includes("binary"))
6503
- );
6504
- return {
6505
- exists: true,
6506
- valid,
6507
- contentType,
6508
- isHtml: isHtml || looksLikeHtml,
6509
- reason: valid ? "Valid response" : `Invalid: status=${response.status}, contentType=${contentType}, isHtml=${isHtml || looksLikeHtml}`
6510
- };
6511
- } catch (error) {
6512
- logger12.error("Error validating cached response", { cacheName, requestUrl, error });
6513
- return {
6514
- exists: false,
6515
- valid: false,
6516
- contentType: null,
6517
- isHtml: false,
6518
- reason: `Error: ${error}`
6519
- };
6520
- }
6521
- }
6522
- async function scanForInvalidCaches() {
6523
- if (!("caches" in window)) {
6524
- return { totalCaches: 0, scannedEntries: 0, invalidEntries: [] };
6525
- }
6526
- const invalidEntries = [];
6527
- let scannedEntries = 0;
6528
- try {
6529
- const cacheNames = await caches.keys();
6530
- for (const cacheName of cacheNames) {
6531
- if (!cacheName.toLowerCase().includes("transformers")) {
6532
- continue;
6533
- }
6534
- const cache = await caches.open(cacheName);
6535
- const requests = await cache.keys();
6536
- for (const request of requests) {
6537
- scannedEntries++;
6538
- const url = request.url;
6539
- const validation = await validateCachedResponse(cacheName, url);
6540
- if (validation.exists && !validation.valid) {
6541
- invalidEntries.push({
6542
- cacheName,
6543
- url,
6544
- reason: validation.reason || "Unknown"
6545
- });
6546
- }
6547
- }
6548
- }
6549
- logger12.info("Cache scan complete", {
6550
- totalCaches: cacheNames.length,
6551
- scannedEntries,
6552
- invalidCount: invalidEntries.length
6553
- });
6554
- return {
6555
- totalCaches: cacheNames.length,
6556
- scannedEntries,
6557
- invalidEntries
6558
- };
6559
- } catch (error) {
6560
- logger12.error("Error scanning caches", { error });
6561
- throw error;
6562
- }
6563
- }
6564
- async function nukeBrowserCaches(preventRecreation = false) {
6565
- if (!("caches" in window)) {
6566
- logger12.warn("Cache API not available in this environment");
6567
- return 0;
6568
- }
6569
- try {
6570
- const cacheNames = await caches.keys();
6571
- let deletedCount = 0;
6572
- for (const cacheName of cacheNames) {
6573
- const deleted = await caches.delete(cacheName);
6574
- if (deleted) {
6575
- deletedCount++;
6576
- }
6577
- }
6578
- logger12.info("All browser caches cleared", {
6579
- totalDeleted: deletedCount
6580
- });
6581
- if (preventRecreation) {
6582
- const { env } = await import("./transformers.web-MHLR33H6.mjs");
6583
- env.useBrowserCache = false;
6584
- logger12.warn("Browser cache creation disabled (env.useBrowserCache = false)");
6585
- }
6586
- return deletedCount;
6587
- } catch (error) {
6588
- logger12.error("Error nuking caches", { error });
6589
- throw error;
6590
- }
6591
- }
6592
-
6593
6491
  // src/animation/types.ts
6594
6492
  var DEFAULT_ANIMATION_CONFIG = {
6595
6493
  initialState: "idle",
@@ -7129,7 +7027,6 @@ export {
7129
7027
  EmotionPresets,
7130
7028
  EmphasisDetector,
7131
7029
  EventEmitter,
7132
- HF_CDN_TEST_URL,
7133
7030
  INFERENCE_LATENCY_BUCKETS,
7134
7031
  InterruptionHandler,
7135
7032
  LAMPipeline,
@@ -7143,6 +7040,7 @@ export {
7143
7040
  OmoteTelemetry,
7144
7041
  RingBuffer,
7145
7042
  SafariSpeechRecognition,
7043
+ SenseVoiceInference,
7146
7044
  SileroVADInference,
7147
7045
  SileroVADWorker,
7148
7046
  SyncedAudioPipeline,
@@ -7150,12 +7048,12 @@ export {
7150
7048
  WAV2ARKIT_BLENDSHAPES,
7151
7049
  Wav2ArkitCpuInference,
7152
7050
  Wav2Vec2Inference,
7153
- WhisperInference,
7051
+ applyCMVN,
7052
+ applyLFR,
7154
7053
  blendEmotions,
7155
7054
  calculatePeak,
7156
7055
  calculateRMS,
7157
- clearSpecificCache,
7158
- clearTransformersCache,
7056
+ computeKaldiFbank,
7159
7057
  configureCacheLimit,
7160
7058
  configureLogging,
7161
7059
  configureTelemetry,
@@ -7164,6 +7062,7 @@ export {
7164
7062
  createLogger,
7165
7063
  createSessionWithFallback,
7166
7064
  createSileroVAD,
7065
+ ctcGreedyDecode,
7167
7066
  fetchWithCache,
7168
7067
  formatBytes,
7169
7068
  getCacheConfig,
@@ -7180,7 +7079,6 @@ export {
7180
7079
  getTelemetry,
7181
7080
  hasWebGPUApi,
7182
7081
  isAndroid,
7183
- isHuggingFaceCDNReachable,
7184
7082
  isIOS,
7185
7083
  isIOSSafari,
7186
7084
  isMobile,
@@ -7189,15 +7087,16 @@ export {
7189
7087
  isSpeechRecognitionAvailable,
7190
7088
  isWebGPUAvailable,
7191
7089
  lerpEmotion,
7192
- listCaches,
7193
7090
  noopLogger,
7194
- nukeBrowserCaches,
7195
- parseHuggingFaceUrl,
7091
+ parseCMVNFromMetadata,
7092
+ parseTokensFile,
7196
7093
  preloadModels,
7094
+ preloadOnnxRuntime,
7197
7095
  remapWav2ArkitToLam,
7198
7096
  resetLoggingConfig,
7199
7097
  resolveBackend,
7200
- scanForInvalidCaches,
7098
+ resolveLanguageId,
7099
+ resolveTextNormId,
7201
7100
  setLogLevel,
7202
7101
  setLoggingEnabled,
7203
7102
  shouldEnableWasmProxy,
@@ -7205,7 +7104,6 @@ export {
7205
7104
  shouldUseNativeASR,
7206
7105
  shouldUseServerLipSync,
7207
7106
  supportsVADWorker,
7208
- symmetrizeBlendshapes,
7209
- validateCachedResponse
7107
+ symmetrizeBlendshapes
7210
7108
  };
7211
7109
  //# sourceMappingURL=index.mjs.map