@omote/core 0.3.1 → 0.3.25

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -15,8 +15,8 @@ import {
15
15
  import {
16
16
  __webpack_exports__env,
17
17
  __webpack_exports__pipeline
18
- } from "./chunk-T465MTDX.mjs";
19
- import "./chunk-6W7G6WE7.mjs";
18
+ } from "./chunk-B6TIE56N.mjs";
19
+ import "./chunk-NSSMTXJJ.mjs";
20
20
 
21
21
  // src/audio/MicrophoneCapture.ts
22
22
  var MicrophoneCapture = class {
@@ -28,6 +28,8 @@ var MicrophoneCapture = class {
28
28
  this.buffer = new Float32Array(0);
29
29
  this._isRecording = false;
30
30
  this._loggedFirstChunk = false;
31
+ /** Actual AudioContext sample rate (may differ from target on Firefox) */
32
+ this._nativeSampleRate = 0;
31
33
  this.config = {
32
34
  sampleRate: config.sampleRate ?? 16e3,
33
35
  chunkSize: config.chunkSize ?? 1600
@@ -62,10 +64,29 @@ var MicrophoneCapture = class {
62
64
  if (this.context.state === "suspended") {
63
65
  await this.context.resume();
64
66
  }
65
- const source = this.context.createMediaStreamSource(this.stream);
67
+ let source;
68
+ try {
69
+ source = this.context.createMediaStreamSource(this.stream);
70
+ this._nativeSampleRate = this.context.sampleRate;
71
+ } catch (sourceErr) {
72
+ console.warn(
73
+ "[MicrophoneCapture] Cannot connect stream at",
74
+ this.config.sampleRate + "Hz, falling back to native rate:",
75
+ sourceErr.message
76
+ );
77
+ await this.context.close();
78
+ this.context = new AudioContext();
79
+ if (this.context.state === "suspended") {
80
+ await this.context.resume();
81
+ }
82
+ source = this.context.createMediaStreamSource(this.stream);
83
+ this._nativeSampleRate = this.context.sampleRate;
84
+ console.log("[MicrophoneCapture] Using native rate:", this._nativeSampleRate, "Hz \u2192 resampling to", this.config.sampleRate, "Hz");
85
+ }
66
86
  this.processor = this.context.createScriptProcessor(4096, 1, 1);
67
87
  this.processor.onaudioprocess = (e) => {
68
- const input = e.inputBuffer.getChannelData(0);
88
+ const raw = e.inputBuffer.getChannelData(0);
89
+ const input = this._nativeSampleRate !== this.config.sampleRate ? this.resample(raw, this._nativeSampleRate, this.config.sampleRate) : raw;
69
90
  let rms = 0;
70
91
  let peak = 0;
71
92
  for (let i = 0; i < input.length; i++) {
@@ -123,6 +144,25 @@ var MicrophoneCapture = class {
123
144
  this.buffer = new Float32Array(0);
124
145
  this._isRecording = false;
125
146
  }
147
+ /**
148
+ * Resample audio using linear interpolation.
149
+ * Used when the AudioContext runs at the device's native rate (e.g. 48kHz)
150
+ * and we need to downsample to the target rate (e.g. 16kHz).
151
+ */
152
+ resample(input, fromRate, toRate) {
153
+ if (fromRate === toRate) return input;
154
+ const ratio = fromRate / toRate;
155
+ const outputLength = Math.floor(input.length / ratio);
156
+ const output = new Float32Array(outputLength);
157
+ for (let i = 0; i < outputLength; i++) {
158
+ const srcIdx = i * ratio;
159
+ const lo = Math.floor(srcIdx);
160
+ const hi = Math.min(lo + 1, input.length - 1);
161
+ const frac = srcIdx - lo;
162
+ output[i] = input[lo] * (1 - frac) + input[hi] * frac;
163
+ }
164
+ return output;
165
+ }
126
166
  floatToPCM16(float32) {
127
167
  const pcm = new Int16Array(float32.length);
128
168
  for (let i = 0; i < float32.length; i++) {
@@ -263,7 +303,8 @@ var AudioScheduler = class {
263
303
  const ctx = await this.ensureContext();
264
304
  const channels = this.options.channels ?? 1;
265
305
  if (!this.isPlaying) {
266
- this.nextPlayTime = ctx.currentTime + (this.options.initialDelayS ?? 0.05);
306
+ const lookahead = this.options.initialLookaheadSec ?? 0.05;
307
+ this.nextPlayTime = ctx.currentTime + lookahead;
267
308
  this.isPlaying = true;
268
309
  }
269
310
  const audioBuffer = ctx.createBuffer(channels, audioData.length, ctx.sampleRate);
@@ -446,8 +487,8 @@ var AudioChunkCoalescer = class {
446
487
  var LAMPipeline = class {
447
488
  constructor(options = {}) {
448
489
  this.options = options;
449
- this.DEFAULT_CHUNK_SAMPLES = 16e3;
450
- // 1.0s at 16kHz (Wav2Vec2 requirement)
490
+ this.REQUIRED_SAMPLES = 16e3;
491
+ // 1.0s at 16kHz (LAM requirement)
451
492
  this.FRAME_RATE = 30;
452
493
  // LAM outputs 30fps
453
494
  this.buffer = new Float32Array(0);
@@ -477,20 +518,22 @@ var LAMPipeline = class {
477
518
  newBuffer.set(this.buffer, 0);
478
519
  newBuffer.set(samples, this.buffer.length);
479
520
  this.buffer = newBuffer;
480
- const chunkSize = lam.chunkSamples ?? this.DEFAULT_CHUNK_SAMPLES;
481
- while (this.buffer.length >= chunkSize) {
482
- await this.processBuffer(lam, chunkSize);
521
+ while (this.buffer.length >= this.REQUIRED_SAMPLES) {
522
+ await this.processBuffer(lam);
523
+ if (this.buffer.length >= this.REQUIRED_SAMPLES) {
524
+ await new Promise((r) => setTimeout(r, 0));
525
+ }
483
526
  }
484
527
  }
485
528
  /**
486
529
  * Process accumulated buffer through LAM inference
487
530
  */
488
- async processBuffer(lam, chunkSize) {
531
+ async processBuffer(lam) {
489
532
  try {
490
- const toProcess = this.buffer.slice(0, chunkSize);
533
+ const toProcess = this.buffer.slice(0, this.REQUIRED_SAMPLES);
491
534
  const processedStartTime = this.bufferStartTime;
492
- this.buffer = this.buffer.slice(chunkSize);
493
- const processedDuration = chunkSize / (this.options.sampleRate ?? 16e3);
535
+ this.buffer = this.buffer.slice(this.REQUIRED_SAMPLES);
536
+ const processedDuration = this.REQUIRED_SAMPLES / (this.options.sampleRate ?? 16e3);
494
537
  this.bufferStartTime = processedStartTime + processedDuration;
495
538
  const result = await lam.infer(toProcess);
496
539
  const frameDuration = 1 / this.FRAME_RATE;
@@ -509,22 +552,35 @@ var LAMPipeline = class {
509
552
  /**
510
553
  * Get the frame that should be displayed at the current time
511
554
  *
512
- * Timestamp-synced playback for all backends. Audio playback is delayed
513
- * for slow backends (WASM gets 1s head start via AudioScheduler) so
514
- * frames are ready by the time their corresponding audio plays.
555
+ * Automatically removes frames that have already been displayed.
556
+ * This prevents memory leaks from accumulating old frames.
557
+ *
558
+ * Discard Window (prevents premature frame discarding):
559
+ * - WebGPU: 0.5s (LAM inference 20-100ms + RAF jitter + React stalls)
560
+ * - WASM: 1.0s (LAM inference 50-500ms + higher variability)
515
561
  *
516
- * Discard window is generous for WASM to handle inference jitter.
517
- * Late frames play at RAF rate (~60fps) until caught up, then settle
518
- * to natural 30fps pacing via timestamp gating.
562
+ * Last-Frame-Hold: Returns last valid frame instead of null to prevent
563
+ * avatar freezing when between frames (RAF at 60fps vs LAM at 30fps).
519
564
  *
520
565
  * @param currentTime - Current AudioContext time
521
566
  * @param lam - LAM inference engine (optional, for backend detection)
522
567
  * @returns Current frame, or last frame as fallback, or null if no frames yet
523
568
  */
524
569
  getFrameForTime(currentTime, lam) {
525
- const discardWindow = lam?.backend === "wasm" ? 10 : 0.5;
570
+ const discardWindow = lam?.backend === "wasm" ? 1 : 0.5;
571
+ let discardedCount = 0;
526
572
  while (this.frameQueue.length > 0 && this.frameQueue[0].timestamp < currentTime - discardWindow) {
527
- this.frameQueue.shift();
573
+ const discarded = this.frameQueue.shift();
574
+ discardedCount++;
575
+ if (discardedCount === 1) {
576
+ const ageMs = ((currentTime - discarded.timestamp) * 1e3).toFixed(0);
577
+ console.warn("[LAM] Frame(s) discarded as too old", {
578
+ ageMs,
579
+ discardWindowMs: discardWindow * 1e3,
580
+ queueLength: this.frameQueue.length,
581
+ backend: lam?.backend ?? "unknown"
582
+ });
583
+ }
528
584
  }
529
585
  if (this.frameQueue.length > 0 && this.frameQueue[0].timestamp <= currentTime) {
530
586
  const { frame } = this.frameQueue.shift();
@@ -543,7 +599,7 @@ var LAMPipeline = class {
543
599
  * Get current buffer fill level (0-1)
544
600
  */
545
601
  get fillLevel() {
546
- return Math.min(1, this.buffer.length / this.DEFAULT_CHUNK_SAMPLES);
602
+ return Math.min(1, this.buffer.length / this.REQUIRED_SAMPLES);
547
603
  }
548
604
  /**
549
605
  * Get number of frames queued
@@ -560,7 +616,7 @@ var LAMPipeline = class {
560
616
  /**
561
617
  * Flush remaining buffered audio
562
618
  *
563
- * Processes any remaining audio in the buffer, even if less than the chunk size.
619
+ * Processes any remaining audio in the buffer, even if less than REQUIRED_SAMPLES.
564
620
  * This ensures the final audio chunk generates blendshape frames.
565
621
  *
566
622
  * Should be called when audio stream ends to prevent losing the last 0-1 seconds.
@@ -571,17 +627,12 @@ var LAMPipeline = class {
571
627
  if (this.buffer.length === 0) {
572
628
  return;
573
629
  }
630
+ const padded = new Float32Array(this.REQUIRED_SAMPLES);
631
+ padded.set(this.buffer, 0);
574
632
  const processedStartTime = this.bufferStartTime;
575
- const sampleRate = this.options.sampleRate ?? 16e3;
576
- const minSize = lam.chunkSamples ?? this.DEFAULT_CHUNK_SAMPLES;
577
- const audioToInfer = this.buffer.length >= minSize ? this.buffer : (() => {
578
- const padded = new Float32Array(minSize);
579
- padded.set(this.buffer, 0);
580
- return padded;
581
- })();
582
633
  try {
583
- const result = await lam.infer(audioToInfer);
584
- const actualDuration = this.buffer.length / sampleRate;
634
+ const result = await lam.infer(padded);
635
+ const actualDuration = this.buffer.length / (this.options.sampleRate ?? 16e3);
585
636
  const frameDuration = 1 / this.FRAME_RATE;
586
637
  const actualFrameCount = Math.ceil(actualDuration * this.FRAME_RATE);
587
638
  for (let i = 0; i < Math.min(actualFrameCount, result.blendshapes.length); i++) {
@@ -640,13 +691,12 @@ var SyncedAudioPipeline = class extends EventEmitter {
640
691
  this.monitorInterval = null;
641
692
  this.frameAnimationId = null;
642
693
  const sampleRate = options.sampleRate ?? 16e3;
643
- if (!options.lam.isLoaded) {
644
- throw new Error(
645
- "LipSyncBackend must be loaded before constructing SyncedAudioPipeline. Call lam.load() first so backend type is known for timing configuration."
646
- );
647
- }
648
- const initialDelayS = options.lam.backend === "wasm" ? 1 : 0.05;
649
- this.scheduler = new AudioScheduler({ sampleRate, initialDelayS });
694
+ const autoDelay = options.lam.modelId === "wav2arkit_cpu" ? 750 : options.lam.backend === "wasm" ? 350 : 50;
695
+ const audioDelayMs = options.audioDelayMs ?? autoDelay;
696
+ this.scheduler = new AudioScheduler({
697
+ sampleRate,
698
+ initialLookaheadSec: audioDelayMs / 1e3
699
+ });
650
700
  this.coalescer = new AudioChunkCoalescer({
651
701
  sampleRate,
652
702
  targetDurationMs: options.chunkTargetMs ?? 200
@@ -2014,9 +2064,7 @@ function formatBytes(bytes) {
2014
2064
  function isIOSSafari() {
2015
2065
  if (typeof navigator === "undefined") return false;
2016
2066
  const ua = navigator.userAgent.toLowerCase();
2017
- return /iphone|ipad|ipod/.test(ua) || // Safari on macOS could also have issues, but less severe
2018
- // Only force WASM on actual iOS devices
2019
- /safari/.test(ua) && /mobile/.test(ua) && !/chrome|crios|fxios/.test(ua);
2067
+ return /iphone|ipad|ipod/.test(ua) && /safari/.test(ua) && !/chrome|crios|fxios|chromium|edg/.test(ua);
2020
2068
  }
2021
2069
  function isIOS() {
2022
2070
  if (typeof navigator === "undefined") return false;
@@ -2074,10 +2122,7 @@ function getOptimalWasmThreads() {
2074
2122
  return 4;
2075
2123
  }
2076
2124
  function shouldEnableWasmProxy() {
2077
- if (isMobile()) {
2078
- return false;
2079
- }
2080
- return true;
2125
+ return false;
2081
2126
  }
2082
2127
  function isSafari() {
2083
2128
  if (typeof navigator === "undefined") return false;
@@ -2092,7 +2137,7 @@ function isSpeechRecognitionAvailable() {
2092
2137
  return "SpeechRecognition" in window || "webkitSpeechRecognition" in window;
2093
2138
  }
2094
2139
  function shouldUseNativeASR() {
2095
- return isIOS() && isSpeechRecognitionAvailable();
2140
+ return (isIOS() || isSafari()) && isSpeechRecognitionAvailable();
2096
2141
  }
2097
2142
  function shouldUseServerLipSync() {
2098
2143
  return isIOS();
@@ -2105,11 +2150,13 @@ var loadedBackend = null;
2105
2150
  var WASM_CDN_PATH = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
2106
2151
  async function isWebGPUAvailable() {
2107
2152
  if (isIOS()) {
2108
- logger.debug("WebGPU check: iOS detected, returning false");
2153
+ logger.debug("WebGPU check: disabled on iOS (asyncify bundle crashes WebKit)");
2109
2154
  return false;
2110
2155
  }
2111
2156
  if (!hasWebGPUApi()) {
2112
- logger.debug("WebGPU check: navigator.gpu not available");
2157
+ logger.debug("WebGPU check: navigator.gpu not available", {
2158
+ isSecureContext: typeof window !== "undefined" ? window.isSecureContext : "N/A"
2159
+ });
2113
2160
  return false;
2114
2161
  }
2115
2162
  try {
@@ -2133,14 +2180,20 @@ async function isWebGPUAvailable() {
2133
2180
  }
2134
2181
  var iosWasmPatched = false;
2135
2182
  function applyIOSWasmMemoryPatch() {
2136
- if (iosWasmPatched || !isIOS()) return;
2183
+ if (iosWasmPatched || !isIOSSafari()) return;
2137
2184
  iosWasmPatched = true;
2138
2185
  const OrigMemory = WebAssembly.Memory;
2139
- const MAX_IOS_PAGES = 16384;
2140
- logger.info("Applying iOS WASM memory patch (max capped to 1GB, shared preserved)");
2186
+ const MAX_IOS_PAGES = 32768;
2187
+ logger.info("Applying iOS WASM memory patch (max\u21922GB, shared preserved)");
2141
2188
  WebAssembly.Memory = function IOSPatchedMemory(descriptor) {
2142
2189
  const patched = { ...descriptor };
2143
2190
  if (patched.maximum !== void 0 && patched.maximum > MAX_IOS_PAGES) {
2191
+ logger.info("iOS memory patch: capping maximum", {
2192
+ original: patched.maximum,
2193
+ capped: MAX_IOS_PAGES,
2194
+ shared: patched.shared,
2195
+ initial: patched.initial
2196
+ });
2144
2197
  patched.maximum = MAX_IOS_PAGES;
2145
2198
  }
2146
2199
  return new OrigMemory(patched);
@@ -2174,7 +2227,10 @@ async function getOnnxRuntime(backend) {
2174
2227
  logger.info(`Loading ONNX Runtime with ${backend} backend...`);
2175
2228
  applyIOSWasmMemoryPatch();
2176
2229
  try {
2177
- if (backend === "wasm") {
2230
+ if (backend === "wasm" && (isIOS() || isSafari())) {
2231
+ const module = await import("onnxruntime-web/wasm");
2232
+ ortInstance = module.default || module;
2233
+ } else if (backend === "wasm") {
2178
2234
  const module = await import("onnxruntime-web");
2179
2235
  ortInstance = module.default || module;
2180
2236
  } else {
@@ -2249,6 +2305,16 @@ function getLoadedBackend() {
2249
2305
  function isOnnxRuntimeLoaded() {
2250
2306
  return ortInstance !== null;
2251
2307
  }
2308
+ async function preloadOnnxRuntime(preference = "auto") {
2309
+ if (ortInstance) {
2310
+ logger.info("ONNX Runtime already preloaded", { backend: loadedBackend });
2311
+ return loadedBackend;
2312
+ }
2313
+ logger.info("Preloading ONNX Runtime...", { preference });
2314
+ const { backend } = await getOnnxRuntimeForPreference(preference);
2315
+ logger.info("ONNX Runtime preloaded", { backend });
2316
+ return backend;
2317
+ }
2252
2318
 
2253
2319
  // src/inference/blendshapeUtils.ts
2254
2320
  var LAM_BLENDSHAPES = [
@@ -2444,6 +2510,7 @@ var CTC_VOCAB = [
2444
2510
  ];
2445
2511
  var Wav2Vec2Inference = class {
2446
2512
  constructor(config) {
2513
+ this.modelId = "wav2vec2";
2447
2514
  this.session = null;
2448
2515
  this.ort = null;
2449
2516
  this._backend = "wasm";
@@ -2504,13 +2571,52 @@ var Wav2Vec2Inference = class {
2504
2571
  logger2.error(errorMsg, { modelUrl, isCached });
2505
2572
  throw new Error(errorMsg);
2506
2573
  }
2574
+ let externalDataBuffer = null;
2575
+ if (this.config.externalDataUrl !== false) {
2576
+ const dataUrl = typeof this.config.externalDataUrl === "string" ? this.config.externalDataUrl : `${modelUrl}.data`;
2577
+ try {
2578
+ const isDataCached = await cache.has(dataUrl);
2579
+ if (isDataCached) {
2580
+ logger2.debug("Loading external data from cache", { dataUrl });
2581
+ externalDataBuffer = await cache.get(dataUrl);
2582
+ if (!externalDataBuffer) {
2583
+ logger2.warn("Cache corruption for external data, retrying", { dataUrl });
2584
+ await cache.delete(dataUrl);
2585
+ externalDataBuffer = await fetchWithCache(dataUrl);
2586
+ }
2587
+ } else {
2588
+ logger2.info("Fetching external model data", {
2589
+ dataUrl,
2590
+ note: "This may be a large download (383MB+)"
2591
+ });
2592
+ externalDataBuffer = await fetchWithCache(dataUrl);
2593
+ }
2594
+ logger2.info("External data loaded", {
2595
+ size: formatBytes(externalDataBuffer.byteLength)
2596
+ });
2597
+ } catch (err) {
2598
+ logger2.debug("No external data file found (single-file model)", {
2599
+ dataUrl,
2600
+ error: err.message
2601
+ });
2602
+ }
2603
+ }
2507
2604
  logger2.debug("Creating ONNX session", {
2508
- size: formatBytes(modelBuffer.byteLength),
2605
+ graphSize: formatBytes(modelBuffer.byteLength),
2606
+ externalDataSize: externalDataBuffer ? formatBytes(externalDataBuffer.byteLength) : "none",
2509
2607
  backend: this._backend
2510
2608
  });
2511
2609
  const sessionOptions = getSessionOptions(this._backend);
2610
+ if (externalDataBuffer) {
2611
+ const dataFilename = (typeof this.config.externalDataUrl === "string" ? this.config.externalDataUrl : `${modelUrl}.data`).split("/").pop();
2612
+ sessionOptions.externalData = [{
2613
+ path: dataFilename,
2614
+ data: new Uint8Array(externalDataBuffer)
2615
+ }];
2616
+ }
2512
2617
  logger2.info("Creating session with execution provider", {
2513
- executionProvider: this._backend
2618
+ executionProvider: this._backend,
2619
+ hasExternalData: !!externalDataBuffer
2514
2620
  });
2515
2621
  const modelData = new Uint8Array(modelBuffer);
2516
2622
  this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
@@ -2756,7 +2862,7 @@ var WhisperInference = class _WhisperInference {
2756
2862
  * Check if WebGPU is available in this browser
2757
2863
  */
2758
2864
  static async isWebGPUAvailable() {
2759
- return isWebGPUAvailable();
2865
+ return "gpu" in navigator;
2760
2866
  }
2761
2867
  /**
2762
2868
  * Load the Whisper model pipeline
@@ -3051,18 +3157,13 @@ var WhisperInference = class _WhisperInference {
3051
3157
  var logger5 = createLogger("Wav2ArkitCpu");
3052
3158
  var Wav2ArkitCpuInference = class {
3053
3159
  constructor(config) {
3160
+ this.modelId = "wav2arkit_cpu";
3054
3161
  this.session = null;
3055
3162
  this.ort = null;
3056
3163
  this._backend = "wasm";
3057
3164
  this.isLoading = false;
3058
3165
  // Inference queue for handling concurrent calls
3059
3166
  this.inferenceQueue = Promise.resolve();
3060
- /**
3061
- * Preferred chunk size: 4000 samples (250ms at 16kHz).
3062
- * wav2arkit_cpu accepts variable-length input, so we use smaller chunks
3063
- * for lower latency on WASM (vs 16000 for Wav2Vec2's fixed requirement).
3064
- */
3065
- this.chunkSamples = 4e3;
3066
3167
  this.config = config;
3067
3168
  }
3068
3169
  get backend() {
@@ -3096,23 +3197,25 @@ var Wav2ArkitCpuInference = class {
3096
3197
  this._backend = backend;
3097
3198
  logger5.info("ONNX Runtime loaded", { backend: this._backend });
3098
3199
  const modelUrl = this.config.modelUrl;
3099
- const sessionOptions = { ...getSessionOptions(this._backend) };
3100
- let isCached = false;
3101
- if (isIOS() && this.config.modelDataUrl) {
3102
- const dataFilename = this.config.modelDataUrl.split("/").pop();
3103
- sessionOptions.externalData = [{
3104
- path: dataFilename,
3105
- data: this.config.modelDataUrl
3106
- }];
3107
- logger5.info("iOS: URL-based session creation (ORT handles fetch internally)", {
3200
+ const dataUrl = this.config.externalDataUrl !== false ? this.config.externalDataUrl || `${modelUrl}.data` : null;
3201
+ const sessionOptions = getSessionOptions(this._backend);
3202
+ if (isIOS()) {
3203
+ logger5.info("iOS: passing model URLs directly to ORT (low-memory path)", {
3108
3204
  modelUrl,
3109
- dataFile: dataFilename,
3110
- dataUrl: this.config.modelDataUrl
3205
+ dataUrl
3111
3206
  });
3207
+ if (dataUrl) {
3208
+ const dataFilename = dataUrl.split("/").pop();
3209
+ sessionOptions.externalData = [{
3210
+ path: dataFilename,
3211
+ data: dataUrl
3212
+ // URL string — ORT fetches directly into WASM
3213
+ }];
3214
+ }
3112
3215
  this.session = await this.ort.InferenceSession.create(modelUrl, sessionOptions);
3113
3216
  } else {
3114
3217
  const cache = getModelCache();
3115
- isCached = await cache.has(modelUrl);
3218
+ const isCached = await cache.has(modelUrl);
3116
3219
  let modelBuffer;
3117
3220
  if (isCached) {
3118
3221
  logger5.debug("Loading model from cache", { modelUrl });
@@ -3123,42 +3226,48 @@ var Wav2ArkitCpuInference = class {
3123
3226
  modelBuffer = await fetchWithCache(modelUrl);
3124
3227
  }
3125
3228
  } else {
3126
- logger5.debug("Fetching and caching model", { modelUrl });
3229
+ logger5.debug("Fetching and caching model graph", { modelUrl });
3127
3230
  modelBuffer = await fetchWithCache(modelUrl);
3128
3231
  }
3129
3232
  if (!modelBuffer) {
3130
3233
  throw new Error(`Failed to load model: ${modelUrl}`);
3131
3234
  }
3132
- let externalDataBuffer;
3133
- if (this.config.modelDataUrl) {
3134
- const dataUrl = this.config.modelDataUrl;
3135
- const isDataCached = await cache.has(dataUrl);
3136
- if (isDataCached) {
3137
- logger5.debug("Loading external data from cache", { dataUrl });
3138
- externalDataBuffer = await cache.get(dataUrl);
3139
- if (!externalDataBuffer) {
3140
- logger5.warn("External data cache corruption, re-fetching", { dataUrl });
3141
- await cache.delete(dataUrl);
3235
+ let externalDataBuffer = null;
3236
+ if (dataUrl) {
3237
+ try {
3238
+ const isDataCached = await cache.has(dataUrl);
3239
+ if (isDataCached) {
3240
+ logger5.debug("Loading external data from cache", { dataUrl });
3241
+ externalDataBuffer = await cache.get(dataUrl);
3242
+ if (!externalDataBuffer) {
3243
+ logger5.warn("Cache corruption for external data, retrying", { dataUrl });
3244
+ await cache.delete(dataUrl);
3245
+ externalDataBuffer = await fetchWithCache(dataUrl);
3246
+ }
3247
+ } else {
3248
+ logger5.info("Fetching external model data", {
3249
+ dataUrl,
3250
+ note: "This may be a large download (400MB+)"
3251
+ });
3142
3252
  externalDataBuffer = await fetchWithCache(dataUrl);
3143
3253
  }
3144
- } else {
3145
- logger5.info("Fetching external data (this may take a while on first load)", {
3146
- dataUrl
3254
+ logger5.info("External data loaded", {
3255
+ size: formatBytes(externalDataBuffer.byteLength)
3256
+ });
3257
+ } catch (err) {
3258
+ logger5.debug("No external data file found (single-file model)", {
3259
+ dataUrl,
3260
+ error: err.message
3147
3261
  });
3148
- externalDataBuffer = await fetchWithCache(dataUrl);
3149
3262
  }
3150
- logger5.debug("External data loaded", {
3151
- size: formatBytes(externalDataBuffer.byteLength)
3152
- });
3153
3263
  }
3154
3264
  logger5.debug("Creating ONNX session", {
3155
- size: formatBytes(modelBuffer.byteLength),
3156
- hasExternalData: !!externalDataBuffer,
3157
- externalDataSize: externalDataBuffer ? formatBytes(externalDataBuffer.byteLength) : void 0,
3265
+ graphSize: formatBytes(modelBuffer.byteLength),
3266
+ externalDataSize: externalDataBuffer ? formatBytes(externalDataBuffer.byteLength) : "none",
3158
3267
  backend: this._backend
3159
3268
  });
3160
3269
  if (externalDataBuffer) {
3161
- const dataFilename = this.config.modelDataUrl.split("/").pop();
3270
+ const dataFilename = dataUrl.split("/").pop();
3162
3271
  sessionOptions.externalData = [{
3163
3272
  path: dataFilename,
3164
3273
  data: new Uint8Array(externalDataBuffer)
@@ -3177,7 +3286,7 @@ var Wav2ArkitCpuInference = class {
3177
3286
  span?.setAttributes({
3178
3287
  "model.backend": this._backend,
3179
3288
  "model.load_time_ms": loadTimeMs,
3180
- "model.cached": isCached
3289
+ "model.cached": !isIOS()
3181
3290
  });
3182
3291
  span?.end();
3183
3292
  telemetry?.recordHistogram("omote.model.load_time", loadTimeMs, {
@@ -3258,11 +3367,11 @@ var Wav2ArkitCpuInference = class {
3258
3367
  const blendshapes = [];
3259
3368
  for (let f = 0; f < numFrames; f++) {
3260
3369
  const rawFrame = blendshapeData.slice(f * numBlendshapes, (f + 1) * numBlendshapes);
3261
- const remapped = remapWav2ArkitToLam(rawFrame);
3262
- blendshapes.push(symmetrizeBlendshapes(remapped));
3370
+ const symmetrized = symmetrizeBlendshapes(rawFrame);
3371
+ blendshapes.push(symmetrized);
3263
3372
  }
3264
3373
  logger5.trace("Inference completed", {
3265
- inferenceTimeMs: Math.round(inferenceTimeMs),
3374
+ inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
3266
3375
  numFrames,
3267
3376
  inputSamples
3268
3377
  });
@@ -3328,14 +3437,14 @@ function createLipSync(config) {
3328
3437
  });
3329
3438
  }
3330
3439
  if (useCpu) {
3331
- logger6.info("Creating Wav2ArkitCpuInference (WASM)");
3440
+ logger6.info("Creating Wav2ArkitCpuInference (404MB, WASM)");
3332
3441
  return new Wav2ArkitCpuInference({
3333
- modelUrl: config.cpuModelUrl,
3334
- modelDataUrl: config.cpuModelDataUrl
3442
+ modelUrl: config.cpuModelUrl
3335
3443
  });
3336
3444
  }
3337
3445
  const gpuInstance = new Wav2Vec2Inference({
3338
3446
  modelUrl: config.gpuModelUrl,
3447
+ externalDataUrl: config.gpuExternalDataUrl,
3339
3448
  backend: config.gpuBackend ?? "auto",
3340
3449
  numIdentityClasses: config.numIdentityClasses
3341
3450
  });
@@ -3352,15 +3461,15 @@ var LipSyncWithFallback = class {
3352
3461
  this.implementation = gpuInstance;
3353
3462
  this.config = config;
3354
3463
  }
3464
+ get modelId() {
3465
+ return this.implementation.modelId;
3466
+ }
3355
3467
  get backend() {
3356
3468
  return this.implementation.backend;
3357
3469
  }
3358
3470
  get isLoaded() {
3359
3471
  return this.implementation.isLoaded;
3360
3472
  }
3361
- get chunkSamples() {
3362
- return this.implementation.chunkSamples;
3363
- }
3364
3473
  async load() {
3365
3474
  try {
3366
3475
  return await this.implementation.load();
@@ -3373,8 +3482,7 @@ var LipSyncWithFallback = class {
3373
3482
  } catch {
3374
3483
  }
3375
3484
  this.implementation = new Wav2ArkitCpuInference({
3376
- modelUrl: this.config.cpuModelUrl,
3377
- modelDataUrl: this.config.cpuModelDataUrl
3485
+ modelUrl: this.config.cpuModelUrl
3378
3486
  });
3379
3487
  this.hasFallenBack = true;
3380
3488
  logger6.info("Fallback to Wav2ArkitCpuInference successful");
@@ -3404,6 +3512,8 @@ var SileroVADInference = class {
3404
3512
  // Pre-speech buffer for capturing beginning of speech
3405
3513
  this.preSpeechBuffer = [];
3406
3514
  this.wasSpeaking = false;
3515
+ // Cached sample rate tensor (int64 scalar, never changes per instance)
3516
+ this.srTensor = null;
3407
3517
  const sampleRate = config.sampleRate ?? 16e3;
3408
3518
  if (sampleRate !== 8e3 && sampleRate !== 16e3) {
3409
3519
  throw new Error("Silero VAD only supports 8000 or 16000 Hz sample rates");
@@ -3534,6 +3644,24 @@ var SileroVADInference = class {
3534
3644
  this.context = new Float32Array(this.contextSize);
3535
3645
  this.preSpeechBuffer = [];
3536
3646
  this.wasSpeaking = false;
3647
+ if (!this.srTensor) {
3648
+ try {
3649
+ this.srTensor = new this.ort.Tensor(
3650
+ "int64",
3651
+ new BigInt64Array([BigInt(this.config.sampleRate)]),
3652
+ []
3653
+ );
3654
+ } catch (e) {
3655
+ logger7.warn("BigInt64Array not available, using bigint array fallback", {
3656
+ error: e instanceof Error ? e.message : String(e)
3657
+ });
3658
+ this.srTensor = new this.ort.Tensor(
3659
+ "int64",
3660
+ [BigInt(this.config.sampleRate)],
3661
+ []
3662
+ );
3663
+ }
3664
+ }
3537
3665
  }
3538
3666
  /**
3539
3667
  * Process a single audio chunk
@@ -3665,20 +3793,7 @@ var SileroVADInference = class {
3665
3793
  inputBuffer.set(audioChunkCopy, this.contextSize);
3666
3794
  const inputBufferCopy = new Float32Array(inputBuffer);
3667
3795
  const inputTensor = new this.ort.Tensor("float32", inputBufferCopy, [1, inputSize]);
3668
- let srTensor;
3669
- try {
3670
- srTensor = new this.ort.Tensor(
3671
- "int64",
3672
- new BigInt64Array([BigInt(this.config.sampleRate)]),
3673
- []
3674
- );
3675
- } catch {
3676
- srTensor = new this.ort.Tensor(
3677
- "int64",
3678
- [BigInt(this.config.sampleRate)],
3679
- []
3680
- );
3681
- }
3796
+ const srTensor = this.srTensor;
3682
3797
  const stateCopy = new Float32Array(this.state.data);
3683
3798
  const stateTensor = new this.ort.Tensor("float32", stateCopy, this.state.dims);
3684
3799
  const feeds = {
@@ -3767,6 +3882,7 @@ var SileroVADInference = class {
3767
3882
  this.session = null;
3768
3883
  }
3769
3884
  this.state = null;
3885
+ this.srTensor = null;
3770
3886
  }
3771
3887
  };
3772
3888
  /**
@@ -6579,7 +6695,7 @@ async function nukeBrowserCaches(preventRecreation = false) {
6579
6695
  totalDeleted: deletedCount
6580
6696
  });
6581
6697
  if (preventRecreation) {
6582
- const { env } = await import("./transformers.web-MHLR33H6.mjs");
6698
+ const { env } = await import("./transformers.web-T5LWC34T.mjs");
6583
6699
  env.useBrowserCache = false;
6584
6700
  logger12.warn("Browser cache creation disabled (env.useBrowserCache = false)");
6585
6701
  }
@@ -7194,6 +7310,7 @@ export {
7194
7310
  nukeBrowserCaches,
7195
7311
  parseHuggingFaceUrl,
7196
7312
  preloadModels,
7313
+ preloadOnnxRuntime,
7197
7314
  remapWav2ArkitToLam,
7198
7315
  resetLoggingConfig,
7199
7316
  resolveBackend,