@omote/core 0.2.3 → 0.3.1

package/dist/index.mjs CHANGED
@@ -13,10 +13,10 @@ import {
   setLoggingEnabled
 } from "./chunk-ESU52TDS.mjs";
 import {
-  env,
-  pipeline3
-} from "./chunk-RI6UQ7WF.mjs";
-import "./chunk-NSSMTXJJ.mjs";
+  __webpack_exports__env,
+  __webpack_exports__pipeline
+} from "./chunk-T465MTDX.mjs";
+import "./chunk-6W7G6WE7.mjs";
 
 // src/audio/MicrophoneCapture.ts
 var MicrophoneCapture = class {
@@ -263,7 +263,7 @@ var AudioScheduler = class {
     const ctx = await this.ensureContext();
     const channels = this.options.channels ?? 1;
     if (!this.isPlaying) {
-      this.nextPlayTime = ctx.currentTime + 0.05;
+      this.nextPlayTime = ctx.currentTime + (this.options.initialDelayS ?? 0.05);
       this.isPlaying = true;
     }
     const audioBuffer = ctx.createBuffer(channels, audioData.length, ctx.sampleRate);
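The fixed 50 ms priming offset on first playback becomes a configurable `initialDelayS`, added to the AudioContext clock before the first buffer is queued. A minimal standalone sketch of the pattern; the class and most option names here are illustrative, only `initialDelayS` comes from the diff:

```ts
// Sketch: delay-primed, gapless scheduling against the Web Audio clock.
interface SchedulerOptions {
  sampleRate: number;
  initialDelayS?: number; // head start before the first chunk becomes audible
}

class MiniScheduler {
  private nextPlayTime = 0;
  private isPlaying = false;

  constructor(private ctx: AudioContext, private options: SchedulerOptions) {}

  enqueue(samples: Float32Array): void {
    if (!this.isPlaying) {
      // A larger delay gives slow inference backends more time to produce
      // frames before their corresponding audio plays.
      this.nextPlayTime = this.ctx.currentTime + (this.options.initialDelayS ?? 0.05);
      this.isPlaying = true;
    }
    const buffer = this.ctx.createBuffer(1, samples.length, this.options.sampleRate);
    buffer.copyToChannel(samples, 0);
    const source = this.ctx.createBufferSource();
    source.buffer = buffer;
    source.connect(this.ctx.destination);
    source.start(this.nextPlayTime);
    this.nextPlayTime += buffer.duration; // back-to-back, no gaps
  }
}
```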
@@ -446,8 +446,8 @@ var AudioChunkCoalescer = class {
 var LAMPipeline = class {
   constructor(options = {}) {
     this.options = options;
-    this.REQUIRED_SAMPLES = 16e3;
-    // 1.0s at 16kHz (LAM requirement)
+    this.DEFAULT_CHUNK_SAMPLES = 16e3;
+    // 1.0s at 16kHz (Wav2Vec2 requirement)
     this.FRAME_RATE = 30;
     // LAM outputs 30fps
     this.buffer = new Float32Array(0);
@@ -477,19 +477,20 @@ var LAMPipeline = class {
     newBuffer.set(this.buffer, 0);
     newBuffer.set(samples, this.buffer.length);
     this.buffer = newBuffer;
-    while (this.buffer.length >= this.REQUIRED_SAMPLES) {
-      await this.processBuffer(lam);
+    const chunkSize = lam.chunkSamples ?? this.DEFAULT_CHUNK_SAMPLES;
+    while (this.buffer.length >= chunkSize) {
+      await this.processBuffer(lam, chunkSize);
     }
   }
   /**
    * Process accumulated buffer through LAM inference
    */
-  async processBuffer(lam) {
+  async processBuffer(lam, chunkSize) {
     try {
-      const toProcess = this.buffer.slice(0, this.REQUIRED_SAMPLES);
+      const toProcess = this.buffer.slice(0, chunkSize);
       const processedStartTime = this.bufferStartTime;
-      this.buffer = this.buffer.slice(this.REQUIRED_SAMPLES);
-      const processedDuration = this.REQUIRED_SAMPLES / (this.options.sampleRate ?? 16e3);
+      this.buffer = this.buffer.slice(chunkSize);
+      const processedDuration = chunkSize / (this.options.sampleRate ?? 16e3);
       this.bufferStartTime = processedStartTime + processedDuration;
       const result = await lam.infer(toProcess);
       const frameDuration = 1 / this.FRAME_RATE;
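The drain loop now asks the backend for its preferred chunk size via `lam.chunkSamples` (4000 samples for wav2arkit_cpu, per the constructor change further down) and falls back to the 16000-sample default. A sketch of the slice-and-advance pattern under those assumptions, with a stubbed backend interface:

```ts
// Sketch: backend-negotiated chunking. `chunkSamples` and `infer` mirror the
// interface implied by the diff; names are local to this example.
interface LipSyncBackend {
  chunkSamples?: number; // preferred input size; undefined = use the default
  infer(samples: Float32Array): Promise<{ blendshapes: number[][] }>;
}

const DEFAULT_CHUNK_SAMPLES = 16_000; // 1.0 s at 16 kHz

async function drain(buffer: Float32Array, lam: LipSyncBackend): Promise<Float32Array> {
  const chunkSize = lam.chunkSamples ?? DEFAULT_CHUNK_SAMPLES;
  while (buffer.length >= chunkSize) {
    const chunk = buffer.slice(0, chunkSize);
    buffer = buffer.slice(chunkSize); // advance past the consumed samples
    await lam.infer(chunk);
  }
  return buffer; // remainder stays buffered until more audio arrives
}
```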
@@ -508,35 +509,22 @@ var LAMPipeline = class {
   /**
    * Get the frame that should be displayed at the current time
    *
-   * Automatically removes frames that have already been displayed.
-   * This prevents memory leaks from accumulating old frames.
+   * Timestamp-synced playback for all backends. Audio playback is delayed
+   * for slow backends (WASM gets 1s head start via AudioScheduler) so
+   * frames are ready by the time their corresponding audio plays.
    *
-   * Discard Window (prevents premature frame discarding):
-   * - WebGPU: 0.5s (LAM inference 20-100ms + RAF jitter + React stalls)
-   * - WASM: 1.0s (LAM inference 50-500ms + higher variability)
-   *
-   * Last-Frame-Hold: Returns last valid frame instead of null to prevent
-   * avatar freezing when between frames (RAF at 60fps vs LAM at 30fps).
+   * Discard window is generous for WASM to handle inference jitter.
+   * Late frames play at RAF rate (~60fps) until caught up, then settle
+   * to natural 30fps pacing via timestamp gating.
    *
    * @param currentTime - Current AudioContext time
    * @param lam - LAM inference engine (optional, for backend detection)
    * @returns Current frame, or last frame as fallback, or null if no frames yet
    */
   getFrameForTime(currentTime, lam) {
-    const discardWindow = lam?.backend === "wasm" ? 1 : 0.5;
-    let discardedCount = 0;
+    const discardWindow = lam?.backend === "wasm" ? 10 : 0.5;
     while (this.frameQueue.length > 0 && this.frameQueue[0].timestamp < currentTime - discardWindow) {
-      const discarded = this.frameQueue.shift();
-      discardedCount++;
-      if (discardedCount === 1) {
-        const ageMs = ((currentTime - discarded.timestamp) * 1e3).toFixed(0);
-        console.warn("[LAM] Frame(s) discarded as too old", {
-          ageMs,
-          discardWindowMs: discardWindow * 1e3,
-          queueLength: this.frameQueue.length,
-          backend: lam?.backend ?? "unknown"
-        });
-      }
+      this.frameQueue.shift();
     }
     if (this.frameQueue.length > 0 && this.frameQueue[0].timestamp <= currentTime) {
       const { frame } = this.frameQueue.shift();
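The WASM discard window widens from 1 s to 10 s and the per-discard `console.warn` instrumentation is dropped; stale frames now leave the queue silently. The gating itself is unchanged: a frame plays only once its timestamp is due, otherwise the caller holds the last shown frame. A compact sketch of that logic, with local names:

```ts
// Sketch: timestamp-gated frame retrieval with last-frame-hold.
// The window values (10 s WASM / 0.5 s WebGPU) come from this diff.
interface TimedFrame {
  timestamp: number;   // AudioContext time the frame should display at
  frame: Float32Array; // blendshape weights
}

function frameForTime(
  queue: TimedFrame[],
  currentTime: number,
  backend: "wasm" | "webgpu" | undefined,
  lastFrame: Float32Array | null
): Float32Array | null {
  const discardWindow = backend === "wasm" ? 10 : 0.5;
  // Drop frames so stale they fall outside the discard window.
  while (queue.length > 0 && queue[0].timestamp < currentTime - discardWindow) {
    queue.shift();
  }
  // Release the head frame only once its timestamp is due; otherwise hold
  // the last shown frame so the avatar never snaps to neutral (RAF runs at
  // ~60 fps against the model's 30 fps output).
  if (queue.length > 0 && queue[0].timestamp <= currentTime) {
    return queue.shift()!.frame;
  }
  return lastFrame;
}
```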
@@ -555,7 +543,7 @@ var LAMPipeline = class {
    * Get current buffer fill level (0-1)
    */
   get fillLevel() {
-    return Math.min(1, this.buffer.length / this.REQUIRED_SAMPLES);
+    return Math.min(1, this.buffer.length / this.DEFAULT_CHUNK_SAMPLES);
   }
   /**
    * Get number of frames queued
@@ -572,7 +560,7 @@ var LAMPipeline = class {
   /**
    * Flush remaining buffered audio
    *
-   * Processes any remaining audio in the buffer, even if less than REQUIRED_SAMPLES.
+   * Processes any remaining audio in the buffer, even if less than the chunk size.
    * This ensures the final audio chunk generates blendshape frames.
    *
    * Should be called when audio stream ends to prevent losing the last 0-1 seconds.
@@ -583,12 +571,17 @@ var LAMPipeline = class {
     if (this.buffer.length === 0) {
       return;
     }
-    const padded = new Float32Array(this.REQUIRED_SAMPLES);
-    padded.set(this.buffer, 0);
     const processedStartTime = this.bufferStartTime;
+    const sampleRate = this.options.sampleRate ?? 16e3;
+    const minSize = lam.chunkSamples ?? this.DEFAULT_CHUNK_SAMPLES;
+    const audioToInfer = this.buffer.length >= minSize ? this.buffer : (() => {
+      const padded = new Float32Array(minSize);
+      padded.set(this.buffer, 0);
+      return padded;
+    })();
     try {
-      const result = await lam.infer(padded);
-      const actualDuration = this.buffer.length / (this.options.sampleRate ?? 16e3);
+      const result = await lam.infer(audioToInfer);
+      const actualDuration = this.buffer.length / sampleRate;
       const frameDuration = 1 / this.FRAME_RATE;
       const actualFrameCount = Math.ceil(actualDuration * this.FRAME_RATE);
       for (let i = 0; i < Math.min(actualFrameCount, result.blendshapes.length); i++) {
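Flush now zero-pads only when the leftover audio is shorter than the backend's minimum chunk, and frame emission is clamped to the real (unpadded) duration so the silent tail does not animate. A sketch, reusing the hypothetical `LipSyncBackend` interface and `DEFAULT_CHUNK_SAMPLES` constant from the earlier sketch:

```ts
// Sketch: flush with conditional zero-padding; emit frames only for the
// real audio, not the padded silence. The 30 fps rate matches the diff.
const FRAME_RATE = 30;

async function flush(
  buffer: Float32Array,
  lam: LipSyncBackend,
  sampleRate = 16_000
): Promise<number[][]> {
  if (buffer.length === 0) return [];
  const minSize = lam.chunkSamples ?? DEFAULT_CHUNK_SAMPLES;
  let audioToInfer = buffer;
  if (buffer.length < minSize) {
    audioToInfer = new Float32Array(minSize); // zero-padded tail
    audioToInfer.set(buffer, 0);
  }
  const result = await lam.infer(audioToInfer);
  // Clamp to frames covering the real audio duration.
  const actualFrames = Math.ceil((buffer.length / sampleRate) * FRAME_RATE);
  return result.blendshapes.slice(0, actualFrames);
}
```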
@@ -647,7 +640,13 @@ var SyncedAudioPipeline = class extends EventEmitter {
     this.monitorInterval = null;
     this.frameAnimationId = null;
     const sampleRate = options.sampleRate ?? 16e3;
-    this.scheduler = new AudioScheduler({ sampleRate });
+    if (!options.lam.isLoaded) {
+      throw new Error(
+        "LipSyncBackend must be loaded before constructing SyncedAudioPipeline. Call lam.load() first so backend type is known for timing configuration."
+      );
+    }
+    const initialDelayS = options.lam.backend === "wasm" ? 1 : 0.05;
+    this.scheduler = new AudioScheduler({ sampleRate, initialDelayS });
     this.coalescer = new AudioChunkCoalescer({
       sampleRate,
       targetDurationMs: options.chunkTargetMs ?? 200
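Because the scheduler's head start now depends on which backend actually resolved, the constructor refuses an unloaded backend instead of guessing. Call order therefore matters; a hedged usage sketch, assuming `createLipSync` and `SyncedAudioPipeline` are exported from the package entry point and using only option names visible in this diff:

```ts
// Sketch: resolve the backend before constructing the pipeline, so the
// WASM-vs-WebGPU timing choice is known up front. URLs are placeholders.
import { createLipSync, SyncedAudioPipeline } from "@omote/core";

const lam = createLipSync({
  cpuModelUrl: "/models/wav2arkit_cpu.onnx",          // placeholder
  cpuModelDataUrl: "/models/wav2arkit_cpu.onnx.data", // placeholder
});
await lam.load(); // resolves lam.backend to "wasm" or "webgpu"

// Safe now: the constructor throws if lam.isLoaded is false.
const pipeline = new SyncedAudioPipeline({ lam, sampleRate: 16000 });
```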
@@ -2138,12 +2137,9 @@ function applyIOSWasmMemoryPatch() {
   iosWasmPatched = true;
   const OrigMemory = WebAssembly.Memory;
   const MAX_IOS_PAGES = 16384;
-  logger.info("Applying iOS WASM memory patch (shared\u2192false, max\u21921GB)");
+  logger.info("Applying iOS WASM memory patch (max capped to 1GB, shared preserved)");
   WebAssembly.Memory = function IOSPatchedMemory(descriptor) {
     const patched = { ...descriptor };
-    if (patched.shared) {
-      patched.shared = false;
-    }
     if (patched.maximum !== void 0 && patched.maximum > MAX_IOS_PAGES) {
       patched.maximum = MAX_IOS_PAGES;
     }
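The patch previously downgraded `shared: true` memories to unshared, which silently breaks code expecting a SharedArrayBuffer; now it only caps `maximum`. One WASM page is 64 KiB, so 16384 pages is exactly 1 GiB. A standalone sketch of the wrapper shape (a minimal reconstruction of the patched behavior, not the full bundle code):

```ts
// Sketch: cap WebAssembly.Memory's maximum at 1 GiB (16384 * 64 KiB)
// while letting the `shared` flag pass through untouched.
const MAX_IOS_PAGES = 16384;
const OrigMemory = WebAssembly.Memory;

function IOSPatchedMemory(descriptor: WebAssembly.MemoryDescriptor): WebAssembly.Memory {
  const patched = { ...descriptor }; // `shared` is preserved as-is
  if (patched.maximum !== undefined && patched.maximum > MAX_IOS_PAGES) {
    patched.maximum = MAX_IOS_PAGES;
  }
  // Returning an object from a constructor-style function makes
  // `new IOSPatchedMemory(...)` yield the real Memory instance.
  return new OrigMemory(patched);
}
IOSPatchedMemory.prototype = OrigMemory.prototype; // keep instanceof working
(WebAssembly as { Memory: typeof WebAssembly.Memory }).Memory =
  IOSPatchedMemory as unknown as typeof WebAssembly.Memory;
```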
@@ -2760,7 +2756,7 @@ var WhisperInference = class _WhisperInference {
    * Check if WebGPU is available in this browser
    */
   static async isWebGPUAvailable() {
-    return "gpu" in navigator;
+    return isWebGPUAvailable();
   }
   /**
    * Load the Whisper model pipeline
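`"gpu" in navigator` only proves the API surface exists; it can hold on browsers where no adapter is actually obtainable. The class method now delegates to a shared `isWebGPUAvailable()` helper whose body is outside this hunk. One common, stricter shape for such a check looks like the following; this is an assumption, not the bundled implementation:

```ts
// Sketch: adapter-probing WebGPU detection, typed loosely so it compiles
// without @webgpu/types.
async function isWebGPUAvailable(): Promise<boolean> {
  const gpu = (navigator as unknown as {
    gpu?: { requestAdapter(): Promise<object | null> };
  }).gpu;
  if (!gpu) return false; // API surface not present at all
  try {
    // requestAdapter() resolves to null when the API exists but no usable
    // adapter does (software-only contexts, blocklisted drivers).
    return (await gpu.requestAdapter()) !== null;
  } catch {
    return false;
  }
}
```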
@@ -2801,19 +2797,19 @@ var WhisperInference = class _WhisperInference {
     const hasWebGPU = await _WhisperInference.isWebGPUAvailable();
     const device = this.config.device === "auto" ? hasWebGPU ? "webgpu" : "wasm" : this.config.device;
     logger4.info("Creating pipeline", { device, hasWebGPU });
-    env.allowLocalModels = false;
-    env.allowRemoteModels = true;
-    env.useBrowserCache = false;
-    env.useCustomCache = false;
-    env.useWasmCache = false;
-    if (env.backends.onnx.wasm) {
-      env.backends.onnx.wasm.proxy = false;
-      env.backends.onnx.wasm.numThreads = 1;
+    __webpack_exports__env.allowLocalModels = false;
+    __webpack_exports__env.allowRemoteModels = true;
+    __webpack_exports__env.useBrowserCache = false;
+    __webpack_exports__env.useCustomCache = false;
+    __webpack_exports__env.useWasmCache = false;
+    if (__webpack_exports__env.backends.onnx.wasm) {
+      __webpack_exports__env.backends.onnx.wasm.proxy = false;
+      __webpack_exports__env.backends.onnx.wasm.numThreads = 1;
     }
     logger4.info("Configured transformers.js env", {
-      allowLocalModels: env.allowLocalModels,
-      useBrowserCache: env.useBrowserCache,
-      useWasmCache: env.useWasmCache
+      allowLocalModels: __webpack_exports__env.allowLocalModels,
+      useBrowserCache: __webpack_exports__env.useBrowserCache,
+      useWasmCache: __webpack_exports__env.useWasmCache
     });
     const pipelineOptions = {
       dtype: this.config.dtype,
@@ -2830,7 +2826,7 @@ var WhisperInference = class _WhisperInference {
       };
       logger4.info("Forcing WebGPU execution providers");
     }
-    this.pipeline = await pipeline3(
+    this.pipeline = await __webpack_exports__pipeline(
       "automatic-speech-recognition",
       modelName,
       pipelineOptions
@@ -3061,6 +3057,12 @@ var Wav2ArkitCpuInference = class {
     this.isLoading = false;
     // Inference queue for handling concurrent calls
     this.inferenceQueue = Promise.resolve();
+    /**
+     * Preferred chunk size: 4000 samples (250ms at 16kHz).
+     * wav2arkit_cpu accepts variable-length input, so we use smaller chunks
+     * for lower latency on WASM (vs 16000 for Wav2Vec2's fixed requirement).
+     */
+    this.chunkSamples = 4e3;
     this.config = config;
   }
   get backend() {
@@ -3093,32 +3095,78 @@ var Wav2ArkitCpuInference = class {
     this.ort = ort;
     this._backend = backend;
     logger5.info("ONNX Runtime loaded", { backend: this._backend });
-    const cache = getModelCache();
     const modelUrl = this.config.modelUrl;
-    const isCached = await cache.has(modelUrl);
-    let modelBuffer;
-    if (isCached) {
-      logger5.debug("Loading model from cache", { modelUrl });
-      modelBuffer = await cache.get(modelUrl);
-      if (!modelBuffer) {
-        logger5.warn("Cache corruption detected, clearing and retrying", { modelUrl });
-        await cache.delete(modelUrl);
+    const sessionOptions = { ...getSessionOptions(this._backend) };
+    let isCached = false;
+    if (isIOS() && this.config.modelDataUrl) {
+      const dataFilename = this.config.modelDataUrl.split("/").pop();
+      sessionOptions.externalData = [{
+        path: dataFilename,
+        data: this.config.modelDataUrl
+      }];
+      logger5.info("iOS: URL-based session creation (ORT handles fetch internally)", {
+        modelUrl,
+        dataFile: dataFilename,
+        dataUrl: this.config.modelDataUrl
+      });
+      this.session = await this.ort.InferenceSession.create(modelUrl, sessionOptions);
+    } else {
+      const cache = getModelCache();
+      isCached = await cache.has(modelUrl);
+      let modelBuffer;
+      if (isCached) {
+        logger5.debug("Loading model from cache", { modelUrl });
+        modelBuffer = await cache.get(modelUrl);
+        if (!modelBuffer) {
+          logger5.warn("Cache corruption detected, clearing and retrying", { modelUrl });
+          await cache.delete(modelUrl);
+          modelBuffer = await fetchWithCache(modelUrl);
+        }
+      } else {
+        logger5.debug("Fetching and caching model", { modelUrl });
         modelBuffer = await fetchWithCache(modelUrl);
       }
-    } else {
-      logger5.debug("Fetching and caching model", { modelUrl });
-      modelBuffer = await fetchWithCache(modelUrl);
-    }
-    if (!modelBuffer) {
-      throw new Error(`Failed to load model: ${modelUrl}`);
+      if (!modelBuffer) {
+        throw new Error(`Failed to load model: ${modelUrl}`);
+      }
+      let externalDataBuffer;
+      if (this.config.modelDataUrl) {
+        const dataUrl = this.config.modelDataUrl;
+        const isDataCached = await cache.has(dataUrl);
+        if (isDataCached) {
+          logger5.debug("Loading external data from cache", { dataUrl });
+          externalDataBuffer = await cache.get(dataUrl);
+          if (!externalDataBuffer) {
+            logger5.warn("External data cache corruption, re-fetching", { dataUrl });
+            await cache.delete(dataUrl);
+            externalDataBuffer = await fetchWithCache(dataUrl);
+          }
+        } else {
+          logger5.info("Fetching external data (this may take a while on first load)", {
+            dataUrl
+          });
+          externalDataBuffer = await fetchWithCache(dataUrl);
+        }
+        logger5.debug("External data loaded", {
+          size: formatBytes(externalDataBuffer.byteLength)
+        });
+      }
+      logger5.debug("Creating ONNX session", {
+        size: formatBytes(modelBuffer.byteLength),
+        hasExternalData: !!externalDataBuffer,
+        externalDataSize: externalDataBuffer ? formatBytes(externalDataBuffer.byteLength) : void 0,
+        backend: this._backend
+      });
+      if (externalDataBuffer) {
+        const dataFilename = this.config.modelDataUrl.split("/").pop();
+        sessionOptions.externalData = [{
+          path: dataFilename,
+          data: new Uint8Array(externalDataBuffer)
+        }];
+      }
+      const modelData = new Uint8Array(modelBuffer);
+      this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
     }
-    logger5.debug("Creating ONNX session", {
-      size: formatBytes(modelBuffer.byteLength),
-      backend: this._backend
-    });
-    const sessionOptions = getSessionOptions(this._backend);
-    const modelData = new Uint8Array(modelBuffer);
-    this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
     const loadTimeMs = performance.now() - startTime;
     logger5.info("Model loaded successfully", {
       backend: this._backend,
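The new `modelDataUrl` config supports ONNX models whose weights live in a separate external-data file. onnxruntime-web takes these through the `externalData` session option, where `path` must match the filename recorded inside the graph and `data` may be either pre-fetched bytes or a URL for ORT to fetch itself (the iOS branch above uses the URL form to avoid buffering the file in JS). A hedged sketch with placeholder URLs:

```ts
// Sketch: ort-web session creation with external weight data.
// The externalData option shape is onnxruntime-web's; URLs are placeholders.
import * as ort from "onnxruntime-web";

async function createSession(modelUrl: string, dataUrl?: string) {
  const options: ort.InferenceSession.SessionOptions = {};
  if (dataUrl) {
    options.externalData = [{
      // Must match the path recorded inside the .onnx graph, typically the
      // data file's basename.
      path: dataUrl.split("/").pop()!,
      // A URL string lets ORT fetch the bytes itself (the iOS branch above);
      // a Uint8Array hands over pre-fetched, cacheable bytes instead.
      data: dataUrl,
    }];
  }
  return ort.InferenceSession.create(modelUrl, options);
}
```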
@@ -3214,7 +3262,7 @@ var Wav2ArkitCpuInference = class {
       blendshapes.push(symmetrizeBlendshapes(remapped));
     }
     logger5.trace("Inference completed", {
-      inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
+      inferenceTimeMs: Math.round(inferenceTimeMs),
       numFrames,
       inputSamples
     });
@@ -3280,9 +3328,10 @@ function createLipSync(config) {
     });
   }
   if (useCpu) {
-    logger6.info("Creating Wav2ArkitCpuInference (1.8MB, WASM)");
+    logger6.info("Creating Wav2ArkitCpuInference (WASM)");
     return new Wav2ArkitCpuInference({
-      modelUrl: config.cpuModelUrl
+      modelUrl: config.cpuModelUrl,
+      modelDataUrl: config.cpuModelDataUrl
     });
   }
   const gpuInstance = new Wav2Vec2Inference({
@@ -3309,6 +3358,9 @@ var LipSyncWithFallback = class {
   get isLoaded() {
     return this.implementation.isLoaded;
   }
+  get chunkSamples() {
+    return this.implementation.chunkSamples;
+  }
   async load() {
     try {
       return await this.implementation.load();
@@ -3321,7 +3373,8 @@
     } catch {
     }
     this.implementation = new Wav2ArkitCpuInference({
-      modelUrl: this.config.cpuModelUrl
+      modelUrl: this.config.cpuModelUrl,
+      modelDataUrl: this.config.cpuModelDataUrl
    });
    this.hasFallenBack = true;
    logger6.info("Fallback to Wav2ArkitCpuInference successful");
@@ -3351,8 +3404,6 @@ var SileroVADInference = class {
     // Pre-speech buffer for capturing beginning of speech
     this.preSpeechBuffer = [];
     this.wasSpeaking = false;
-    // Cached sample rate tensor (int64 scalar, never changes per instance)
-    this.srTensor = null;
     const sampleRate = config.sampleRate ?? 16e3;
     if (sampleRate !== 8e3 && sampleRate !== 16e3) {
       throw new Error("Silero VAD only supports 8000 or 16000 Hz sample rates");
@@ -3483,24 +3534,6 @@ var SileroVADInference = class {
     this.context = new Float32Array(this.contextSize);
     this.preSpeechBuffer = [];
     this.wasSpeaking = false;
-    if (!this.srTensor) {
-      try {
-        this.srTensor = new this.ort.Tensor(
-          "int64",
-          new BigInt64Array([BigInt(this.config.sampleRate)]),
-          []
-        );
-      } catch (e) {
-        logger7.warn("BigInt64Array not available, using bigint array fallback", {
-          error: e instanceof Error ? e.message : String(e)
-        });
-        this.srTensor = new this.ort.Tensor(
-          "int64",
-          [BigInt(this.config.sampleRate)],
-          []
-        );
-      }
-    }
   }
   /**
    * Process a single audio chunk
@@ -3632,7 +3665,20 @@ var SileroVADInference = class {
     inputBuffer.set(audioChunkCopy, this.contextSize);
     const inputBufferCopy = new Float32Array(inputBuffer);
     const inputTensor = new this.ort.Tensor("float32", inputBufferCopy, [1, inputSize]);
-    const srTensor = this.srTensor;
+    let srTensor;
+    try {
+      srTensor = new this.ort.Tensor(
+        "int64",
+        new BigInt64Array([BigInt(this.config.sampleRate)]),
+        []
+      );
+    } catch {
+      srTensor = new this.ort.Tensor(
+        "int64",
+        [BigInt(this.config.sampleRate)],
+        []
+      );
+    }
     const stateCopy = new Float32Array(this.state.data);
     const stateTensor = new this.ort.Tensor("float32", stateCopy, this.state.dims);
     const feeds = {
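This reverts the cached-srTensor optimization from 0.2.x: the int64 scalar is rebuilt on every call. The diff doesn't state the motivation; one plausible reason is that reusing a tensor across `session.run()` calls can be unsafe if the runtime detaches or takes ownership of its buffer. The construction itself, with the same BigInt64Array fallback the bundle ships, distilled:

```ts
// Sketch: per-call int64 scalar tensor for the VAD's sample-rate input.
// Shape [] marks a scalar; the plain bigint[] fallback covers engines
// that lack BigInt64Array (older WebKit builds).
import { Tensor } from "onnxruntime-web";

function makeSampleRateTensor(sampleRate: number): Tensor {
  try {
    return new Tensor("int64", new BigInt64Array([BigInt(sampleRate)]), []);
  } catch {
    return new Tensor("int64", [BigInt(sampleRate)], []);
  }
}
```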
@@ -3721,7 +3767,6 @@ var SileroVADInference = class {
       this.session = null;
     }
     this.state = null;
-    this.srTensor = null;
   }
 };
 /**
@@ -6534,8 +6579,8 @@ async function nukeBrowserCaches(preventRecreation = false) {
     totalDeleted: deletedCount
   });
   if (preventRecreation) {
-    const { env: env2 } = await import("./transformers.web-ALDLCPHT.mjs");
-    env2.useBrowserCache = false;
+    const { env } = await import("./transformers.web-MHLR33H6.mjs");
+    env.useBrowserCache = false;
     logger12.warn("Browser cache creation disabled (env.useBrowserCache = false)");
   }
   return deletedCount;