npm - @omote/core - Version diff - 0.2.3 → 0.3.0 - Mend

@omote/core 0.2.3 → 0.3.0

This diff shows the differences between the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (15)

package/dist/chunk-6W7G6WE7.mjs +13 -0
package/dist/chunk-6W7G6WE7.mjs.map +1 -0
package/dist/chunk-T465MTDX.mjs +38869 -0
package/dist/chunk-T465MTDX.mjs.map +1 -0
package/dist/events/index.mjs +1 -1
package/dist/index.d.mts +32 -12
package/dist/index.d.ts +32 -12
package/dist/index.js +38188 -25599
package/dist/index.js.map +1 -1
package/dist/index.mjs +152 -107
package/dist/index.mjs.map +1 -1
package/dist/logging/index.mjs +1 -1
package/dist/transformers.web-MHLR33H6.mjs +1718 -0
package/dist/transformers.web-MHLR33H6.mjs.map +1 -0
package/package.json +3 -2

package/dist/events/index.mjs CHANGED Viewed

@@ -1,7 +1,7 @@
 import {
   EventEmitter
 } from "../chunk-XK22BRG4.mjs";
-import "../chunk-NSSMTXJJ.mjs";
+import "../chunk-6W7G6WE7.mjs";
 export {
   EventEmitter
 };

package/dist/index.d.mts CHANGED Viewed

@@ -97,6 +97,13 @@ interface AudioSchedulerOptions {
     sampleRate?: number;
     /** Number of audio channels (default: 1 for mono) */
     channels?: number;
+    /**
+     * Delay before first audio chunk plays (seconds).
+     * Gives slow inference backends (WASM) a head start so lip sync
+     * frames are ready by the time audio reaches the listener.
+     * Default: 0.05 (50ms — just enough to enqueue the first node)
+     */
+    initialDelayS?: number;
 }
 declare class AudioScheduler {
     private readonly options;
@@ -454,6 +461,12 @@ interface LipSyncBackend {
     readonly backend: RuntimeBackend | null;
     /** Whether the model is loaded and ready for inference */
     readonly isLoaded: boolean;
+    /**
+     * Preferred number of audio samples per inference chunk.
+     * Models with variable-length input can use smaller values for lower latency.
+     * Default (if undefined): 16000 (1.0s at 16kHz, required by Wav2Vec2).
+     */
+    readonly chunkSamples?: number;
     /**
      * Load the ONNX model
      * @returns Model loading information
@@ -516,7 +529,7 @@ interface LAMPipelineOptions {
 }
 declare class LAMPipeline {
     private readonly options;
-    private readonly REQUIRED_SAMPLES;
+    private readonly DEFAULT_CHUNK_SAMPLES;
     private readonly FRAME_RATE;
     private buffer;
     private bufferStartTime;
@@ -545,15 +558,13 @@ declare class LAMPipeline {
     /**
      * Get the frame that should be displayed at the current time
      *
-     * Automatically removes frames that have already been displayed.
-     * This prevents memory leaks from accumulating old frames.
+     * Timestamp-synced playback for all backends. Audio playback is delayed
+     * for slow backends (WASM gets 1s head start via AudioScheduler) so
+     * frames are ready by the time their corresponding audio plays.
      *
-     * Discard Window (prevents premature frame discarding):
-     * - WebGPU: 0.5s (LAM inference 20-100ms + RAF jitter + React stalls)
-     * - WASM: 1.0s (LAM inference 50-500ms + higher variability)
-     *
-     * Last-Frame-Hold: Returns last valid frame instead of null to prevent
-     * avatar freezing when between frames (RAF at 60fps vs LAM at 30fps).
+     * Discard window is generous for WASM to handle inference jitter.
+     * Late frames play at RAF rate (~60fps) until caught up, then settle
+     * to natural 30fps pacing via timestamp gating.
      *
      * @param currentTime - Current AudioContext time
      * @param lam - LAM inference engine (optional, for backend detection)
@@ -581,7 +592,7 @@ declare class LAMPipeline {
     /**
      * Flush remaining buffered audio
      *
-     * Processes any remaining audio in the buffer, even if less than REQUIRED_SAMPLES.
+     * Processes any remaining audio in the buffer, even if less than the chunk size.
      * This ensures the final audio chunk generates blendshape frames.
      *
      * Should be called when audio stream ends to prevent losing the last 0-1 seconds.
@@ -1133,8 +1144,10 @@ declare class Wav2Vec2Inference {
  */
 interface Wav2ArkitCpuConfig {
-    /** Path or URL to the wav2arkit_cpu ONNX model */
+    /** Path or URL to the wav2arkit_cpu ONNX model (.onnx graph file) */
     modelUrl: string;
+    /** Path or URL to the external data file (.onnx.data weights file) */
+    modelDataUrl?: string;
     /** Preferred backend (default: 'wasm' — this model is CPU-optimized) */
     backend?: BackendPreference;
 }
@@ -1148,6 +1161,12 @@ declare class Wav2ArkitCpuInference implements LipSyncBackend {
     constructor(config: Wav2ArkitCpuConfig);
     get backend(): RuntimeBackend | null;
     get isLoaded(): boolean;
+    /**
+     * Preferred chunk size: 4000 samples (250ms at 16kHz).
+     * wav2arkit_cpu accepts variable-length input, so we use smaller chunks
+     * for lower latency on WASM (vs 16000 for Wav2Vec2's fixed requirement).
+     */
+    readonly chunkSamples = 4000;
     /**
      * Load the ONNX model
      */
@@ -1213,6 +1232,8 @@ interface CreateLipSyncConfig {
     gpuModelUrl: string;
     /** URL for the CPU model (wav2arkit_cpu, used on Safari/iOS) */
     cpuModelUrl: string;
+    /** URL for the CPU model's external data file (.onnx.data weights) */
+    cpuModelDataUrl?: string;
     /**
      * Model selection mode:
      * - 'auto': Safari/iOS → CPU, everything else → GPU (default)
@@ -1367,7 +1388,6 @@ declare class SileroVADInference {
     private inferenceQueue;
     private preSpeechBuffer;
     private wasSpeaking;
-    private srTensor;
     constructor(config: SileroVADConfig);
     get backend(): RuntimeBackend | null;
     get isLoaded(): boolean;

package/dist/index.d.ts CHANGED Viewed

@@ -97,6 +97,13 @@ interface AudioSchedulerOptions {
     sampleRate?: number;
     /** Number of audio channels (default: 1 for mono) */
     channels?: number;
+    /**
+     * Delay before first audio chunk plays (seconds).
+     * Gives slow inference backends (WASM) a head start so lip sync
+     * frames are ready by the time audio reaches the listener.
+     * Default: 0.05 (50ms — just enough to enqueue the first node)
+     */
+    initialDelayS?: number;
 }
 declare class AudioScheduler {
     private readonly options;
@@ -454,6 +461,12 @@ interface LipSyncBackend {
     readonly backend: RuntimeBackend | null;
     /** Whether the model is loaded and ready for inference */
     readonly isLoaded: boolean;
+    /**
+     * Preferred number of audio samples per inference chunk.
+     * Models with variable-length input can use smaller values for lower latency.
+     * Default (if undefined): 16000 (1.0s at 16kHz, required by Wav2Vec2).
+     */
+    readonly chunkSamples?: number;
     /**
      * Load the ONNX model
      * @returns Model loading information
@@ -516,7 +529,7 @@ interface LAMPipelineOptions {
 }
 declare class LAMPipeline {
     private readonly options;
-    private readonly REQUIRED_SAMPLES;
+    private readonly DEFAULT_CHUNK_SAMPLES;
     private readonly FRAME_RATE;
     private buffer;
     private bufferStartTime;
@@ -545,15 +558,13 @@ declare class LAMPipeline {
     /**
      * Get the frame that should be displayed at the current time
      *
-     * Automatically removes frames that have already been displayed.
-     * This prevents memory leaks from accumulating old frames.
+     * Timestamp-synced playback for all backends. Audio playback is delayed
+     * for slow backends (WASM gets 1s head start via AudioScheduler) so
+     * frames are ready by the time their corresponding audio plays.
      *
-     * Discard Window (prevents premature frame discarding):
-     * - WebGPU: 0.5s (LAM inference 20-100ms + RAF jitter + React stalls)
-     * - WASM: 1.0s (LAM inference 50-500ms + higher variability)
-     *
-     * Last-Frame-Hold: Returns last valid frame instead of null to prevent
-     * avatar freezing when between frames (RAF at 60fps vs LAM at 30fps).
+     * Discard window is generous for WASM to handle inference jitter.
+     * Late frames play at RAF rate (~60fps) until caught up, then settle
+     * to natural 30fps pacing via timestamp gating.
      *
      * @param currentTime - Current AudioContext time
      * @param lam - LAM inference engine (optional, for backend detection)
@@ -581,7 +592,7 @@ declare class LAMPipeline {
     /**
      * Flush remaining buffered audio
      *
-     * Processes any remaining audio in the buffer, even if less than REQUIRED_SAMPLES.
+     * Processes any remaining audio in the buffer, even if less than the chunk size.
      * This ensures the final audio chunk generates blendshape frames.
      *
      * Should be called when audio stream ends to prevent losing the last 0-1 seconds.
@@ -1133,8 +1144,10 @@ declare class Wav2Vec2Inference {
  */
 interface Wav2ArkitCpuConfig {
-    /** Path or URL to the wav2arkit_cpu ONNX model */
+    /** Path or URL to the wav2arkit_cpu ONNX model (.onnx graph file) */
     modelUrl: string;
+    /** Path or URL to the external data file (.onnx.data weights file) */
+    modelDataUrl?: string;
     /** Preferred backend (default: 'wasm' — this model is CPU-optimized) */
     backend?: BackendPreference;
 }
@@ -1148,6 +1161,12 @@ declare class Wav2ArkitCpuInference implements LipSyncBackend {
     constructor(config: Wav2ArkitCpuConfig);
     get backend(): RuntimeBackend | null;
     get isLoaded(): boolean;
+    /**
+     * Preferred chunk size: 4000 samples (250ms at 16kHz).
+     * wav2arkit_cpu accepts variable-length input, so we use smaller chunks
+     * for lower latency on WASM (vs 16000 for Wav2Vec2's fixed requirement).
+     */
+    readonly chunkSamples = 4000;
     /**
      * Load the ONNX model
      */
@@ -1213,6 +1232,8 @@ interface CreateLipSyncConfig {
     gpuModelUrl: string;
     /** URL for the CPU model (wav2arkit_cpu, used on Safari/iOS) */
     cpuModelUrl: string;
+    /** URL for the CPU model's external data file (.onnx.data weights) */
+    cpuModelDataUrl?: string;
     /**
      * Model selection mode:
      * - 'auto': Safari/iOS → CPU, everything else → GPU (default)
@@ -1367,7 +1388,6 @@ declare class SileroVADInference {
     private inferenceQueue;
     private preSpeechBuffer;
     private wasSpeaking;
-    private srTensor;
     constructor(config: SileroVADConfig);
     get backend(): RuntimeBackend | null;
     get isLoaded(): boolean;