@omote/core 0.1.3 → 0.2.1

package/dist/index.d.mts CHANGED
@@ -112,6 +112,17 @@ declare class AudioScheduler {
  * to avoid browser autoplay policy issues (requires user gesture).
  */
  initialize(): Promise<void>;
+ /**
+ * Eagerly create and warm up the AudioContext
+ *
+ * Call this when a playback session starts (e.g., when AI response begins).
+ * The AudioContext needs time to initialize the audio hardware — on Windows
+ * this can take 50-100ms. By warming up early (before audio data arrives),
+ * the context is fully ready when schedule() is first called.
+ *
+ * Must be called after a user gesture (click/tap) for autoplay policy.
+ */
+ warmup(): Promise<void>;
  /**
  * Ensure AudioContext is created and ready
  * Called lazily on first schedule() - requires user gesture
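The new warmup() method is easiest to see in context. A minimal sketch, assuming an AudioScheduler instance is already constructed elsewhere and that `startButton` and `beginAiResponse` stand in for the host app's own UI element and streaming logic:

```typescript
import { AudioScheduler } from '@omote/core';

declare const scheduler: AudioScheduler;        // constructed elsewhere with AudioSchedulerOptions
declare const startButton: HTMLButtonElement;   // hypothetical UI element
declare function beginAiResponse(): void;       // hypothetical app function that starts streaming TTS

startButton.addEventListener('click', async () => {
  // Runs inside a user gesture, so autoplay policy is satisfied; the AudioContext
  // is warmed up before any audio data arrives, and schedule() never waits for it.
  await scheduler.warmup();
  beginAiResponse();
});
```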
@@ -157,6 +168,7 @@ declare class AudioScheduler {
  cancelAll(fadeOutMs?: number): Promise<void>;
  /**
  * Reset scheduler state for new playback session
+ * Stops any orphaned sources that weren't cleaned up by cancelAll()
  */
  reset(): void;
  /**
@@ -342,6 +354,27 @@ declare function getOptimalWasmThreads(): number;
  * @returns true if proxy mode is safe to enable
  */
  declare function shouldEnableWasmProxy(): boolean;
+ /**
+ * Detect Safari browser on any platform (macOS + iOS)
+ *
+ * Safari WebKit has bugs with ONNX Runtime's WebGPU multithreaded JSEP build
+ * that crash session creation. Both iOS and macOS Safari are affected.
+ *
+ * @returns true if running in Safari on any platform
+ */
+ declare function isSafari(): boolean;
+ /**
+ * Recommend using CPU-optimized lip sync model (wav2arkit_cpu)
+ *
+ * All WebKit browsers (Safari macOS, Safari iOS, Chrome iOS, Firefox iOS)
+ * have ONNX Runtime WebGPU JSEP bugs that crash session creation, and the
+ * 384MB LAM model stack-overflows in WASM mode.
+ * The wav2arkit_cpu model (1.8MB) provides identical 52 ARKit blendshape
+ * output at 22x real-time on CPU/WASM.
+ *
+ * @returns true if on Safari or any iOS browser (should use CPU lip sync model)
+ */
+ declare function shouldUseCpuLipSync(): boolean;
  /**
  * Check if Web Speech API is available in the browser
  *
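A short sketch of how the two new detection helpers can drive model selection; the model paths match the examples elsewhere in this diff but are otherwise placeholders:

```typescript
import { isSafari, shouldUseCpuLipSync } from '@omote/core';

// Pick the lip sync model: WebKit browsers get the 1.8MB CPU model,
// everything else gets the 384MB WebGPU model.
const lipSyncModelUrl = shouldUseCpuLipSync()
  ? '/models/wav2arkit_cpu.onnx'
  : '/models/unified_wav2vec2_asr_a2e.onnx';

if (isSafari()) {
  console.log('Safari detected; ONNX Runtime WebGPU JSEP crashes here, staying on WASM');
}
```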
@@ -377,200 +410,62 @@ declare function shouldUseNativeASR(): boolean;
  declare function shouldUseServerLipSync(): boolean;
 
  /**
- * Lazy ONNX Runtime loader with conditional WebGPU/WASM bundle loading
+ * Common interface for lip sync inference backends
  *
- * This module provides a way to dynamically load the appropriate ONNX Runtime bundle
- * based on the platform's capabilities. This is critical for iOS support because:
+ * Both Wav2Vec2Inference (GPU, 384MB) and Wav2ArkitCpuInference (CPU, 1.8MB)
+ * implement this interface, allowing SyncedAudioPipeline and LAMPipeline to
+ * work with either model transparently.
  *
- * 1. iOS Safari has WebGPU API but ONNX Runtime's WebGPU backend crashes
- * 2. Loading the WebGPU bundle wastes bandwidth and can cause issues
- * 3. WASM-only bundle is smaller and more reliable on iOS
- *
- * Usage:
- * ```typescript
- * const ort = await getOnnxRuntime('wasm'); // Load WASM-only bundle
- * const ort = await getOnnxRuntime('webgpu'); // Load WebGPU bundle (includes WASM)
- * ```
- *
- * @module inference/onnxLoader
+ * @category Inference
  */
 
- type OrtModule = {
- InferenceSession: typeof InferenceSession;
- Tensor: typeof Tensor;
- env: Env;
- };
- type SessionOptions = InferenceSession.SessionOptions;
-
- /**
- * Check if WebGPU is available and likely to work
- *
- * This is more thorough than just checking navigator.gpu exists.
- * It actually requests an adapter to verify the GPU is accessible.
- *
- * @returns true if WebGPU is available and working
- */
- declare function isWebGPUAvailable(): Promise<boolean>;
- /**
- * Load ONNX Runtime with the specified backend
- *
- * This lazily loads the appropriate bundle:
- * - 'wasm': Loads onnxruntime-web (WASM-only, smaller)
- * - 'webgpu': Loads onnxruntime-web/webgpu (includes WebGPU + WASM fallback)
- *
- * Once loaded, the same instance is reused for all subsequent calls.
- * If you need to switch backends, you must reload the page.
- *
- * @param backend The backend to load ('webgpu' or 'wasm')
- * @returns The ONNX Runtime module
- */
- declare function getOnnxRuntime(backend: RuntimeBackend): Promise<OrtModule>;
- /**
- * Get the appropriate ONNX Runtime based on user preference
- *
- * This resolves the user's preference against platform capabilities
- * and loads the appropriate bundle.
- *
- * @param preference User's backend preference
- * @returns The ONNX Runtime module and the resolved backend
- */
- declare function getOnnxRuntimeForPreference(preference?: BackendPreference): Promise<{
- ort: OrtModule;
- backend: RuntimeBackend;
- }>;
  /**
- * Get session options for creating an inference session
- *
- * This returns optimized session options based on the backend and platform.
- *
- * @param backend The backend being used
- * @returns Session options for InferenceSession.create()
+ * Model loading information returned by load()
  */
- declare function getSessionOptions(backend: RuntimeBackend): SessionOptions;
- /**
- * Create an inference session with automatic fallback
- *
- * If WebGPU session creation fails, automatically falls back to WASM.
- *
- * @param modelBuffer The model data as ArrayBuffer
- * @param preferredBackend The preferred backend
- * @returns The created session and the backend used
- */
- declare function createSessionWithFallback(modelBuffer: ArrayBuffer, preferredBackend: RuntimeBackend): Promise<{
- session: InferenceSession;
+ interface LipSyncModelInfo {
  backend: RuntimeBackend;
- }>;
- /**
- * Get the currently loaded backend (if any)
- */
- declare function getLoadedBackend(): RuntimeBackend | null;
- /**
- * Check if ONNX Runtime has been loaded
- */
- declare function isOnnxRuntimeLoaded(): boolean;
-
- /**
- * Unified Wav2Vec2 inference engine for Audio-to-Expression + ASR
- *
- * Runs entirely in the browser using WebGPU or WASM.
- * Takes raw 16kHz audio and outputs:
- * - 52 ARKit blendshapes (lip sync)
- * - 32-token CTC logits (speech recognition)
- *
- * @category Inference
- *
- * @example Basic usage
- * ```typescript
- * import { Wav2Vec2Inference } from '@omote/core';
- *
- * const wav2vec = new Wav2Vec2Inference({ modelUrl: '/models/unified_wav2vec2_asr_a2e.onnx' });
- * await wav2vec.load();
- *
- * // Process 1 second of audio (16kHz = 16000 samples)
- * const result = await wav2vec.infer(audioSamples);
- *
- * console.log('Blendshapes:', result.blendshapes); // [30, 52] for 30fps
- * console.log('ASR text:', result.text); // Decoded transcription
- * ```
- */
-
- type InferenceBackend = BackendPreference;
- interface Wav2Vec2InferenceConfig {
- /** Path or URL to the ONNX model */
- modelUrl: string;
- /** Preferred backend (auto will try WebGPU first, fallback to WASM) */
- backend?: InferenceBackend;
- /** Number of identity classes (default: 12 for streaming model) */
- numIdentityClasses?: number;
- }
- interface ModelInfo {
- backend: 'webgpu' | 'wasm';
  loadTimeMs: number;
  inputNames: string[];
  outputNames: string[];
  }
  /**
- * LAM model blendshape names in order (52 total)
- * NOTE: This is alphabetical ordering used by LAM, different from standard ARKit order
+ * Result from lip sync inference
+ *
+ * All implementations must return blendshapes in LAM_BLENDSHAPES order (alphabetical).
+ * Models with different native orderings must remap internally before returning.
  */
- declare const LAM_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
- /** CTC vocabulary (32 tokens from wav2vec2-base-960h) */
- declare const CTC_VOCAB: string[];
- interface Wav2Vec2Result {
- /** Blendshape weights [frames, 52] - 30fps */
+ interface LipSyncResult {
+ /** Blendshape weights [frames, 52] in LAM_BLENDSHAPES order - 30fps */
  blendshapes: Float32Array[];
- /** Raw CTC logits [frames, 32] - 50fps */
- asrLogits: Float32Array[];
- /** Decoded text from CTC */
- text: string;
- /** Number of A2E frames (30fps) */
- numA2EFrames: number;
- /** Number of ASR frames (50fps) */
- numASRFrames: number;
+ /** Number of blendshape frames */
+ numFrames: number;
  /** Inference time in ms */
  inferenceTimeMs: number;
  }
- declare class Wav2Vec2Inference {
- private session;
- private ort;
- private config;
- private _backend;
- private isLoading;
- private numIdentityClasses;
- private inferenceQueue;
- constructor(config: Wav2Vec2InferenceConfig);
- /**
- * Check if WebGPU is available and working
- * (iOS returns false even if navigator.gpu exists due to ONNX Runtime bugs)
- */
- static isWebGPUAvailable: typeof isWebGPUAvailable;
- get backend(): 'webgpu' | 'wasm' | null;
- get isLoaded(): boolean;
+ /**
+ * Common interface for lip sync inference engines
+ *
+ * Implemented by:
+ * - Wav2Vec2Inference (WebGPU/WASM, 384MB, ASR + lip sync)
+ * - Wav2ArkitCpuInference (WASM-only, 1.8MB, lip sync only)
+ */
+ interface LipSyncBackend {
+ /** Current backend type (webgpu, wasm, or null if not loaded) */
+ readonly backend: RuntimeBackend | null;
+ /** Whether the model is loaded and ready for inference */
+ readonly isLoaded: boolean;
  /**
  * Load the ONNX model
+ * @returns Model loading information
  */
- load(): Promise<ModelInfo>;
+ load(): Promise<LipSyncModelInfo>;
  /**
  * Run inference on raw audio
- * @param audioSamples - Float32Array of raw audio at 16kHz (16000 samples = 1 second)
- * @param identityIndex - Optional identity index (0-11, default 0 = neutral)
- *
- * Note: Model expects 1-second chunks (16000 samples) for optimal performance.
- * Audio will be zero-padded or truncated to 16000 samples.
+ * @param audioSamples - Float32Array of raw audio at 16kHz
+ * @param identityIndex - Optional identity index (ignored by CPU model)
+ * @returns Lip sync result with blendshapes in LAM_BLENDSHAPES order
  */
- infer(audioSamples: Float32Array, identityIndex?: number): Promise<Wav2Vec2Result>;
- /**
- * Decode CTC logits to text using greedy decoding
- */
- private decodeCTC;
- /**
- * Queue inference to serialize ONNX session calls
- */
- private queueInference;
- /**
- * Get blendshape value by name for a specific frame
- */
- getBlendshape(blendshapes: Float32Array, name: typeof LAM_BLENDSHAPES[number]): number;
+ infer(audioSamples: Float32Array, identityIndex?: number): Promise<LipSyncResult>;
  /**
  * Dispose of the model and free resources
  */
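Because both engines now share the LipSyncBackend contract, calling code can stay model-agnostic. A sketch, relying on the statement above that the two classes work interchangeably behind this interface; model URLs mirror the examples later in this diff:

```typescript
import {
  Wav2ArkitCpuInference,
  Wav2Vec2Inference,
  shouldUseCpuLipSync,
  type LipSyncBackend,
  type LipSyncResult,
} from '@omote/core';

// Choose the engine once; the rest of the code only sees LipSyncBackend.
const lam: LipSyncBackend = shouldUseCpuLipSync()
  ? new Wav2ArkitCpuInference({ modelUrl: '/models/wav2arkit_cpu.onnx' })
  : new Wav2Vec2Inference({ modelUrl: '/models/unified_wav2vec2_asr_a2e.onnx' });

async function runLipSync(backend: LipSyncBackend, audio: Float32Array): Promise<LipSyncResult> {
  if (!backend.isLoaded) {
    const info = await backend.load();
    console.log(`lip sync model ready on ${info.backend} in ${info.loadTimeMs}ms`);
  }
  // Blendshapes come back in LAM_BLENDSHAPES order regardless of the engine.
  return backend.infer(audio);
}

void runLipSync(lam, new Float32Array(16000)); // 1 second of silence, just to exercise the path
```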
@@ -642,7 +537,7 @@ declare class LAMPipeline {
  * @param timestamp - AudioContext time when these samples start playing
  * @param lam - LAM inference engine
  */
- push(samples: Float32Array, timestamp: number, lam: Wav2Vec2Inference): Promise<void>;
+ push(samples: Float32Array, timestamp: number, lam: LipSyncBackend): Promise<void>;
  /**
  * Process accumulated buffer through LAM inference
  */
@@ -693,7 +588,7 @@ declare class LAMPipeline {
  *
  * @param lam - LAM inference engine
  */
- flush(lam: Wav2Vec2Inference): Promise<void>;
+ flush(lam: LipSyncBackend): Promise<void>;
  /**
  * Adjust all queued frame timestamps by an offset
  *
@@ -710,25 +605,25 @@ declare class LAMPipeline {
  }
 
  /**
- * SyncedAudioPipeline - Enterprise-grade audio + LAM synchronization coordinator
+ * SyncedAudioPipeline - Audio playback + LAM lip sync coordinator
  *
  * Orchestrates the complete pipeline for synchronized audio playback and lip sync:
  * 1. Network chunks → Coalescer → Optimized buffers
- * 2. Audio buffers → Scheduler → Gapless playback
- * 3. Audio buffers → LAM Pipeline → Blendshape frames
+ * 2. Audio buffers → Scheduler → Gapless playback (immediate, never blocks)
+ * 3. Audio buffers → LAM Pipeline → Blendshape frames (background, fire-and-forget)
  * 4. Frames synchronized to AudioContext clock → Renderer
  *
- * Key Architecture Pattern: Wait-for-First-LAM
- * - Buffers incoming audio chunks without scheduling playback
- * - Waits for first LAM inference to complete (ensures LAM frames are ready)
- * - Then schedules all buffered audio + LAM frames together
- * - Result: Perfect synchronization from frame 1, no lag compensation needed
+ * Key Architecture Pattern: Audio-First, LAM-Background
+ * - Audio chunks are scheduled for playback immediately (never waits for LAM)
+ * - LAM inference runs in background without blocking the audio path
+ * - Lip sync starts ~1 second after audio (LAM needs 16000 samples to infer)
+ * - Once LAM catches up, frames stay synchronized to AudioContext clock
  *
- * This is a deterministic, enterprise-grade solution suitable for production use.
- * No hacks, no lag detection, no frame skipping - just guaranteed synchronization.
+ * This decoupled design prevents LAM inference (50-300ms) from blocking audio
+ * scheduling, which caused audible stuttering when audio arrived as a continuous
+ * stream (e.g., single-call TTS from ElevenLabs via AgentCore).
  *
  * @see https://web.dev/articles/audio-scheduling (Web Audio clock patterns)
- * @see https://developer.chrome.com/blog/audio-worklet-design-pattern (Ring buffer patterns)
  * @category Audio
  */
 
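The audio-first flow reads most clearly as a wiring sketch. This assumes the package's EventEmitter exposes an `on(event, handler)` method and that `applyBlendshapes` and `ttsChunks` stand in for the host app's renderer and network source; everything else comes from declarations in this diff:

```typescript
import { SyncedAudioPipeline, createLipSync } from '@omote/core';

declare function applyBlendshapes(frame: Float32Array): void;  // hypothetical renderer hook
declare function ttsChunks(): AsyncIterable<Uint8Array>;       // hypothetical Int16 PCM source

const pipeline = new SyncedAudioPipeline({
  lam: createLipSync({
    gpuModelUrl: '/models/unified_wav2vec2_asr_a2e.onnx',
    cpuModelUrl: '/models/wav2arkit_cpu.onnx',
  }),
  chunkTargetMs: 200,
});

// Assumed EventEmitter API: on(event, handler).
pipeline.on('playback_start', (time: number) => console.log('first chunk scheduled at', time));
pipeline.on('frame_ready', (frame: Float32Array) => applyBlendshapes(frame));

async function play(): Promise<void> {
  pipeline.start();                       // reset state; chunks are scheduled immediately
  for await (const chunk of ttsChunks()) {
    await pipeline.onAudioChunk(chunk);   // audio goes out now, LAM inference trails in background
  }
}

void play();
```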
@@ -738,14 +633,14 @@ interface SyncedAudioPipelineOptions {
  /** Target chunk duration in ms for coalescing (default: 200) */
  chunkTargetMs?: number;
  /** LAM inference engine */
- lam: Wav2Vec2Inference;
+ lam: LipSyncBackend;
  }
  interface SyncedAudioPipelineEvents {
  /** New frame ready for display */
  frame_ready: Float32Array;
  /** Playback has completed */
  playback_complete: void;
- /** First LAM inference completed, playback starting */
+ /** First audio chunk scheduled, playback starting */
  playback_start: number;
  /** Error occurred */
  error: Error;
@@ -757,8 +652,7 @@ declare class SyncedAudioPipeline extends EventEmitter<SyncedAudioPipelineEvents
  private scheduler;
  private coalescer;
  private lamPipeline;
- private waitingForFirstLAM;
- private bufferedChunks;
+ private playbackStarted;
  private monitorInterval;
  private frameAnimationId;
  constructor(options: SyncedAudioPipelineOptions);
@@ -770,31 +664,19 @@ declare class SyncedAudioPipeline extends EventEmitter<SyncedAudioPipelineEvents
  * Start a new playback session
  *
  * Resets all state and prepares for incoming audio chunks.
- * Enables wait-for-first-LAM synchronization.
+ * Audio will be scheduled immediately as chunks arrive (no buffering).
  */
  start(): void;
  /**
  * Receive audio chunk from network
  *
- * Implements wait-for-first-LAM pattern:
- * - Chunks are coalesced into optimal buffers
- * - Buffers are sent to LAM for processing
- * - Audio scheduling waits until first LAM completes
- * - Then all buffered audio is scheduled together with LAM frames
+ * Audio-first design: schedules audio immediately, LAM runs in background.
+ * This prevents LAM inference (50-300ms) from blocking audio scheduling,
+ * which caused audible stuttering with continuous audio streams.
  *
  * @param chunk - Uint8Array containing Int16 PCM audio
  */
  onAudioChunk(chunk: Uint8Array): Promise<void>;
- /**
- * Handle first LAM inference completion
- *
- * This is the critical synchronization point:
- * - LAM frames are now ready in the queue
- * - Schedule all buffered audio chunks
- * - Adjust LAM frame timestamps to match actual schedule time
- * - Audio and LAM start playing together, perfectly synchronized
- */
- private onFirstLAMComplete;
  /**
  * End of audio stream
  *
@@ -840,8 +722,7 @@ declare class SyncedAudioPipeline extends EventEmitter<SyncedAudioPipelineEvents
  * Get current pipeline state (for debugging/monitoring)
  */
  getState(): {
- waitingForFirstLAM: boolean;
- bufferedChunks: number;
+ playbackStarted: boolean;
  coalescerFill: number;
  lamFill: number;
  queuedFrames: number;
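For debugging, the reshaped getState() can be polled. A small sketch, assuming a SyncedAudioPipeline instance created elsewhere and logging only the fields visible in this hunk (the one-second interval is an arbitrary choice):

```typescript
import type { SyncedAudioPipeline } from '@omote/core';

declare const pipeline: SyncedAudioPipeline;  // created elsewhere

const monitor = setInterval(() => {
  const s = pipeline.getState();
  console.log(
    `playbackStarted=${s.playbackStarted} coalescerFill=${s.coalescerFill}`,
    `lamFill=${s.lamFill} queuedFrames=${s.queuedFrames}`
  );
}, 1000);

// Remember to clearInterval(monitor) once the session is disposed.
```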
@@ -854,6 +735,99 @@ declare class SyncedAudioPipeline extends EventEmitter<SyncedAudioPipelineEvents
  dispose(): void;
  }
 
+ /**
+ * Lazy ONNX Runtime loader with conditional WebGPU/WASM bundle loading
+ *
+ * This module provides a way to dynamically load the appropriate ONNX Runtime bundle
+ * based on the platform's capabilities. This is critical for iOS support because:
+ *
+ * 1. iOS Safari has WebGPU API but ONNX Runtime's WebGPU backend crashes
+ * 2. Loading the WebGPU bundle wastes bandwidth and can cause issues
+ * 3. WASM-only bundle is smaller and more reliable on iOS
+ *
+ * Usage:
+ * ```typescript
+ * const ort = await getOnnxRuntime('wasm'); // Load WASM-only bundle
+ * const ort = await getOnnxRuntime('webgpu'); // Load WebGPU bundle (includes WASM)
+ * ```
+ *
+ * @module inference/onnxLoader
+ */
+
+ type OrtModule = {
+ InferenceSession: typeof InferenceSession;
+ Tensor: typeof Tensor;
+ env: Env;
+ };
+ type SessionOptions = InferenceSession.SessionOptions;
+
+ /**
+ * Check if WebGPU is available and likely to work
+ *
+ * This is more thorough than just checking navigator.gpu exists.
+ * It actually requests an adapter to verify the GPU is accessible.
+ *
+ * @returns true if WebGPU is available and working
+ */
+ declare function isWebGPUAvailable(): Promise<boolean>;
+ /**
+ * Load ONNX Runtime with the specified backend
+ *
+ * This lazily loads the appropriate bundle:
+ * - 'wasm': Loads onnxruntime-web (WASM-only, smaller)
+ * - 'webgpu': Loads onnxruntime-web/webgpu (includes WebGPU + WASM fallback)
+ *
+ * Once loaded, the same instance is reused for all subsequent calls.
+ * If you need to switch backends, you must reload the page.
+ *
+ * @param backend The backend to load ('webgpu' or 'wasm')
+ * @returns The ONNX Runtime module
+ */
+ declare function getOnnxRuntime(backend: RuntimeBackend): Promise<OrtModule>;
+ /**
+ * Get the appropriate ONNX Runtime based on user preference
+ *
+ * This resolves the user's preference against platform capabilities
+ * and loads the appropriate bundle.
+ *
+ * @param preference User's backend preference
+ * @returns The ONNX Runtime module and the resolved backend
+ */
+ declare function getOnnxRuntimeForPreference(preference?: BackendPreference): Promise<{
+ ort: OrtModule;
+ backend: RuntimeBackend;
+ }>;
+ /**
+ * Get session options for creating an inference session
+ *
+ * This returns optimized session options based on the backend and platform.
+ *
+ * @param backend The backend being used
+ * @returns Session options for InferenceSession.create()
+ */
+ declare function getSessionOptions(backend: RuntimeBackend): SessionOptions;
+ /**
+ * Create an inference session with automatic fallback
+ *
+ * If WebGPU session creation fails, automatically falls back to WASM.
+ *
+ * @param modelBuffer The model data as ArrayBuffer
+ * @param preferredBackend The preferred backend
+ * @returns The created session and the backend used
+ */
+ declare function createSessionWithFallback(modelBuffer: ArrayBuffer, preferredBackend: RuntimeBackend): Promise<{
+ session: InferenceSession;
+ backend: RuntimeBackend;
+ }>;
+ /**
+ * Get the currently loaded backend (if any)
+ */
+ declare function getLoadedBackend(): RuntimeBackend | null;
+ /**
+ * Check if ONNX Runtime has been loaded
+ */
+ declare function isOnnxRuntimeLoaded(): boolean;
+
  /**
  * Whisper Automatic Speech Recognition using transformers.js
  * Uses Xenova's proven pipeline API for reliable transcription
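The loader block above was moved within the file rather than removed, and its public functions still compose as before. A sketch of loading a custom ONNX model with automatic fallback; the model path is a placeholder, and 'auto' is assumed to be a valid BackendPreference value (as the createLipSync defaults later in this diff suggest):

```typescript
import {
  createSessionWithFallback,
  getLoadedBackend,
  getOnnxRuntimeForPreference,
} from '@omote/core';

async function loadCustomModel(url: string) {
  // Resolve the user preference against platform capabilities and load the right bundle.
  const { backend } = await getOnnxRuntimeForPreference('auto');

  const buffer = await (await fetch(url)).arrayBuffer();

  // WebGPU session creation failures fall back to WASM automatically.
  const { session, backend: used } = await createSessionWithFallback(buffer, backend);
  console.log(`session created on ${used}; loader reports ${getLoadedBackend()}`);
  return session;
}

void loadCustomModel('/models/custom.onnx');  // placeholder path
```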
@@ -982,6 +956,288 @@ declare class WhisperInference {
  private removeNonSpeechTokens;
  }
 
+ /**
+ * Shared blendshape constants and utilities for lip sync inference
+ *
+ * Contains LAM_BLENDSHAPES (canonical ordering), symmetrization, and
+ * index remapping used by both Wav2Vec2Inference and Wav2ArkitCpuInference.
+ *
+ * This module is the single source of truth for blendshape ordering to
+ * avoid circular dependencies between inference classes.
+ *
+ * @category Inference
+ */
+ /**
+ * LAM model blendshape names in order (52 total)
+ * NOTE: This is alphabetical ordering used by LAM, different from standard ARKit order
+ */
+ declare const LAM_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
+ /** Alias for backwards compatibility */
+ declare const ARKIT_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
+ /**
+ * Symmetrize blendshapes by averaging left/right pairs
+ * From LAM official postprocessing (models/utils.py)
+ * This fixes asymmetric output from the raw model
+ */
+ declare function symmetrizeBlendshapes(frame: Float32Array): Float32Array;
+ /**
+ * wav2arkit_cpu model blendshape ordering
+ *
+ * Indices 0-24 match LAM_BLENDSHAPES, but 25+ diverge:
+ * - LAM puts jawRight, mouthClose, mouthDimpleLeft, mouthDimpleRight at 25-28
+ * - wav2arkit_cpu puts mouthFrownLeft at 25 and moves those four to 48-51
+ */
+ declare const WAV2ARKIT_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "jawRight"];
+ /**
+ * Remap a blendshape frame from wav2arkit_cpu ordering to LAM_BLENDSHAPES ordering
+ *
+ * @param frame - Float32Array of 52 blendshape values in wav2arkit_cpu order
+ * @returns Float32Array of 52 blendshape values in LAM_BLENDSHAPES order
+ */
+ declare function remapWav2ArkitToLam(frame: Float32Array): Float32Array;
+
+ /**
+ * Unified Wav2Vec2 inference engine for Audio-to-Expression + ASR
+ *
+ * Runs entirely in the browser using WebGPU or WASM.
+ * Takes raw 16kHz audio and outputs:
+ * - 52 ARKit blendshapes (lip sync)
+ * - 32-token CTC logits (speech recognition)
+ *
+ * @category Inference
+ *
+ * @example Basic usage
+ * ```typescript
+ * import { Wav2Vec2Inference } from '@omote/core';
+ *
+ * const wav2vec = new Wav2Vec2Inference({ modelUrl: '/models/unified_wav2vec2_asr_a2e.onnx' });
+ * await wav2vec.load();
+ *
+ * // Process 1 second of audio (16kHz = 16000 samples)
+ * const result = await wav2vec.infer(audioSamples);
+ *
+ * console.log('Blendshapes:', result.blendshapes); // [30, 52] for 30fps
+ * console.log('ASR text:', result.text); // Decoded transcription
+ * ```
+ */
+
+ type InferenceBackend = BackendPreference;
+ interface Wav2Vec2InferenceConfig {
+ /** Path or URL to the ONNX model */
+ modelUrl: string;
+ /** Preferred backend (auto will try WebGPU first, fallback to WASM) */
+ backend?: InferenceBackend;
+ /** Number of identity classes (default: 12 for streaming model) */
+ numIdentityClasses?: number;
+ }
+ interface ModelInfo {
+ backend: 'webgpu' | 'wasm';
+ loadTimeMs: number;
+ inputNames: string[];
+ outputNames: string[];
+ }
+
+ /** CTC vocabulary (32 tokens from wav2vec2-base-960h) */
+ declare const CTC_VOCAB: string[];
+ interface Wav2Vec2Result {
+ /** Blendshape weights [frames, 52] - 30fps */
+ blendshapes: Float32Array[];
+ /** Raw CTC logits [frames, 32] - 50fps */
+ asrLogits: Float32Array[];
+ /** Decoded text from CTC */
+ text: string;
+ /** Number of blendshape frames (30fps) — alias for numA2EFrames */
+ numFrames: number;
+ /** Number of A2E frames (30fps) */
+ numA2EFrames: number;
+ /** Number of ASR frames (50fps) */
+ numASRFrames: number;
+ /** Inference time in ms */
+ inferenceTimeMs: number;
+ }
+ declare class Wav2Vec2Inference {
+ private session;
+ private ort;
+ private config;
+ private _backend;
+ private isLoading;
+ private numIdentityClasses;
+ private inferenceQueue;
+ constructor(config: Wav2Vec2InferenceConfig);
+ /**
+ * Check if WebGPU is available and working
+ * (iOS returns false even if navigator.gpu exists due to ONNX Runtime bugs)
+ */
+ static isWebGPUAvailable: typeof isWebGPUAvailable;
+ get backend(): 'webgpu' | 'wasm' | null;
+ get isLoaded(): boolean;
+ /**
+ * Load the ONNX model
+ */
+ load(): Promise<ModelInfo>;
+ /**
+ * Run inference on raw audio
+ * @param audioSamples - Float32Array of raw audio at 16kHz (16000 samples = 1 second)
+ * @param identityIndex - Optional identity index (0-11, default 0 = neutral)
+ *
+ * Note: Model expects 1-second chunks (16000 samples) for optimal performance.
+ * Audio will be zero-padded or truncated to 16000 samples.
+ */
+ infer(audioSamples: Float32Array, identityIndex?: number): Promise<Wav2Vec2Result>;
+ /**
+ * Decode CTC logits to text using greedy decoding
+ */
+ private decodeCTC;
+ /**
+ * Queue inference to serialize ONNX session calls
+ */
+ private queueInference;
+ /**
+ * Get blendshape value by name for a specific frame
+ */
+ getBlendshape(blendshapes: Float32Array, name: typeof LAM_BLENDSHAPES[number]): number;
+ /**
+ * Dispose of the model and free resources
+ */
+ dispose(): Promise<void>;
+ }
+
+ /**
+ * CPU-optimized lip sync inference using wav2arkit_cpu model
+ *
+ * A lightweight (1.8MB) alternative to Wav2Vec2Inference (384MB) designed
+ * for Safari/iOS where WebGPU crashes due to ONNX Runtime JSEP bugs.
+ *
+ * Key differences from Wav2Vec2Inference:
+ * - WASM-only backend (CPU-optimized, no WebGPU)
+ * - 1.8MB model vs 384MB
+ * - No identity input (baked to identity 11)
+ * - No ASR output (lip sync only)
+ * - Dynamic input length (not fixed to 16000 samples)
+ * - Different native blendshape ordering (remapped to LAM_BLENDSHAPES)
+ *
+ * @category Inference
+ *
+ * @example
+ * ```typescript
+ * import { Wav2ArkitCpuInference } from '@omote/core';
+ *
+ * const lam = new Wav2ArkitCpuInference({
+ * modelUrl: '/models/wav2arkit_cpu.onnx',
+ * });
+ * await lam.load();
+ *
+ * const { blendshapes } = await lam.infer(audioSamples);
+ * // blendshapes: Float32Array[] in LAM_BLENDSHAPES order, 30fps
+ * ```
+ */
+
+ interface Wav2ArkitCpuConfig {
+ /** Path or URL to the wav2arkit_cpu ONNX model */
+ modelUrl: string;
+ /** Preferred backend (default: 'wasm' — this model is CPU-optimized) */
+ backend?: BackendPreference;
+ }
+ declare class Wav2ArkitCpuInference implements LipSyncBackend {
+ private session;
+ private ort;
+ private config;
+ private _backend;
+ private isLoading;
+ private inferenceQueue;
+ constructor(config: Wav2ArkitCpuConfig);
+ get backend(): RuntimeBackend | null;
+ get isLoaded(): boolean;
+ /**
+ * Load the ONNX model
+ */
+ load(): Promise<LipSyncModelInfo>;
+ /**
+ * Run inference on raw audio
+ *
+ * Accepts variable-length audio (not fixed to 16000 samples).
+ * Output frames = ceil(30 * numSamples / 16000).
+ *
+ * @param audioSamples - Float32Array of raw audio at 16kHz
+ * @param _identityIndex - Ignored (identity 11 is baked into the model)
+ */
+ infer(audioSamples: Float32Array, _identityIndex?: number): Promise<LipSyncResult>;
+ /**
+ * Queue inference to serialize ONNX session calls
+ */
+ private queueInference;
+ /**
+ * Dispose of the model and free resources
+ */
+ dispose(): Promise<void>;
+ }
+
+ /**
+ * Factory function for lip sync with automatic GPU/CPU model selection
+ *
+ * Provides a unified API that automatically selects the optimal model:
+ * - Safari (macOS + iOS): Uses Wav2ArkitCpuInference (1.8MB, WASM)
+ * - Chrome/Firefox/Edge: Uses Wav2Vec2Inference (384MB, WebGPU)
+ * - Fallback: Gracefully falls back to CPU model if GPU model fails to load
+ *
+ * @category Inference
+ *
+ * @example Auto-detect (recommended)
+ * ```typescript
+ * import { createLipSync } from '@omote/core';
+ *
+ * const lam = createLipSync({
+ * gpuModelUrl: '/models/unified_wav2vec2_asr_a2e.onnx',
+ * cpuModelUrl: '/models/wav2arkit_cpu.onnx',
+ * });
+ *
+ * await lam.load();
+ * const { blendshapes } = await lam.infer(audioSamples);
+ * ```
+ *
+ * @example Force CPU model
+ * ```typescript
+ * const lam = createLipSync({
+ * gpuModelUrl: '/models/unified_wav2vec2_asr_a2e.onnx',
+ * cpuModelUrl: '/models/wav2arkit_cpu.onnx',
+ * mode: 'cpu',
+ * });
+ * ```
+ */
+
+ /**
+ * Configuration for the lip sync factory
+ */
+ interface CreateLipSyncConfig {
+ /** URL for the GPU model (Wav2Vec2, used on Chrome/Firefox/Edge) */
+ gpuModelUrl: string;
+ /** URL for the CPU model (wav2arkit_cpu, used on Safari/iOS) */
+ cpuModelUrl: string;
+ /**
+ * Model selection mode:
+ * - 'auto': Safari/iOS → CPU, everything else → GPU (default)
+ * - 'gpu': Force GPU model (Wav2Vec2Inference)
+ * - 'cpu': Force CPU model (Wav2ArkitCpuInference)
+ */
+ mode?: 'auto' | 'gpu' | 'cpu';
+ /** Backend preference for GPU model (default: 'auto') */
+ gpuBackend?: BackendPreference;
+ /** Number of identity classes for GPU model (default: 12) */
+ numIdentityClasses?: number;
+ /**
+ * Fall back to CPU model if GPU model fails to load (default: true)
+ * Only applies when mode is 'auto' or 'gpu'
+ */
+ fallbackOnError?: boolean;
+ }
+ /**
+ * Create a lip sync instance with automatic GPU/CPU model selection
+ *
+ * @param config - Factory configuration
+ * @returns A LipSyncBackend instance (either GPU or CPU model)
+ */
+ declare function createLipSync(config: CreateLipSyncConfig): LipSyncBackend;
+
  /**
  * Silero VAD (Voice Activity Detection) inference
  *
@@ -3809,4 +4065,4 @@ declare class EmphasisDetector {
  reset(): void;
  }
 
- export { type AIAdapter, type AIAdapterEvents, type AISessionState, type ActiveSpan, AgentCoreAdapter, type AgentCoreConfig, type AnimationClip, AnimationEvent, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, type AudioSyncConfig, type AudioSyncEvents, AudioSyncManager, type BackendPreference, type BlendWeight, CTC_VOCAB, type CacheConfig, type CacheSpanAttributes, ConsoleExporter, type ConversationMessage, ConversationOrchestrator, type ConversationSession, DEFAULT_ANIMATION_CONFIG, EMOTION_NAMES, EMOTION_VECTOR_SIZE, type EmotionAnimationMap, EmotionController, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, type EmotionWeights, EmphasisDetector, EventEmitter, type FetchWithCacheOptions, HF_CDN_TEST_URL, type HuggingFaceUrlInfo, INFERENCE_LATENCY_BUCKETS, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, type LAMFrame, LAMPipeline, type LAMPipelineOptions, LAM_BLENDSHAPES, MODEL_LOAD_TIME_BUCKETS, type MessageRole, type MetricData, MetricNames, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, type OrchestratorConfig, type OrchestratorEvents, type QuotaInfo, RingBuffer, type RuntimeBackend, type SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SessionConfig, type SessionOptions, type SessionSnapshot, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, SyncedAudioPipeline, type SyncedAudioPipelineEvents, type SyncedAudioPipelineOptions, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TenantConfig, TenantManager, type TenantQuota, type TenantUsage, type TokenRefreshCallback, type TranscriptionResult, type Transition, type VADBackend, type VADModelInfo, type VADResult$1 as VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, type VoiceConfig, Wav2Vec2Inference, type Wav2Vec2InferenceConfig, type Wav2Vec2Result, type WhisperConfig, WhisperInference, type WhisperModel, blendEmotions, calculatePeak, calculateRMS, clearSpecificCache, clearTransformersCache, configureCacheLimit, configureTelemetry, createEmotionVector, createSessionWithFallback, createSileroVAD, fetchWithCache, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getLoadedBackend, getModelCache, getOnnxRuntime, getOnnxRuntimeForPreference, getOptimalWasmThreads, getRecommendedBackend, getSessionOptions, getTelemetry, hasWebGPUApi, isAndroid, isHuggingFaceCDNReachable, isIOS, isIOSSafari, isMobile, isOnnxRuntimeLoaded, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpEmotion, listCaches, nukeBrowserCaches, parseHuggingFaceUrl, preloadModels, resolveBackend, scanForInvalidCaches, shouldEnableWasmProxy, shouldUseNativeASR, shouldUseServerLipSync, supportsVADWorker, validateCachedResponse };
+ export { type AIAdapter, type AIAdapterEvents, type AISessionState, ARKIT_BLENDSHAPES, type ActiveSpan, AgentCoreAdapter, type AgentCoreConfig, type AnimationClip, AnimationEvent, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, type AudioSyncConfig, type AudioSyncEvents, AudioSyncManager, type BackendPreference, type BlendWeight, CTC_VOCAB, type CacheConfig, type CacheSpanAttributes, ConsoleExporter, type ConversationMessage, ConversationOrchestrator, type ConversationSession, type CreateLipSyncConfig, DEFAULT_ANIMATION_CONFIG, EMOTION_NAMES, EMOTION_VECTOR_SIZE, type EmotionAnimationMap, EmotionController, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, type EmotionWeights, EmphasisDetector, EventEmitter, type FetchWithCacheOptions, HF_CDN_TEST_URL, type HuggingFaceUrlInfo, INFERENCE_LATENCY_BUCKETS, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, type LAMFrame, LAMPipeline, type LAMPipelineOptions, LAM_BLENDSHAPES, type LipSyncBackend, type LipSyncModelInfo, type LipSyncResult, MODEL_LOAD_TIME_BUCKETS, type MessageRole, type MetricData, MetricNames, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, type OrchestratorConfig, type OrchestratorEvents, type QuotaInfo, RingBuffer, type RuntimeBackend, type SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SessionConfig, type SessionOptions, type SessionSnapshot, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, SyncedAudioPipeline, type SyncedAudioPipelineEvents, type SyncedAudioPipelineOptions, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TenantConfig, TenantManager, type TenantQuota, type TenantUsage, type TokenRefreshCallback, type TranscriptionResult, type Transition, type VADBackend, type VADModelInfo, type VADResult$1 as VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, type VoiceConfig, WAV2ARKIT_BLENDSHAPES, type Wav2ArkitCpuConfig, Wav2ArkitCpuInference, Wav2Vec2Inference, type Wav2Vec2InferenceConfig, type Wav2Vec2Result, type WhisperConfig, WhisperInference, type WhisperModel, blendEmotions, calculatePeak, calculateRMS, clearSpecificCache, clearTransformersCache, configureCacheLimit, configureTelemetry, createEmotionVector, createLipSync, createSessionWithFallback, createSileroVAD, fetchWithCache, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getLoadedBackend, getModelCache, getOnnxRuntime, getOnnxRuntimeForPreference, getOptimalWasmThreads, getRecommendedBackend, getSessionOptions, getTelemetry, hasWebGPUApi, isAndroid, isHuggingFaceCDNReachable, isIOS, isIOSSafari, isMobile, isOnnxRuntimeLoaded, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpEmotion, listCaches, nukeBrowserCaches, parseHuggingFaceUrl, preloadModels, remapWav2ArkitToLam, resolveBackend, scanForInvalidCaches, shouldEnableWasmProxy, shouldUseCpuLipSync, shouldUseNativeASR, shouldUseServerLipSync, supportsVADWorker, symmetrizeBlendshapes, validateCachedResponse };
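The newly exported blendshape constants make the CPU-to-LAM remap easy to reason about. An illustrative sketch only (the package already ships remapWav2ArkitToLam and symmetrizeBlendshapes, exported above), showing how an index map follows from the two orderings and how to read a channel by name:

```typescript
import { LAM_BLENDSHAPES, WAV2ARKIT_BLENDSHAPES } from '@omote/core';

// For each LAM slot, find where that blendshape lives in the wav2arkit_cpu output.
const toLamIndex = LAM_BLENDSHAPES.map((name) => WAV2ARKIT_BLENDSHAPES.indexOf(name));

// Illustrative reimplementation; prefer the exported remapWav2ArkitToLam.
function remapFrame(frame: Float32Array): Float32Array {
  const out = new Float32Array(LAM_BLENDSHAPES.length);
  toLamIndex.forEach((srcIdx, lamIdx) => {
    out[lamIdx] = frame[srcIdx];
  });
  return out;
}

// Reading a single channel from any LAM-ordered frame:
const jawOpen = (frame: Float32Array) => remapFrame(frame)[LAM_BLENDSHAPES.indexOf('jawOpen')];
```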