npm - @omote/core - Versions diffs - 0.4.5 → 0.4.7 - Mend

@omote/core 0.4.5 → 0.4.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/dist/index.d.mts CHANGED Viewed

@@ -1435,298 +1435,634 @@ declare class SenseVoiceInference {
 }
 /**
- * Kaldi-compatible filterbank (fbank) feature extraction
- *
- * Pure TypeScript implementation matching kaldi-native-fbank parameters
- * used by SenseVoice. No external dependencies.
- *
- * Pipeline: audio → framing → windowing → FFT → power spectrum → mel filterbank → log
- *
- * @module inference/kaldiFbank
- */
-interface KaldiFbankOptions {
-    /** Frame length in ms (default: 25) */
-    frameLengthMs?: number;
-    /** Frame shift in ms (default: 10) */
-    frameShiftMs?: number;
-    /** Low frequency cutoff in Hz (default: 20) */
-    lowFreq?: number;
-    /** High frequency cutoff in Hz (default: sampleRate / 2) */
-    highFreq?: number;
-    /** Dither amount (default: 0 for deterministic output) */
-    dither?: number;
-    /** Preemphasis coefficient (default: 0.97) */
-    preemphasis?: number;
-}
-/**
- * Compute Kaldi-compatible log mel filterbank features
+ * SenseVoice ASR Web Worker implementation
  *
- * @param audio Raw audio samples (float32, [-1, 1] range)
- * @param sampleRate Sample rate in Hz (must be 16000 for SenseVoice)
- * @param numMelBins Number of mel bins (80 for SenseVoice)
- * @param opts Optional parameters
- * @returns Flattened Float32Array of shape [numFrames, numMelBins]
- */
-declare function computeKaldiFbank(audio: Float32Array, sampleRate: number, numMelBins: number, opts?: KaldiFbankOptions): Float32Array;
-/**
- * Apply Low Frame Rate stacking for SenseVoice
+ * Runs SenseVoice speech recognition in a dedicated Web Worker to prevent
+ * main thread blocking. Uses inline worker script (Blob URL pattern) to
+ * avoid separate file deployment.
  *
- * Concatenates lfrM consecutive frames with stride lfrN.
- * Left-pads with copies of first frame, right-pads last group.
+ * Key design decisions:
+ * - WASM backend only (WebGPU doesn't work in Workers)
+ * - All preprocessing (fbank, LFR, CMVN) and CTC decoding inlined in worker
+ * - Audio copied (not transferred) to retain main thread access
+ * - ONNX Runtime loaded from CDN in worker (no bundler complications)
+ * - iOS: model URL passed as string to ORT (avoids 239MB JS heap allocation)
  *
- * @param features Flattened [numFrames, featureDim]
- * @param featureDim Feature dimension per frame (e.g., 80)
- * @param lfrM Number of frames to stack (default: 7)
- * @param lfrN Stride (default: 6)
- * @returns Flattened [numOutputFrames, featureDim * lfrM]
- */
-declare function applyLFR(features: Float32Array, featureDim: number, lfrM?: number, lfrN?: number): Float32Array;
-/**
- * Apply CMVN normalization in-place
+ * @category Inference
  *
- * Formula: normalized[i] = (features[i] + negMean[i % dim]) * invStddev[i % dim]
+ * @example Basic usage
+ * ```typescript
+ * import { SenseVoiceWorker } from '@omote/core';
  *
- * @param features Flattened feature array (modified in-place)
- * @param dim Feature dimension (560 for SenseVoice after LFR)
- * @param negMean Negative mean vector (dim-dimensional)
- * @param invStddev Inverse standard deviation vector (dim-dimensional)
- * @returns The same features array (for chaining)
- */
-declare function applyCMVN(features: Float32Array, dim: number, negMean: Float32Array, invStddev: Float32Array): Float32Array;
-/**
- * Parse CMVN vectors from comma-separated strings (stored in ONNX metadata)
+ * const asr = new SenseVoiceWorker({
+ *   modelUrl: '/models/sensevoice/model.int8.onnx',
+ *   tokensUrl: '/models/sensevoice/tokens.txt',
+ * });
+ * await asr.load();
  *
- * The sherpa-onnx SenseVoice export stores neg_mean and inv_stddev
- * as comma-separated float strings in the model's metadata.
+ * const { text, emotion, language } = await asr.transcribe(audioSamples);
+ * console.log(text);       // "Hello world"
+ * console.log(emotion);    // "NEUTRAL"
+ * console.log(language);   // "en"
+ * ```
  */
-declare function parseCMVNFromMetadata(negMeanStr: string, invStddevStr: string): {
-    negMean: Float32Array;
-    invStddev: Float32Array;
-};
 /**
- * CTC greedy decoder for SenseVoice
- *
- * Decodes CTC logits into text with structured token parsing
- * for language, emotion, and audio event detection.
- *
- * @module inference/ctcDecoder
+ * Configuration for SenseVoice Worker
  */
-interface CTCDecodeResult {
-    /** Decoded text (speech content only) */
-    text: string;
-    /** Detected language (e.g., 'zh', 'en', 'ja', 'ko', 'yue') */
-    language?: string;
-    /** Detected emotion (e.g., 'HAPPY', 'SAD', 'ANGRY', 'NEUTRAL') */
-    emotion?: string;
-    /** Detected audio event (e.g., 'Speech', 'BGM', 'Laughter') */
-    event?: string;
+interface SenseVoiceWorkerConfig {
+    /**
+     * Path or URL to model.int8.onnx (239MB).
+     * On iOS the URL is handed to ONNX Runtime as a string, which avoids
+     * allocating the 239MB model on the JS heap.
+     */
+    modelUrl: string;
+    /** Path or URL to tokens.txt vocabulary file (default: sibling of modelUrl) */
+    tokensUrl?: string;
+    /** Language hint (default: 'auto' for auto-detection) */
+    language?: SenseVoiceLanguage;
+    /**
+     * Text normalization mode (default: 'with_itn').
+     * - 'with_itn': applies inverse text normalization
+     * - 'without_itn': skips inverse text normalization
+     */
+    textNorm?: 'with_itn' | 'without_itn';
 }
-/** Resolve language string to SenseVoice language ID */
-declare function resolveLanguageId(language: string): number;
-/** Resolve text norm string to SenseVoice text norm ID */
-declare function resolveTextNormId(textNorm: string): number;
 /**
- * Parse tokens.txt into a token ID → string map
+ * SenseVoice ASR Worker - Speech Recognition in a Web Worker
  *
- * Format: each line is "token_string token_id"
- * e.g., "<unk> 0", "▁the 3", "s 4"
- */
-declare function parseTokensFile(content: string): Map<number, string>;
-/**
- * CTC greedy decode
+ * Runs SenseVoice inference off the main thread to prevent UI blocking.
+ * All preprocessing (fbank, LFR, CMVN) and CTC decoding run in the worker.
  *
- * @param logits Raw logits from model output, flattened [seqLen, vocabSize]
- * @param seqLen Sequence length (time steps)
- * @param vocabSize Vocabulary size
- * @param tokenMap Token ID → string map from tokens.txt
- * @returns Decoded text and structured metadata
+ * @see SenseVoiceInference for main-thread version
  */
-declare function ctcGreedyDecode(logits: Float32Array, seqLen: number, vocabSize: number, tokenMap: Map<number, string>): CTCDecodeResult;
+declare class SenseVoiceWorker {
+    private worker;
+    private config;
+    private isLoading;
+    private _isLoaded;
+    private inferenceQueue;
+    private poisoned;
+    private pendingResolvers;
+    private languageId;
+    private textNormId;
+    constructor(config: SenseVoiceWorkerConfig);
+    get isLoaded(): boolean;
+    /**
+     * Backend type (always 'wasm' for Worker, WebGPU not supported in Workers).
+     *
+     * NOTE(review): `null` presumably means the model has not been loaded yet — confirm.
+     */
+    get backend(): 'wasm' | null;
+    /**
+     * Create the worker from inline script
+     * (Blob URL pattern, so no separate worker file has to be deployed).
+     */
+    private createWorker;
+    /**
+     * Handle messages from worker
+     */
+    private handleWorkerMessage;
+    /**
+     * Send message to worker and wait for response
+     */
+    private sendMessage;
+    /**
+     * Load the ONNX model in the worker
+     *
+     * @param onProgress - Optional progress callback. Fires once at 100% when load completes
+     *   (the worker downloads and loads the model internally, so granular progress is not available).
+     * @returns Promise resolving to the model information for the loaded model
+     */
+    load(onProgress?: (loaded: number, total: number) => void): Promise<SenseVoiceModelInfo>;
+    /**
+     * Transcribe audio samples to text
+     *
+     * The samples are copied (not transferred) to the worker, so the caller
+     * keeps access to `audioSamples` after the call.
+     *
+     * @param audioSamples Float32Array of audio samples at 16kHz, [-1, 1] range
+     * @returns Transcription result with text, emotion, language, and event
+     */
+    transcribe(audioSamples: Float32Array): Promise<SenseVoiceResult>;
+    /**
+     * Queue inference to serialize worker calls
+     */
+    private queueInference;
+    /**
+     * Dispose of the worker and free resources
+     */
+    dispose(): Promise<void>;
+    /**
+     * Check if Web Workers are supported
+     *
+     * @returns `true` if Web Workers are supported, `false` otherwise
+     */
+    static isSupported(): boolean;
+}
 /**
- * Shared blendshape constants and utilities for lip sync inference
- *
- * Contains LAM_BLENDSHAPES (canonical ordering), symmetrization, and
- * index remapping used by both Wav2Vec2Inference and Wav2ArkitCpuInference.
- *
- * This module is the single source of truth for blendshape ordering to
- * avoid circular dependencies between inference classes.
- *
- * @category Inference
- */
-/**
- * LAM model blendshape names in order (52 total)
- * NOTE: This is alphabetical ordering used by LAM, different from standard ARKit order
- */
-declare const LAM_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
-/** Alias for backwards compatibility */
-declare const ARKIT_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
-/**
- * Symmetrize blendshapes by averaging left/right pairs
- * From LAM official postprocessing (models/utils.py)
- * This fixes asymmetric output from the raw model
- */
-declare function symmetrizeBlendshapes(frame: Float32Array): Float32Array;
-/**
- * wav2arkit_cpu model blendshape ordering
- *
- * Indices 0-24 match LAM_BLENDSHAPES, but 25+ diverge:
- * - LAM puts jawRight, mouthClose, mouthDimpleLeft, mouthDimpleRight at 25-28
- * - wav2arkit_cpu puts mouthFrownLeft at 25 and moves those four to 48-51
- */
-declare const WAV2ARKIT_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "jawRight"];
-/**
- * Remap a blendshape frame from wav2arkit_cpu ordering to LAM_BLENDSHAPES ordering
+ * Silero VAD (Voice Activity Detection) inference
  *
- * @param frame - Float32Array of 52 blendshape values in wav2arkit_cpu order
- * @returns Float32Array of 52 blendshape values in LAM_BLENDSHAPES order
- */
-declare function remapWav2ArkitToLam(frame: Float32Array): Float32Array;
-/**
- * Unified Wav2Vec2 inference engine for Audio-to-Expression + ASR
+ * Neural network-based VAD running in browser via ONNX Runtime Web.
+ * Much more accurate than RMS-based energy detection.
  *
- * Runs entirely in the browser using WebGPU or WASM.
- * Takes raw 16kHz audio and outputs:
- * - 52 ARKit blendshapes (lip sync)
- * - 32-token CTC logits (speech recognition)
+ * Uses lazy loading to conditionally load WebGPU or WASM-only bundle:
+ * - iOS: Loads WASM-only bundle (WebGPU crashes due to Safari bugs)
+ * - Android/Desktop: Loads WebGPU bundle (with WASM fallback)
  *
  * @category Inference
  *
  * @example Basic usage
  * ```typescript
- * import { Wav2Vec2Inference } from '@omote/core';
+ * import { SileroVADInference } from '@omote/core';
  *
- * const wav2vec = new Wav2Vec2Inference({ modelUrl: '/models/unified_wav2vec2_asr_a2e.onnx' });
- * await wav2vec.load();
+ * const vad = new SileroVADInference({
+ *   modelUrl: '/models/silero-vad.onnx'
+ * });
+ * await vad.load();
  *
- * // Process 1 second of audio (16kHz = 16000 samples)
- * const result = await wav2vec.infer(audioSamples);
+ * // Process 32ms chunks (512 samples at 16kHz)
+ * const probability = await vad.process(audioChunk);
+ * if (probability > 0.5) {
+ *   console.log('Speech detected!');
+ * }
+ * ```
  *
- * console.log('Blendshapes:', result.blendshapes); // [30, 52] for 30fps
- * console.log('ASR text:', result.text); // Decoded transcription
+ * @example Streaming with state management
+ * ```typescript
+ * // State is automatically maintained between process() calls
+ * // Call reset() when starting a new audio stream
+ * vad.reset();
+ *
+ * for (const chunk of audioChunks) {
+ *   const prob = await vad.process(chunk);
+ *   // prob is speech probability [0, 1]
+ * }
  * ```
  */
-type InferenceBackend = BackendPreference;
-interface Wav2Vec2InferenceConfig {
+type VADBackend = BackendPreference;
+/**
+ * Configuration for Silero VAD
+ */
+interface SileroVADConfig {
     /** Path or URL to the ONNX model */
     modelUrl: string;
-    /**
-     * Path or URL to external model data file (.onnx.data weights).
-     * Default: `${modelUrl}.data` (e.g., /models/model.onnx.data)
-     *
-     * Set to `false` to skip external data loading (single-file models only).
-     */
-    externalDataUrl?: string | false;
     /** Preferred backend (auto will try WebGPU first, fallback to WASM) */
-    backend?: InferenceBackend;
-    /** Number of identity classes (default: 12 for streaming model) */
-    numIdentityClasses?: number;
+    backend?: VADBackend;
+    /** Sample rate (8000 or 16000, default: 16000) */
+    sampleRate?: 8000 | 16000;
+    /** Speech probability threshold (default: 0.5) */
+    threshold?: number;
+    /**
+     * Number of audio chunks to keep in pre-speech buffer.
+     * When VAD triggers, these chunks are prepended to the speech buffer
+     * to capture the beginning of speech that occurred before detection.
+     *
+     * At 512 samples/chunk and 16kHz:
+     * - 10 chunks = 320ms of pre-speech audio
+     * - 15 chunks = 480ms of pre-speech audio
+     *
+     * Default: 10 chunks (320ms)
+     */
+    preSpeechBufferChunks?: number;
 }
-interface ModelInfo {
+/**
+ * VAD model loading information
+ */
+interface VADModelInfo {
     backend: 'webgpu' | 'wasm';
     loadTimeMs: number;
     inputNames: string[];
     outputNames: string[];
+    sampleRate: number;
+    chunkSize: number;
 }
-/** CTC vocabulary (32 tokens from wav2vec2-base-960h) */
-declare const CTC_VOCAB: string[];
-interface Wav2Vec2Result {
-    /** Blendshape weights [frames, 52] - 30fps */
-    blendshapes: Float32Array[];
-    /** Raw CTC logits [frames, 32] - 50fps */
-    asrLogits: Float32Array[];
-    /** Decoded text from CTC */
-    text: string;
-    /** Number of blendshape frames (30fps) — alias for numA2EFrames */
-    numFrames: number;
-    /** Number of A2E frames (30fps) */
-    numA2EFrames: number;
-    /** Number of ASR frames (50fps) */
-    numASRFrames: number;
-    /** Inference time in ms */
+/**
+ * Result from a single VAD inference
+ */
+interface VADResult {
+    /** Speech probability (0-1) */
+    probability: number;
+    /** Whether speech is detected (probability > threshold) */
+    isSpeech: boolean;
+    /** Inference time in milliseconds */
     inferenceTimeMs: number;
+    /**
+     * Pre-speech audio chunks (only present on first speech detection).
+     * These are the N chunks immediately before VAD triggered, useful for
+     * capturing the beginning of speech that occurred before detection.
+     *
+     * Only populated when transitioning from silence to speech.
+     */
+    preSpeechChunks?: Float32Array[];
 }
-declare class Wav2Vec2Inference implements LipSyncBackend {
-    readonly modelId: "wav2vec2";
+/**
+ * Speech segment detected by VAD
+ *
+ * Element type of the array returned by `SileroVADInference.detectSpeech()`.
+ * NOTE(review): `start`/`end` are presumably measured from the beginning of
+ * the audio buffer passed to `detectSpeech()` — confirm.
+ */
+interface SpeechSegment {
+    /** Start time in seconds */
+    start: number;
+    /** End time in seconds */
+    end: number;
+    /** Average probability during segment (presumably speech probability in [0, 1] — confirm) */
+    avgProbability: number;
+}
+/**
+ * Silero VAD - Neural network voice activity detection
+ *
+ * Based on snakers4/silero-vad ONNX model.
+ * Processes 32ms chunks (512 samples at 16kHz) with LSTM state.
+ *
+ * @see https://github.com/snakers4/silero-vad
+ */
+declare class SileroVADInference {
     private session;
     private ort;
     private config;
     private _backend;
     private isLoading;
-    private numIdentityClasses;
+    private state;
+    private context;
+    private readonly chunkSize;
+    private readonly contextSize;
     private inferenceQueue;
-    private poisoned;
-    private static readonly INFERENCE_TIMEOUT_MS;
-    constructor(config: Wav2Vec2InferenceConfig);
+    private preSpeechBuffer;
+    private wasSpeaking;
+    private srTensor;
+    constructor(config: SileroVADConfig);
+    get backend(): RuntimeBackend | null;
+    get isLoaded(): boolean;
+    get sampleRate(): number;
+    get threshold(): number;
+    /**
+     * Get required chunk size in samples
+     */
+    getChunkSize(): number;
+    /**
+     * Get chunk duration in milliseconds
+     */
+    getChunkDurationMs(): number;
+    /**
+     * Check if WebGPU is available and working
+     * (iOS returns false even if navigator.gpu exists due to ONNX Runtime bugs)
+     */
+    static isWebGPUAvailable: typeof isWebGPUAvailable;
+    /**
+     * Load the ONNX model
+     */
+    load(): Promise<VADModelInfo>;
+    /**
+     * Reset state for new audio stream
+     */
+    reset(): void;
+    /**
+     * Process a single audio chunk
+     *
+     * @param audioChunk - Float32Array of exactly chunkSize samples (512 for 16kHz, 256 for 8kHz)
+     * @returns VAD result with speech probability
+     */
+    process(audioChunk: Float32Array): Promise<VADResult>;
+    /**
+     * Process audio and detect speech segments
+     *
+     * @param audio - Complete audio buffer
+     * @param options - Detection options
+     * @returns Array of speech segments
+     */
+    detectSpeech(audio: Float32Array, options?: {
+        /** Minimum speech duration in ms (default: 250) */
+        minSpeechDurationMs?: number;
+        /** Minimum silence duration to end segment in ms (default: 300) */
+        minSilenceDurationMs?: number;
+        /** Padding to add before/after speech in ms (default: 30) */
+        speechPadMs?: number;
+    }): Promise<SpeechSegment[]>;
+    /**
+     * Queue inference to serialize ONNX session calls
+     */
+    private queueInference;
+    /**
+     * Dispose of the model and free resources
+     */
+    dispose(): Promise<void>;
+}
+/**
+ * Silero VAD Web Worker implementation
+ *
+ * Runs Silero VAD inference in a dedicated Web Worker to prevent main thread blocking.
+ * Uses inline worker script (Blob URL pattern) to avoid separate file deployment.
+ *
+ * Key design decisions:
+ * - WASM backend only (WebGPU doesn't work in Workers)
+ * - LSTM state serialized as Float32Array (Tensors can't cross worker boundary)
+ * - Audio copied (not transferred) to retain main thread access for pre-speech buffer
+ * - ONNX Runtime loaded from CDN in worker (no bundler complications)
+ *
+ * @category Inference
+ *
+ * @example Basic usage
+ * ```typescript
+ * import { SileroVADWorker } from '@omote/core';
+ *
+ * const vad = new SileroVADWorker({
+ *   modelUrl: '/models/silero-vad.onnx'
+ * });
+ * await vad.load();
+ *
+ * // Process 32ms chunks (512 samples at 16kHz)
+ * const result = await vad.process(audioChunk);
+ * if (result.isSpeech) {
+ *   console.log('Speech detected!', result.probability);
+ * }
+ * ```
+ */
+/**
+ * Configuration for Silero VAD Worker
+ *
+ * Same options as SileroVADConfig except that there is no `backend` option:
+ * the worker implementation is WASM-only.
+ */
+interface VADWorkerConfig {
+    /** Path or URL to the ONNX model */
+    modelUrl: string;
+    /** Sample rate (8000 or 16000, default: 16000) */
+    sampleRate?: 8000 | 16000;
+    /**
+     * Speech probability threshold (default: 0.5).
+     * `VADResult.isSpeech` is true when probability > threshold.
+     */
+    threshold?: number;
+    /**
+     * Number of audio chunks to keep in pre-speech buffer.
+     * When VAD triggers, these chunks are prepended to the speech buffer
+     * to capture the beginning of speech that occurred before detection.
+     *
+     * At 512 samples/chunk and 16kHz:
+     * - 10 chunks = 320ms of pre-speech audio
+     * - 15 chunks = 480ms of pre-speech audio
+     *
+     * Default: 10 chunks (320ms)
+     */
+    preSpeechBufferChunks?: number;
+}
+/**
+ * VAD model loading information from worker
+ *
+ * Fields:
+ * - `backend`: always 'wasm' (the worker implementation is WASM-only)
+ * - `loadTimeMs`: presumably the model load duration in milliseconds — confirm
+ * - `inputNames` / `outputNames`: presumably the ONNX model's input/output names — confirm
+ * - `sampleRate`: audio sample rate in Hz (8000 or 16000)
+ * - `chunkSize`: samples expected per `process()` call (512 for 16kHz, 256 for 8kHz)
+ */
+interface VADWorkerModelInfo {
+    backend: 'wasm';
+    loadTimeMs: number;
+    inputNames: string[];
+    outputNames: string[];
+    sampleRate: number;
+    chunkSize: number;
+}
+/**
+ * Silero VAD Worker - Voice Activity Detection in a Web Worker
+ *
+ * Runs Silero VAD inference off the main thread to prevent UI blocking.
+ * Feature parity with SileroVADInference but runs in dedicated worker.
+ *
+ * API differences from SileroVADInference visible in the declarations:
+ * - `reset()` returns a Promise here (it is synchronous on the main-thread class)
+ * - `backend` is 'wasm' or null, never 'webgpu'
+ * - no `detectSpeech()` method is declared on this class
+ *
+ * @see SileroVADInference for main-thread version
+ */
+declare class SileroVADWorker {
+    private worker;
+    private config;
+    private isLoading;
+    private _isLoaded;
+    private state;
+    private context;
+    private readonly chunkSize;
+    private readonly contextSize;
+    private inferenceQueue;
+    private preSpeechBuffer;
+    private wasSpeaking;
+    private pendingResolvers;
+    private messageId;
+    constructor(config: VADWorkerConfig);
+    get isLoaded(): boolean;
+    /**
+     * Backend type (always 'wasm' for Worker, WebGPU not supported in Workers);
+     * null if not loaded.
+     */
+    get backend(): 'wasm' | null;
+    get sampleRate(): number;
+    get threshold(): number;
+    /**
+     * Get required chunk size in samples (512 for 16kHz, 256 for 8kHz)
+     */
+    getChunkSize(): number;
+    /**
+     * Get chunk duration in milliseconds (32ms for 512 samples at 16kHz)
+     */
+    getChunkDurationMs(): number;
+    /**
+     * Create the worker from inline script
+     * (Blob URL pattern, so no separate worker file has to be deployed).
+     */
+    private createWorker;
+    /**
+     * Handle messages from worker
+     */
+    private handleWorkerMessage;
+    /**
+     * Send message to worker and wait for response
+     */
+    private sendMessage;
+    /**
+     * Load the ONNX model in the worker
+     *
+     * @returns Promise resolving to the worker's model loading information
+     */
+    load(): Promise<VADWorkerModelInfo>;
+    /**
+     * Reset state for new audio stream
+     *
+     * Asynchronous, unlike SileroVADInference.reset(): await the returned
+     * Promise before processing chunks of the new stream.
+     */
+    reset(): Promise<void>;
+    /**
+     * Process a single audio chunk
+     *
+     * The chunk is copied (not transferred) to the worker, so the main thread
+     * retains access to it for the pre-speech buffer.
+     *
+     * @param audioChunk - Float32Array of exactly chunkSize samples (512 for 16kHz, 256 for 8kHz)
+     * @returns VAD result with speech probability
+     */
+    process(audioChunk: Float32Array): Promise<VADResult>;
+    /**
+     * Queue inference to serialize worker calls
+     */
+    private queueInference;
+    /**
+     * Dispose of the worker and free resources
+     */
+    dispose(): Promise<void>;
+    /**
+     * Check if Web Workers are supported
+     *
+     * @returns `true` if Web Workers are supported, `false` otherwise
+     */
+    static isSupported(): boolean;
+}
+/**
+ * Factory function for Silero VAD with automatic Worker vs main thread selection
+ *
+ * Provides a unified API that automatically selects the optimal implementation:
+ * - Desktop browsers: Uses SileroVADWorker (off-main-thread inference)
+ * - Mobile devices: Uses SileroVADInference (main thread, avoids memory overhead)
+ * - Fallback: Gracefully falls back to main thread if Worker fails
+ *
+ * @category Inference
+ *
+ * @example Basic usage (auto-detect)
+ * ```typescript
+ * import { createSileroVAD } from '@omote/core';
+ *
+ * const vad = createSileroVAD({
+ *   modelUrl: '/models/silero-vad.onnx',
+ *   threshold: 0.5,
+ * });
+ *
+ * await vad.load();
+ * const result = await vad.process(audioChunk);
+ * if (result.isSpeech) {
+ *   console.log('Speech detected!', result.probability);
+ * }
+ * ```
+ *
+ * @example Force worker usage
+ * ```typescript
+ * const vad = createSileroVAD({
+ *   modelUrl: '/models/silero-vad.onnx',
+ *   useWorker: true, // Force Worker even on mobile
+ * });
+ * ```
+ *
+ * @example Force main thread
+ * ```typescript
+ * const vad = createSileroVAD({
+ *   modelUrl: '/models/silero-vad.onnx',
+ *   useWorker: false, // Force main thread
+ * });
+ * ```
+ */
+/**
+ * Common interface for both SileroVADInference and SileroVADWorker
+ *
+ * This interface defines the shared API that both implementations provide,
+ * allowing consumers to use either interchangeably.
+ */
+interface SileroVADBackend {
+    /** Current backend type (webgpu, wasm, or null if not loaded) */
+    readonly backend: RuntimeBackend | null;
+    /** Whether the model is loaded and ready for inference */
+    readonly isLoaded: boolean;
+    /** Audio sample rate (8000 or 16000 Hz) */
+    readonly sampleRate: number;
+    /** Speech detection threshold (0-1) */
+    readonly threshold: number;
     /**
-     * Check if WebGPU is available and working
-     * (iOS returns false even if navigator.gpu exists due to ONNX Runtime bugs)
+     * Load the ONNX model
+     * @returns Model loading information
      */
-    static isWebGPUAvailable: typeof isWebGPUAvailable;
-    get backend(): 'webgpu' | 'wasm' | null;
-    get isLoaded(): boolean;
-    /** True if inference timed out and the session is permanently unusable */
-    get isSessionPoisoned(): boolean;
+    load(): Promise<VADModelInfo | VADWorkerModelInfo>;
     /**
-     * Load the ONNX model
+     * Process a single audio chunk
+     * @param audioChunk - Float32Array of exactly chunkSize samples
+     * @returns VAD result with speech probability
      */
-    load(): Promise<ModelInfo>;
+    process(audioChunk: Float32Array): Promise<VADResult>;
     /**
-     * Run inference on raw audio
-     * @param audioSamples - Float32Array of raw audio at 16kHz (16000 samples = 1 second)
-     * @param identityIndex - Optional identity index (0-11, default 0 = neutral)
-     *
-     * Note: Model expects 1-second chunks (16000 samples) for optimal performance.
-     * Audio will be zero-padded or truncated to 16000 samples.
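+     * Reset state for new audio stream.
+     * Synchronous on SileroVADInference, asynchronous on SileroVADWorker;
+     * `await` the result when the concrete implementation is not known.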
      */
-    infer(audioSamples: Float32Array, identityIndex?: number): Promise<Wav2Vec2Result>;
+    reset(): void | Promise<void>;
     /**
-     * Decode CTC logits to text using greedy decoding
+     * Dispose of the model and free resources
      */
-    private decodeCTC;
+    dispose(): Promise<void>;
     /**
-     * Queue inference to serialize ONNX session calls
+     * Get required chunk size in samples
      */
-    private queueInference;
+    getChunkSize(): number;
     /**
-     * Get blendshape value by name for a specific frame
+     * Get chunk duration in milliseconds
      */
-    getBlendshape(blendshapes: Float32Array, name: typeof LAM_BLENDSHAPES[number]): number;
+    getChunkDurationMs(): number;
+}
+/**
+ * Configuration for the Silero VAD factory
+ *
+ * Extends SileroVADConfig with worker-specific options.
+ */
+interface SileroVADFactoryConfig extends SileroVADConfig {
     /**
-     * Dispose of the model and free resources
+     * Force worker usage (true), main thread (false), or auto-detect (undefined).
+     *
+     * Auto-detection behavior:
+     * - Desktop: Uses Worker (better responsiveness, off-main-thread)
+     * - Mobile: Uses main thread (avoids 5MB memory overhead)
+     *
+     * You can override this to:
+     * - `true`: Force Worker even on mobile (if you have memory headroom)
+     * - `false`: Force main thread even on desktop (for debugging)
+     *
+     * Default: undefined (auto-detect)
      */
-    dispose(): Promise<void>;
+    useWorker?: boolean;
+    /**
+     * Fallback to main thread on worker errors.
+     *
+     * When true (default), if the Worker fails to load or encounters an error,
+     * the factory will automatically create a main thread instance instead.
+     *
+     * When false, worker errors will propagate as exceptions.
+     *
+     * Default: true
+     */
+    fallbackOnError?: boolean;
+    /**
+     * Unified inference worker instance.
+     * When provided, uses SileroVADUnifiedAdapter (shared single-ORT worker).
+     * Takes precedence over useWorker setting.
+     */
+    unifiedWorker?: UnifiedInferenceWorker;
 }
 /**
- * CPU-optimized lip sync inference using wav2arkit_cpu model
+ * Check if the current environment supports VAD Web Workers
  *
- * A Safari/iOS-compatible alternative to Wav2Vec2Inference (384MB) designed
- * for platforms where WebGPU crashes due to ONNX Runtime JSEP bugs.
+ * Requirements:
+ * - Worker constructor must exist
+ * - Blob URL support (for inline worker script)
  *
- * The model uses ONNX external data format:
- * - wav2arkit_cpu.onnx (1.86MB graph structure)
- * - wav2arkit_cpu.onnx.data (402MB weights)
- * Both files are fetched and cached automatically.
+ * @returns true if VAD Worker is supported
+ */
+declare function supportsVADWorker(): boolean;
+/**
+ * Create a Silero VAD instance with automatic implementation selection
  *
- * Key differences from Wav2Vec2Inference:
- * - WASM-only backend (CPU-optimized, single-threaded, no WebGPU)
- * - No identity input (baked to identity 11)
- * - No ASR output (lip sync only)
- * - Dynamic input length (not fixed to 16000 samples)
- * - Different native blendshape ordering (remapped to LAM_BLENDSHAPES)
+ * This factory function automatically selects between:
+ * - **SileroVADWorker**: Off-main-thread inference (better for desktop)
+ * - **SileroVADInference**: Main thread inference (better for mobile)
+ *
+ * The selection is based on:
+ * 1. Explicit `useWorker` config (if provided)
+ * 2. Platform detection (mobile vs desktop)
+ * 3. Worker API availability
+ *
+ * Both implementations share the same interface (SileroVADBackend),
+ * so consumers can use either interchangeably.
+ *
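+ * @param config - Factory configuration (see SileroVADFactoryConfig)
+ * @returns A SileroVADBackend instance: Worker-based or main-thread, or the
+ *   unified-worker adapter when `config.unifiedWorker` is provided (that option
+ *   takes precedence over `useWorker`)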
+ *
+ * @example
+ * ```typescript
+ * // Auto-detect (recommended)
+ * const vad = createSileroVAD({ modelUrl: '/models/silero-vad.onnx' });
+ *
+ * // Force Worker
+ * const vadWorker = createSileroVAD({ modelUrl: '/models/silero-vad.onnx', useWorker: true });
+ *
+ * // Force main thread
+ * const vadMain = createSileroVAD({ modelUrl: '/models/silero-vad.onnx', useWorker: false });
+ * ```
+ */
+declare function createSileroVAD(config: SileroVADFactoryConfig): SileroVADBackend;
+/**
+ * Web Worker-based wav2arkit_cpu lip sync inference
+ *
+ * Runs wav2arkit_cpu inference in a dedicated Web Worker to prevent main thread blocking.
+ * Uses inline worker script (Blob URL pattern) to avoid separate file deployment.
+ *
+ * Key design decisions:
+ * - WASM backend only (WebGPU doesn't work in Workers)
+ * - Audio copied (not transferred) to retain main thread access
+ * - ONNX Runtime loaded from CDN in worker (no bundler complications)
+ * - Blendshape symmetrization inlined in worker (no module imports)
+ * - iOS: passes model URLs as strings directly to ORT (avoids 400MB+ JS heap)
  *
  * @category Inference
  *
  * @example
  * ```typescript
- * import { Wav2ArkitCpuInference } from '@omote/core';
+ * import { Wav2ArkitCpuWorker } from '@omote/core';
  *
- * const lam = new Wav2ArkitCpuInference({
+ * const lam = new Wav2ArkitCpuWorker({
  *   modelUrl: '/models/wav2arkit_cpu.onnx',
  * });
  * await lam.load();
@@ -1736,7 +2072,10 @@ declare class Wav2Vec2Inference implements LipSyncBackend {
  * ```
  */
-interface Wav2ArkitCpuConfig {
+/**
+ * Configuration for Wav2ArkitCpu Worker
+ */
+interface Wav2ArkitCpuWorkerConfig {
     /** Path or URL to the wav2arkit_cpu ONNX model (.onnx graph file) */
     modelUrl: string;
     /**
@@ -1746,24 +2085,44 @@ interface Wav2ArkitCpuConfig {
      * Set to `false` to skip external data loading (single-file models only).
      */
     externalDataUrl?: string | false;
-    /** Preferred backend (default: 'wasm' — this model is CPU-optimized) */
-    backend?: BackendPreference;
 }
-declare class Wav2ArkitCpuInference implements LipSyncBackend {
+/**
+ * Wav2ArkitCpu Worker - Lip sync inference in a Web Worker
+ *
+ * Runs wav2arkit_cpu inference off the main thread to prevent UI blocking.
+ * Feature parity with Wav2ArkitCpuInference but runs in dedicated worker.
+ *
+ * @see Wav2ArkitCpuInference for main-thread version
+ */
+declare class Wav2ArkitCpuWorker implements LipSyncBackend {
     readonly modelId: "wav2arkit_cpu";
-    private session;
-    private ort;
+    private worker;
     private config;
-    private _backend;
     private isLoading;
+    private _isLoaded;
     private inferenceQueue;
     private poisoned;
-    private static readonly INFERENCE_TIMEOUT_MS;
-    constructor(config: Wav2ArkitCpuConfig);
-    get backend(): RuntimeBackend | null;
+    private pendingResolvers;
+    constructor(config: Wav2ArkitCpuWorkerConfig);
     get isLoaded(): boolean;
     /**
-     * Load the ONNX model
+     * Backend type (always 'wasm' for Worker, WebGPU not supported in Workers)
+     */
+    get backend(): 'wasm' | null;
+    /**
+     * Create the worker from inline script
+     */
+    private createWorker;
+    /**
+     * Handle messages from worker
+     */
+    private handleWorkerMessage;
+    /**
+     * Send message to worker and wait for response
+     */
+    private sendMessage;
+    /**
+     * Load the ONNX model in the worker
      */
     load(): Promise<LipSyncModelInfo>;
     /**
@@ -1777,280 +2136,524 @@ declare class Wav2ArkitCpuInference implements LipSyncBackend {
      */
     infer(audioSamples: Float32Array, _identityIndex?: number): Promise<LipSyncResult>;
     /**
-     * Queue inference to serialize ONNX session calls
+     * Queue inference to serialize worker calls
      */
     private queueInference;
     /**
-     * Dispose of the model and free resources
+     * Dispose of the worker and free resources
+     */
+    dispose(): Promise<void>;
+    /**
+     * Check if Web Workers are supported
      */
+    static isSupported(): boolean;
+}
+/**
+ * Unified Inference Worker — single Web Worker hosting all WASM models
+ *
+ * Solves the multi-worker ORT problem: three per-model workers each load their
+ * own ORT WASM instance (~40MB each). On iOS this exceeds the ~1-1.5GB tab
+ * limit, forcing main-thread fallback which blocks the render loop.
+ *
+ * This worker hosts SenseVoice + Wav2ArkitCpu + Silero VAD in a single
+ * ORT WASM instance. Same total model memory (~643MB), but inference runs
+ * off-main-thread. Works on iOS because there's only one ORT instance.
+ *
+ * Consumer usage:
+ * ```typescript
+ * const worker = new UnifiedInferenceWorker();
+ * await worker.init();
+ *
+ * const asr = createSenseVoice({ modelUrl: '...', unifiedWorker: worker });
+ * const lam = createLipSync({ gpuModelUrl: '...', cpuModelUrl: '...', unifiedWorker: worker });
+ * const vad = createSileroVAD({ modelUrl: '...', unifiedWorker: worker });
+ * ```
+ *
+ * @category Inference
+ */
+/**
+ * Unified Inference Worker — single Web Worker for all WASM models
+ *
+ * Hosts SenseVoice, Wav2ArkitCpu, and Silero VAD in one ORT instance.
+ * Eliminates the multi-worker memory problem on iOS.
+ */
+declare class UnifiedInferenceWorker {
+    private worker;
+    private pendingRequests;
+    private initialized;
+    private poisoned;
+    /**
+     * Initialize the worker (load ORT WASM from CDN)
+     */
+    init(): Promise<void>;
+    loadSenseVoice(config: {
+        modelUrl: string;
+        tokensUrl: string;
+        language: number;
+        textNorm: number;
+    }): Promise<SenseVoiceModelInfo>;
+    transcribe(audio: Float32Array): Promise<SenseVoiceResult>;
+    disposeSenseVoice(): Promise<void>;
+    loadLipSync(config: {
+        modelUrl: string;
+        externalDataUrl: string | null;
+    }): Promise<LipSyncModelInfo>;
+    inferLipSync(audio: Float32Array): Promise<{
+        blendshapes: Float32Array;
+        numFrames: number;
+        numBlendshapes: number;
+        inferenceTimeMs: number;
+    }>;
+    disposeLipSync(): Promise<void>;
+    loadVAD(config: {
+        modelUrl: string;
+        sampleRate: number;
+    }): Promise<VADWorkerModelInfo>;
+    processVAD(audio: Float32Array, state: Float32Array, context: Float32Array): Promise<{
+        probability: number;
+        state: Float32Array;
+        inferenceTimeMs: number;
+    }>;
+    resetVAD(): Promise<Float32Array>;
+    disposeVAD(): Promise<void>;
+    dispose(): Promise<void>;
+    /** Check if the worker is initialized and not poisoned */
+    get isReady(): boolean;
+    /** Check if Web Workers are supported */
+    static isSupported(): boolean;
+    private assertReady;
+    private createWorker;
+    private handleWorkerMessage;
+    private sendMessage;
+    private rejectAllPending;
+    private cleanup;
+}
+/**
+ * SenseVoice adapter backed by UnifiedInferenceWorker
+ *
+ * Implements SenseVoiceBackend, delegating all inference to the shared worker.
+ */
+declare class SenseVoiceUnifiedAdapter implements SenseVoiceBackend {
+    private worker;
+    private config;
+    private _isLoaded;
+    private languageId;
+    private textNormId;
+    private inferenceQueue;
+    constructor(worker: UnifiedInferenceWorker, config: SenseVoiceWorkerConfig);
+    get isLoaded(): boolean;
+    get backend(): 'wasm' | null;
+    load(onProgress?: (loaded: number, total: number) => void): Promise<SenseVoiceModelInfo>;
+    transcribe(audioSamples: Float32Array): Promise<SenseVoiceResult>;
+    dispose(): Promise<void>;
+}
+/**
+ * Wav2ArkitCpu adapter backed by UnifiedInferenceWorker
+ *
+ * Implements LipSyncBackend, delegating all inference to the shared worker.
+ */
+declare class Wav2ArkitCpuUnifiedAdapter implements LipSyncBackend {
+    readonly modelId: "wav2arkit_cpu";
+    private worker;
+    private config;
+    private _isLoaded;
+    private inferenceQueue;
+    constructor(worker: UnifiedInferenceWorker, config: Wav2ArkitCpuWorkerConfig);
+    get isLoaded(): boolean;
+    get backend(): RuntimeBackend | null;
+    load(): Promise<LipSyncModelInfo>;
+    infer(audioSamples: Float32Array, _identityIndex?: number): Promise<LipSyncResult>;
+    dispose(): Promise<void>;
+}
+/**
+ * Silero VAD adapter backed by UnifiedInferenceWorker
+ *
+ * Implements SileroVADBackend, delegating all inference to the shared worker.
+ */
+declare class SileroVADUnifiedAdapter implements SileroVADBackend {
+    private worker;
+    private config;
+    private _isLoaded;
+    private state;
+    private context;
+    private readonly chunkSize;
+    private readonly contextSize;
+    private inferenceQueue;
+    private preSpeechBuffer;
+    private wasSpeaking;
+    constructor(worker: UnifiedInferenceWorker, config: SileroVADConfig);
+    get isLoaded(): boolean;
+    get backend(): RuntimeBackend | null;
+    get sampleRate(): number;
+    get threshold(): number;
+    getChunkSize(): number;
+    getChunkDurationMs(): number;
+    load(): Promise<VADWorkerModelInfo>;
+    process(audioChunk: Float32Array): Promise<VADResult>;
+    reset(): Promise<void>;
     dispose(): Promise<void>;
 }
 /**
- * Factory function for lip sync with automatic GPU/CPU model selection
- *
- * Provides a unified API that automatically selects the optimal model:
- * - Safari (macOS + iOS): Uses Wav2ArkitCpuInference (404MB, WASM)
- * - Chrome/Firefox/Edge: Uses Wav2Vec2Inference (384MB, WebGPU)
- * - Fallback: Gracefully falls back to CPU model if GPU model fails to load
+ * Factory function for SenseVoice ASR with automatic Worker vs main thread selection
  *
- * Why two separate models?
- * Wav2Vec2 (LAM) cannot run on Safari/iOS for two reasons:
- * 1. Its dual-head transformer graph needs ~750-950MB peak during ORT session
- *    creation (graph optimization), exceeding iOS WebKit's ~1-1.5GB tab limit.
- * 2. It ships as a single 384MB .onnx file that must load into JS heap before
- *    ORT can consume it. iOS WebKit OOMs on this allocation.
- * wav2arkit_cpu solves both: external data format (1.86MB graph + 402MB weights)
- * lets ORT load only the tiny graph, then stream weights via URL pass-through
- * directly into WASM memory. JS heap stays at ~2MB.
+ * Provides a unified API that automatically selects the optimal implementation:
+ * - Worker supported: Uses SenseVoiceWorker (off-main-thread inference)
+ * - Worker unsupported: Uses SenseVoiceInference (main thread)
  *
  * @category Inference
  *
  * @example Auto-detect (recommended)
  * ```typescript
- * import { createLipSync } from '@omote/core';
+ * import { createSenseVoice } from '@omote/core';
  *
- * const lam = createLipSync({
- *   gpuModelUrl: '/models/unified_wav2vec2_asr_a2e.onnx',
- *   cpuModelUrl: '/models/wav2arkit_cpu.onnx',
+ * const asr = createSenseVoice({
+ *   modelUrl: '/models/sensevoice/model.int8.onnx',
  * });
+ * await asr.load();
+ * const { text, emotion } = await asr.transcribe(audioSamples);
+ * ```
  *
- * await lam.load();
- * const { blendshapes } = await lam.infer(audioSamples);
+ * @example Force worker
+ * ```typescript
+ * const asr = createSenseVoice({
+ *   modelUrl: '/models/sensevoice/model.int8.onnx',
+ *   useWorker: true,
+ * });
  * ```
  *
- * @example Force CPU model
+ * @example Force main thread
  * ```typescript
- * const lam = createLipSync({
- *   gpuModelUrl: '/models/unified_wav2vec2_asr_a2e.onnx',
- *   cpuModelUrl: '/models/wav2arkit_cpu.onnx',
- *   mode: 'cpu',
+ * const asr = createSenseVoice({
+ *   modelUrl: '/models/sensevoice/model.int8.onnx',
+ *   useWorker: false,
  * });
  * ```
  */
 /**
- * Configuration for the lip sync factory
+ * Common interface for both SenseVoiceInference and SenseVoiceWorker
  */
-interface CreateLipSyncConfig {
-    /** URL for the GPU model (Wav2Vec2, used on Chrome/Firefox/Edge) */
-    gpuModelUrl: string;
+interface SenseVoiceBackend {
+    /** Whether the model is loaded and ready for inference */
+    readonly isLoaded: boolean;
+    /** Current backend type ('wasm' for worker, full backend for main thread, null if not loaded) */
+    readonly backend: 'wasm' | 'webgpu' | null;
     /**
-     * URL for GPU model external data file (.onnx.data weights).
-     * Default: `${gpuModelUrl}.data`
-     *
-     * Set to `false` to skip external data loading (single-file models only).
+     * Load the ONNX model
+     * @param onProgress - Optional progress callback (fires once at 100% for worker)
+     * @returns Model loading information
      */
-    gpuExternalDataUrl?: string | false;
-    /** URL for the CPU model (wav2arkit_cpu, used on Safari/iOS) */
-    cpuModelUrl: string;
+    load(onProgress?: (loaded: number, total: number) => void): Promise<SenseVoiceModelInfo>;
     /**
-     * Model selection mode:
-     * - 'auto': Safari/iOS → CPU, everything else → GPU (default)
-     * - 'gpu': Force GPU model (Wav2Vec2Inference)
-     * - 'cpu': Force CPU model (Wav2ArkitCpuInference)
+     * Transcribe audio samples to text
+     * @param audioSamples - Float32Array of audio samples at 16kHz
+     * @returns Transcription result
      */
-    mode?: 'auto' | 'gpu' | 'cpu';
-    /** Backend preference for GPU model (default: 'auto') */
-    gpuBackend?: BackendPreference;
-    /** Number of identity classes for GPU model (default: 12) */
-    numIdentityClasses?: number;
+    transcribe(audioSamples: Float32Array): Promise<SenseVoiceResult>;
     /**
-     * Fall back to CPU model if GPU model fails to load (default: true)
-     * Only applies when mode is 'auto' or 'gpu'
+     * Dispose of the model and free resources
      */
-    fallbackOnError?: boolean;
+    dispose(): Promise<void>;
 }
 /**
- * Create a lip sync instance with automatic GPU/CPU model selection
+ * Configuration for the SenseVoice factory
+ */
+interface CreateSenseVoiceConfig {
+    /** Path or URL to model.int8.onnx (239MB) */
+    modelUrl: string;
+    /** Path or URL to tokens.txt vocabulary file (default: sibling of modelUrl) */
+    tokensUrl?: string;
+    /** Language hint (default: 'auto') */
+    language?: SenseVoiceLanguage;
+    /** Text normalization (default: 'with_itn') */
+    textNorm?: 'with_itn' | 'without_itn';
+    /**
+     * Worker mode:
+     * - 'auto' (default): Use Worker if supported, else main thread
+     * - true: Force Worker (throws if unsupported)
+     * - false: Force main thread
+     */
+    useWorker?: boolean | 'auto';
+    /**
+     * Unified inference worker instance.
+     * When provided, uses SenseVoiceUnifiedAdapter (shared single-ORT worker).
+     * Takes precedence over useWorker setting.
+     */
+    unifiedWorker?: UnifiedInferenceWorker;
+}
+/**
+ * Create a SenseVoice ASR instance with automatic implementation selection
  *
  * @param config - Factory configuration
- * @returns A LipSyncBackend instance (either GPU or CPU model)
+ * @returns A SenseVoiceBackend instance (either Worker or main thread)
  */
-declare function createLipSync(config: CreateLipSyncConfig): LipSyncBackend;
+declare function createSenseVoice(config: CreateSenseVoiceConfig): SenseVoiceBackend;
 /**
- * Silero VAD (Voice Activity Detection) inference
+ * Kaldi-compatible filterbank (fbank) feature extraction
  *
- * Neural network-based VAD running in browser via ONNX Runtime Web.
- * Much more accurate than RMS-based energy detection.
+ * Pure TypeScript implementation matching kaldi-native-fbank parameters
+ * used by SenseVoice. No external dependencies.
  *
- * Uses lazy loading to conditionally load WebGPU or WASM-only bundle:
- * - iOS: Loads WASM-only bundle (WebGPU crashes due to Safari bugs)
- * - Android/Desktop: Loads WebGPU bundle (with WASM fallback)
+ * Pipeline: audio → framing → windowing → FFT → power spectrum → mel filterbank → log
+ *
+ * @module inference/kaldiFbank
+ */
+interface KaldiFbankOptions {
+    /** Frame length in ms (default: 25) */
+    frameLengthMs?: number;
+    /** Frame shift in ms (default: 10) */
+    frameShiftMs?: number;
+    /** Low frequency cutoff in Hz (default: 20) */
+    lowFreq?: number;
+    /** High frequency cutoff in Hz (default: sampleRate / 2) */
+    highFreq?: number;
+    /** Dither amount (default: 0 for deterministic output) */
+    dither?: number;
+    /** Preemphasis coefficient (default: 0.97) */
+    preemphasis?: number;
+}
+/**
+ * Compute Kaldi-compatible log mel filterbank features
+ *
+ * @param audio Raw audio samples (float32, [-1, 1] range)
+ * @param sampleRate Sample rate in Hz (must be 16000 for SenseVoice)
+ * @param numMelBins Number of mel bins (80 for SenseVoice)
+ * @param opts Optional parameters
+ * @returns Flattened Float32Array of shape [numFrames, numMelBins]
+ */
+declare function computeKaldiFbank(audio: Float32Array, sampleRate: number, numMelBins: number, opts?: KaldiFbankOptions): Float32Array;
+/**
+ * Apply Low Frame Rate stacking for SenseVoice
+ *
+ * Concatenates lfrM consecutive frames with stride lfrN.
+ * Left-pads with copies of first frame, right-pads last group.
+ *
+ * @param features Flattened [numFrames, featureDim]
+ * @param featureDim Feature dimension per frame (e.g., 80)
+ * @param lfrM Number of frames to stack (default: 7)
+ * @param lfrN Stride (default: 6)
+ * @returns Flattened [numOutputFrames, featureDim * lfrM]
+ */
+declare function applyLFR(features: Float32Array, featureDim: number, lfrM?: number, lfrN?: number): Float32Array;
+/**
+ * Apply CMVN normalization in-place
+ *
+ * Formula: normalized[i] = (features[i] + negMean[i % dim]) * invStddev[i % dim]
+ *
+ * @param features Flattened feature array (modified in-place)
+ * @param dim Feature dimension (560 for SenseVoice after LFR)
+ * @param negMean Negative mean vector (dim-dimensional)
+ * @param invStddev Inverse standard deviation vector (dim-dimensional)
+ * @returns The same features array (for chaining)
+ */
+declare function applyCMVN(features: Float32Array, dim: number, negMean: Float32Array, invStddev: Float32Array): Float32Array;
+/**
+ * Parse CMVN vectors from comma-separated strings (stored in ONNX metadata)
+ *
+ * The sherpa-onnx SenseVoice export stores neg_mean and inv_stddev
+ * as comma-separated float strings in the model's metadata.
+ */
+declare function parseCMVNFromMetadata(negMeanStr: string, invStddevStr: string): {
+    negMean: Float32Array;
+    invStddev: Float32Array;
+};
+/**
+ * CTC greedy decoder for SenseVoice
+ *
+ * Decodes CTC logits into text with structured token parsing
+ * for language, emotion, and audio event detection.
+ *
+ * @module inference/ctcDecoder
+ */
+interface CTCDecodeResult {
+    /** Decoded text (speech content only) */
+    text: string;
+    /** Detected language (e.g., 'zh', 'en', 'ja', 'ko', 'yue') */
+    language?: string;
+    /** Detected emotion (e.g., 'HAPPY', 'SAD', 'ANGRY', 'NEUTRAL') */
+    emotion?: string;
+    /** Detected audio event (e.g., 'Speech', 'BGM', 'Laughter') */
+    event?: string;
+}
+/** Resolve language string to SenseVoice language ID */
+declare function resolveLanguageId(language: string): number;
+/** Resolve text norm string to SenseVoice text norm ID */
+declare function resolveTextNormId(textNorm: string): number;
+/**
+ * Parse tokens.txt into a token ID → string map
+ *
+ * Format: each line is "token_string token_id"
+ * e.g., "<unk> 0", "▁the 3", "s 4"
+ */
+declare function parseTokensFile(content: string): Map<number, string>;
+/**
+ * CTC greedy decode
+ *
+ * @param logits Raw logits from model output, flattened [seqLen, vocabSize]
+ * @param seqLen Sequence length (time steps)
+ * @param vocabSize Vocabulary size
+ * @param tokenMap Token ID → string map from tokens.txt
+ * @returns Decoded text and structured metadata
+ */
+declare function ctcGreedyDecode(logits: Float32Array, seqLen: number, vocabSize: number, tokenMap: Map<number, string>): CTCDecodeResult;
+/**
+ * Shared blendshape constants and utilities for lip sync inference
+ *
+ * Contains LAM_BLENDSHAPES (canonical ordering), symmetrization, and
+ * index remapping used by both Wav2Vec2Inference and Wav2ArkitCpuInference.
+ *
+ * This module is the single source of truth for blendshape ordering to
+ * avoid circular dependencies between inference classes.
+ *
+ * @category Inference
+ */
+/**
+ * LAM model blendshape names in order (52 total)
+ * NOTE: This is alphabetical ordering used by LAM, different from standard ARKit order
+ */
+declare const LAM_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
+/** Alias for backwards compatibility */
+declare const ARKIT_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
+/**
+ * Symmetrize blendshapes by averaging left/right pairs
+ * From LAM official postprocessing (models/utils.py)
+ * This fixes asymmetric output from the raw model
+ */
+declare function symmetrizeBlendshapes(frame: Float32Array): Float32Array;
+/**
+ * wav2arkit_cpu model blendshape ordering
+ *
+ * Indices 0-24 match LAM_BLENDSHAPES, but 25+ diverge:
+ * - LAM puts jawRight, mouthClose, mouthDimpleLeft, mouthDimpleRight at 25-28
+ * - wav2arkit_cpu puts mouthFrownLeft at 25 and moves those four to 48-51
+ */
+declare const WAV2ARKIT_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "jawRight"];
+/**
+ * Remap a blendshape frame from wav2arkit_cpu ordering to LAM_BLENDSHAPES ordering
+ *
+ * @param frame - Float32Array of 52 blendshape values in wav2arkit_cpu order
+ * @returns Float32Array of 52 blendshape values in LAM_BLENDSHAPES order
+ */
+declare function remapWav2ArkitToLam(frame: Float32Array): Float32Array;
+/**
+ * Unified Wav2Vec2 inference engine for Audio-to-Expression + ASR
+ *
+ * Runs entirely in the browser using WebGPU or WASM.
+ * Takes raw 16kHz audio and outputs:
+ * - 52 ARKit blendshapes (lip sync)
+ * - 32-token CTC logits (speech recognition)
  *
  * @category Inference
  *
  * @example Basic usage
  * ```typescript
- * import { SileroVADInference } from '@omote/core';
- *
- * const vad = new SileroVADInference({
- *   modelUrl: '/models/silero-vad.onnx'
- * });
- * await vad.load();
+ * import { Wav2Vec2Inference } from '@omote/core';
  *
- * // Process 32ms chunks (512 samples at 16kHz)
- * const probability = await vad.process(audioChunk);
- * if (probability > 0.5) {
- *   console.log('Speech detected!');
- * }
- * ```
+ * const wav2vec = new Wav2Vec2Inference({ modelUrl: '/models/unified_wav2vec2_asr_a2e.onnx' });
+ * await wav2vec.load();
  *
- * @example Streaming with state management
- * ```typescript
- * // State is automatically maintained between process() calls
- * // Call reset() when starting a new audio stream
- * vad.reset();
+ * // Process 1 second of audio (16kHz = 16000 samples)
+ * const result = await wav2vec.infer(audioSamples);
  *
- * for (const chunk of audioChunks) {
- *   const prob = await vad.process(chunk);
- *   // prob is speech probability [0, 1]
- * }
+ * console.log('Blendshapes:', result.blendshapes); // [30, 52] for 30fps
+ * console.log('ASR text:', result.text); // Decoded transcription
  * ```
  */
-type VADBackend = BackendPreference;
-/**
- * Configuration for Silero VAD
- */
-interface SileroVADConfig {
+type InferenceBackend = BackendPreference;
+interface Wav2Vec2InferenceConfig {
     /** Path or URL to the ONNX model */
     modelUrl: string;
-    /** Preferred backend (auto will try WebGPU first, fallback to WASM) */
-    backend?: VADBackend;
-    /** Sample rate (8000 or 16000, default: 16000) */
-    sampleRate?: 8000 | 16000;
-    /** Speech probability threshold (default: 0.5) */
-    threshold?: number;
     /**
-     * Number of audio chunks to keep in pre-speech buffer.
-     * When VAD triggers, these chunks are prepended to the speech buffer
-     * to capture the beginning of speech that occurred before detection.
-     *
-     * At 512 samples/chunk and 16kHz:
-     * - 10 chunks = 320ms of pre-speech audio
-     * - 15 chunks = 480ms of pre-speech audio
+     * Path or URL to external model data file (.onnx.data weights).
+     * Default: `${modelUrl}.data` (e.g., /models/model.onnx.data)
      *
-     * Default: 10 chunks (320ms)
+     * Set to `false` to skip external data loading (single-file models only).
      */
-    preSpeechBufferChunks?: number;
+    externalDataUrl?: string | false;
+    /** Preferred backend (auto will try WebGPU first, fallback to WASM) */
+    backend?: InferenceBackend;
+    /** Number of identity classes (default: 12 for streaming model) */
+    numIdentityClasses?: number;
 }
-/**
- * VAD model loading information
- */
-interface VADModelInfo {
+interface ModelInfo {
     backend: 'webgpu' | 'wasm';
     loadTimeMs: number;
     inputNames: string[];
     outputNames: string[];
-    sampleRate: number;
-    chunkSize: number;
 }
-/**
- * Result from a single VAD inference
- */
-interface VADResult {
-    /** Speech probability (0-1) */
-    probability: number;
-    /** Whether speech is detected (probability > threshold) */
-    isSpeech: boolean;
-    /** Inference time in milliseconds */
+/** CTC vocabulary (32 tokens from wav2vec2-base-960h) */
+declare const CTC_VOCAB: string[];
+interface Wav2Vec2Result {
+    /** Blendshape weights [frames, 52] - 30fps */
+    blendshapes: Float32Array[];
+    /** Raw CTC logits [frames, 32] - 50fps */
+    asrLogits: Float32Array[];
+    /** Decoded text from CTC */
+    text: string;
+    /** Number of blendshape frames (30fps) — alias for numA2EFrames */
+    numFrames: number;
+    /** Number of A2E frames (30fps) */
+    numA2EFrames: number;
+    /** Number of ASR frames (50fps) */
+    numASRFrames: number;
+    /** Inference time in ms */
     inferenceTimeMs: number;
-    /**
-     * Pre-speech audio chunks (only present on first speech detection).
-     * These are the N chunks immediately before VAD triggered, useful for
-     * capturing the beginning of speech that occurred before detection.
-     *
-     * Only populated when transitioning from silence to speech.
-     */
-    preSpeechChunks?: Float32Array[];
-}
-/**
- * Speech segment detected by VAD
- */
-interface SpeechSegment {
-    /** Start time in seconds */
-    start: number;
-    /** End time in seconds */
-    end: number;
-    /** Average probability during segment */
-    avgProbability: number;
 }
-/**
- * Silero VAD - Neural network voice activity detection
- *
- * Based on snakers4/silero-vad ONNX model.
- * Processes 32ms chunks (512 samples at 16kHz) with LSTM state.
- *
- * @see https://github.com/snakers4/silero-vad
- */
-declare class SileroVADInference {
+declare class Wav2Vec2Inference implements LipSyncBackend {
+    readonly modelId: "wav2vec2";
     private session;
     private ort;
     private config;
     private _backend;
     private isLoading;
-    private state;
-    private context;
-    private readonly chunkSize;
-    private readonly contextSize;
+    private numIdentityClasses;
     private inferenceQueue;
-    private preSpeechBuffer;
-    private wasSpeaking;
-    private srTensor;
-    constructor(config: SileroVADConfig);
-    get backend(): RuntimeBackend | null;
-    get isLoaded(): boolean;
-    get sampleRate(): number;
-    get threshold(): number;
-    /**
-     * Get required chunk size in samples
-     */
-    getChunkSize(): number;
-    /**
-     * Get chunk duration in milliseconds
-     */
-    getChunkDurationMs(): number;
+    private poisoned;
+    private static readonly INFERENCE_TIMEOUT_MS;
+    constructor(config: Wav2Vec2InferenceConfig);
     /**
      * Check if WebGPU is available and working
      * (iOS returns false even if navigator.gpu exists due to ONNX Runtime bugs)
      */
     static isWebGPUAvailable: typeof isWebGPUAvailable;
+    get backend(): 'webgpu' | 'wasm' | null;
+    get isLoaded(): boolean;
+    /** True if inference timed out and the session is permanently unusable */
+    get isSessionPoisoned(): boolean;
     /**
      * Load the ONNX model
      */
-    load(): Promise<VADModelInfo>;
-    /**
-     * Reset state for new audio stream
-     */
-    reset(): void;
+    load(): Promise<ModelInfo>;
     /**
-     * Process a single audio chunk
+     * Run inference on raw audio
+     * @param audioSamples - Float32Array of raw audio at 16kHz (16000 samples = 1 second)
+     * @param identityIndex - Optional identity index (0-11, default 0 = neutral)
      *
-     * @param audioChunk - Float32Array of exactly chunkSize samples (512 for 16kHz, 256 for 8kHz)
-     * @returns VAD result with speech probability
+     * Note: Model expects 1-second chunks (16000 samples) for optimal performance.
+     * Audio will be zero-padded or truncated to 16000 samples.
      */
-    process(audioChunk: Float32Array): Promise<VADResult>;
+    infer(audioSamples: Float32Array, identityIndex?: number): Promise<Wav2Vec2Result>;
     /**
-     * Process audio and detect speech segments
-     *
-     * @param audio - Complete audio buffer
-     * @param options - Detection options
-     * @returns Array of speech segments
+     * Decode CTC logits to text using greedy decoding
      */
-    detectSpeech(audio: Float32Array, options?: {
-        /** Minimum speech duration in ms (default: 250) */
-        minSpeechDurationMs?: number;
-        /** Minimum silence duration to end segment in ms (default: 300) */
-        minSilenceDurationMs?: number;
-        /** Padding to add before/after speech in ms (default: 30) */
-        speechPadMs?: number;
-    }): Promise<SpeechSegment[]>;
+    private decodeCTC;
     /**
      * Queue inference to serialize ONNX session calls
      */
     private queueInference;
+    /**
+     * Get blendshape value by name for a specific frame
+     */
+    getBlendshape(blendshapes: Float32Array, name: typeof LAM_BLENDSHAPES[number]): number;
     /**
      * Dispose of the model and free resources
      */
@@ -2058,309 +2661,189 @@ declare class SileroVADInference {
 }
 /**
- * Silero VAD Web Worker implementation
+ * CPU-optimized lip sync inference using wav2arkit_cpu model
  *
- * Runs Silero VAD inference in a dedicated Web Worker to prevent main thread blocking.
- * Uses inline worker script (Blob URL pattern) to avoid separate file deployment.
+ * A Safari/iOS-compatible alternative to Wav2Vec2Inference (384MB) designed
+ * for platforms where WebGPU crashes due to ONNX Runtime JSEP bugs.
  *
- * Key design decisions:
- * - WASM backend only (WebGPU doesn't work in Workers)
- * - LSTM state serialized as Float32Array (Tensors can't cross worker boundary)
- * - Audio copied (not transferred) to retain main thread access for pre-speech buffer
- * - ONNX Runtime loaded from CDN in worker (no bundler complications)
+ * The model uses ONNX external data format:
+ * - wav2arkit_cpu.onnx (1.86MB graph structure)
+ * - wav2arkit_cpu.onnx.data (402MB weights)
+ * Both files are fetched and cached automatically.
+ *
+ * Key differences from Wav2Vec2Inference:
+ * - WASM-only backend (CPU-optimized, single-threaded, no WebGPU)
+ * - No identity input (baked to identity 11)
+ * - No ASR output (lip sync only)
+ * - Dynamic input length (not fixed to 16000 samples)
+ * - Different native blendshape ordering (remapped to LAM_BLENDSHAPES)
  *
  * @category Inference
  *
- * @example Basic usage
+ * @example
  * ```typescript
- * import { SileroVADWorker } from '@omote/core';
+ * import { Wav2ArkitCpuInference } from '@omote/core';
  *
- * const vad = new SileroVADWorker({
- *   modelUrl: '/models/silero-vad.onnx'
+ * const lam = new Wav2ArkitCpuInference({
+ *   modelUrl: '/models/wav2arkit_cpu.onnx',
  * });
- * await vad.load();
+ * await lam.load();
  *
- * // Process 32ms chunks (512 samples at 16kHz)
- * const result = await vad.process(audioChunk);
- * if (result.isSpeech) {
- *   console.log('Speech detected!', result.probability);
- * }
+ * const { blendshapes } = await lam.infer(audioSamples);
+ * // blendshapes: Float32Array[] in LAM_BLENDSHAPES order, 30fps
  * ```
  */
-/**
- * Configuration for Silero VAD Worker
- */
-interface VADWorkerConfig {
-    /** Path or URL to the ONNX model */
+interface Wav2ArkitCpuConfig {
+    /** Path or URL to the wav2arkit_cpu ONNX model (.onnx graph file) */
     modelUrl: string;
-    /** Sample rate (8000 or 16000, default: 16000) */
-    sampleRate?: 8000 | 16000;
-    /** Speech probability threshold (default: 0.5) */
-    threshold?: number;
     /**
-     * Number of audio chunks to keep in pre-speech buffer.
-     * When VAD triggers, these chunks are prepended to the speech buffer
-     * to capture the beginning of speech that occurred before detection.
-     *
-     * At 512 samples/chunk and 16kHz:
-     * - 10 chunks = 320ms of pre-speech audio
-     * - 15 chunks = 480ms of pre-speech audio
+     * Path or URL to external model data file (.onnx.data weights).
+     * Default: `${modelUrl}.data` (e.g., /models/wav2arkit_cpu.onnx.data)
      *
-     * Default: 10 chunks (320ms)
-     */
-    preSpeechBufferChunks?: number;
-}
-/**
- * VAD model loading information from worker
- */
-interface VADWorkerModelInfo {
-    backend: 'wasm';
-    loadTimeMs: number;
-    inputNames: string[];
-    outputNames: string[];
-    sampleRate: number;
-    chunkSize: number;
-}
-/**
- * Silero VAD Worker - Voice Activity Detection in a Web Worker
- *
- * Runs Silero VAD inference off the main thread to prevent UI blocking.
- * Feature parity with SileroVADInference but runs in dedicated worker.
- *
- * @see SileroVADInference for main-thread version
- */
-declare class SileroVADWorker {
-    private worker;
+     * Set to `false` to skip external data loading (single-file models only).
+     */
+    externalDataUrl?: string | false;
+    /** Preferred backend (default: 'wasm' — this model is CPU-optimized) */
+    backend?: BackendPreference;
+}
+declare class Wav2ArkitCpuInference implements LipSyncBackend {
+    readonly modelId: "wav2arkit_cpu";
+    private session;
+    private ort;
     private config;
+    private _backend;
     private isLoading;
-    private _isLoaded;
-    private state;
-    private context;
-    private readonly chunkSize;
-    private readonly contextSize;
     private inferenceQueue;
-    private preSpeechBuffer;
-    private wasSpeaking;
-    private pendingResolvers;
-    private messageId;
-    constructor(config: VADWorkerConfig);
+    private poisoned;
+    private static readonly INFERENCE_TIMEOUT_MS;
+    constructor(config: Wav2ArkitCpuConfig);
+    get backend(): RuntimeBackend | null;
     get isLoaded(): boolean;
     /**
-     * Backend type (always 'wasm' for Worker, WebGPU not supported in Workers)
-     */
-    get backend(): 'wasm' | null;
-    get sampleRate(): number;
-    get threshold(): number;
-    /**
-     * Get required chunk size in samples
-     */
-    getChunkSize(): number;
-    /**
-     * Get chunk duration in milliseconds
-     */
-    getChunkDurationMs(): number;
-    /**
-     * Create the worker from inline script
-     */
-    private createWorker;
-    /**
-     * Handle messages from worker
-     */
-    private handleWorkerMessage;
-    /**
-     * Send message to worker and wait for response
-     */
-    private sendMessage;
-    /**
-     * Load the ONNX model in the worker
-     */
-    load(): Promise<VADWorkerModelInfo>;
-    /**
-     * Reset state for new audio stream
+     * Load the ONNX model
      */
-    reset(): Promise<void>;
+    load(): Promise<LipSyncModelInfo>;
     /**
-     * Process a single audio chunk
+     * Run inference on raw audio
      *
-     * @param audioChunk - Float32Array of exactly chunkSize samples (512 for 16kHz, 256 for 8kHz)
-     * @returns VAD result with speech probability
+     * Accepts variable-length audio (not fixed to 16000 samples).
+     * Output frames = ceil(30 * numSamples / 16000).
+     *
+     * @param audioSamples - Float32Array of raw audio at 16kHz
+     * @param _identityIndex - Ignored (identity 11 is baked into the model)
      */
-    process(audioChunk: Float32Array): Promise<VADResult>;
+    infer(audioSamples: Float32Array, _identityIndex?: number): Promise<LipSyncResult>;
     /**
-     * Queue inference to serialize worker calls
+     * Queue inference to serialize ONNX session calls
      */
     private queueInference;
     /**
-     * Dispose of the worker and free resources
+     * Dispose of the model and free resources
      */
     dispose(): Promise<void>;
-    /**
-     * Check if Web Workers are supported
-     */
-    static isSupported(): boolean;
 }
 /**
- * Factory function for Silero VAD with automatic Worker vs main thread selection
+ * Factory function for lip sync with automatic GPU/CPU model selection
  *
- * Provides a unified API that automatically selects the optimal implementation:
- * - Desktop browsers: Uses SileroVADWorker (off-main-thread inference)
- * - Mobile devices: Uses SileroVADInference (main thread, avoids memory overhead)
- * - Fallback: Gracefully falls back to main thread if Worker fails
+ * Provides a unified API that automatically selects the optimal model:
+ * - Safari (macOS + iOS): Uses Wav2ArkitCpuInference (404MB, WASM)
+ * - Chrome/Firefox/Edge: Uses Wav2Vec2Inference (384MB, WebGPU)
+ * - Fallback: Gracefully falls back to CPU model if GPU model fails to load
+ *
+ * Why two separate models?
+ * Wav2Vec2 (LAM) cannot run on Safari/iOS for two reasons:
+ * 1. Its dual-head transformer graph needs ~750-950MB peak during ORT session
+ *    creation (graph optimization), exceeding iOS WebKit's ~1-1.5GB tab limit.
+ * 2. It ships as a single 384MB .onnx file that must load into JS heap before
+ *    ORT can consume it. iOS WebKit OOMs on this allocation.
+ * wav2arkit_cpu solves both: external data format (1.86MB graph + 402MB weights)
+ * lets ORT load only the tiny graph, then stream weights via URL pass-through
+ * directly into WASM memory. JS heap stays at ~2MB.
  *
  * @category Inference
  *
- * @example Basic usage (auto-detect)
+ * @example Auto-detect (recommended)
  * ```typescript
- * import { createSileroVAD } from '@omote/core';
+ * import { createLipSync } from '@omote/core';
  *
- * const vad = createSileroVAD({
- *   modelUrl: '/models/silero-vad.onnx',
- *   threshold: 0.5,
+ * const lam = createLipSync({
+ *   gpuModelUrl: '/models/unified_wav2vec2_asr_a2e.onnx',
+ *   cpuModelUrl: '/models/wav2arkit_cpu.onnx',
  * });
  *
- * await vad.load();
- * const result = await vad.process(audioChunk);
- * if (result.isSpeech) {
- *   console.log('Speech detected!', result.probability);
- * }
- * ```
- *
- * @example Force worker usage
- * ```typescript
- * const vad = createSileroVAD({
- *   modelUrl: '/models/silero-vad.onnx',
- *   useWorker: true, // Force Worker even on mobile
- * });
+ * await lam.load();
+ * const { blendshapes } = await lam.infer(audioSamples);
  * ```
  *
- * @example Force main thread
+ * @example Force CPU model
  * ```typescript
- * const vad = createSileroVAD({
- *   modelUrl: '/models/silero-vad.onnx',
- *   useWorker: false, // Force main thread
+ * const lam = createLipSync({
+ *   gpuModelUrl: '/models/unified_wav2vec2_asr_a2e.onnx',
+ *   cpuModelUrl: '/models/wav2arkit_cpu.onnx',
+ *   mode: 'cpu',
  * });
  * ```
  */
 /**
- * Common interface for both SileroVADInference and SileroVADWorker
- *
- * This interface defines the shared API that both implementations provide,
- * allowing consumers to use either interchangeably.
+ * Configuration for the lip sync factory
  */
-interface SileroVADBackend {
-    /** Current backend type (webgpu, wasm, or null if not loaded) */
-    readonly backend: RuntimeBackend | null;
-    /** Whether the model is loaded and ready for inference */
-    readonly isLoaded: boolean;
-    /** Audio sample rate (8000 or 16000 Hz) */
-    readonly sampleRate: number;
-    /** Speech detection threshold (0-1) */
-    readonly threshold: number;
-    /**
-     * Load the ONNX model
-     * @returns Model loading information
-     */
-    load(): Promise<VADModelInfo | VADWorkerModelInfo>;
-    /**
-     * Process a single audio chunk
-     * @param audioChunk - Float32Array of exactly chunkSize samples
-     * @returns VAD result with speech probability
-     */
-    process(audioChunk: Float32Array): Promise<VADResult>;
-    /**
-     * Reset state for new audio stream
-     */
-    reset(): void | Promise<void>;
+interface CreateLipSyncConfig {
+    /** URL for the GPU model (Wav2Vec2, used on Chrome/Firefox/Edge) */
+    gpuModelUrl: string;
     /**
-     * Dispose of the model and free resources
+     * URL for GPU model external data file (.onnx.data weights).
+     * Default: `${gpuModelUrl}.data`
+     *
+     * Set to `false` to skip external data loading (single-file models only).
      */
-    dispose(): Promise<void>;
+    gpuExternalDataUrl?: string | false;
+    /** URL for the CPU model (wav2arkit_cpu, used on Safari/iOS) */
+    cpuModelUrl: string;
     /**
-     * Get required chunk size in samples
+     * Model selection mode:
+     * - 'auto': Safari/iOS → CPU, everything else → GPU (default)
+     * - 'gpu': Force GPU model (Wav2Vec2Inference)
+     * - 'cpu': Force CPU model (Wav2ArkitCpuInference)
      */
-    getChunkSize(): number;
+    mode?: 'auto' | 'gpu' | 'cpu';
+    /** Backend preference for GPU model (default: 'auto') */
+    gpuBackend?: BackendPreference;
+    /** Number of identity classes for GPU model (default: 12) */
+    numIdentityClasses?: number;
     /**
-     * Get chunk duration in milliseconds
+     * Fall back to the CPU model if the GPU model fails to load (default: true).
+     * Only applies when mode is 'auto' or 'gpu'.
      */
-    getChunkDurationMs(): number;
-}
-/**
- * Configuration for the Silero VAD factory
- *
- * Extends SileroVADConfig with worker-specific options.
- */
-interface SileroVADFactoryConfig extends SileroVADConfig {
+    fallbackOnError?: boolean;
     /**
-     * Force worker usage (true), main thread (false), or auto-detect (undefined).
-     *
-     * Auto-detection behavior:
-     * - Desktop: Uses Worker (better responsiveness, off-main-thread)
-     * - Mobile: Uses main thread (avoids 5MB memory overhead)
+     * Use Web Worker for CPU model inference (default: false)
      *
-     * You can override this to:
-     * - `true`: Force Worker even on mobile (if you have memory headroom)
-     * - `false`: Force main thread even on desktop (for debugging)
+     * When true, Wav2ArkitCpuWorker is used instead of Wav2ArkitCpuInference,
+     * running inference off the main thread to prevent UI blocking during
+     * model loading and inference.
      *
-     * Default: undefined (auto-detect)
+     * Only applies when the CPU model is selected (mode: 'cpu', auto on Safari/iOS,
+     * or fallback from GPU).
      */
     useWorker?: boolean;
     /**
-     * Fallback to main thread on worker errors.
-     *
-     * When true (default), if the Worker fails to load or encounters an error,
-     * the factory will automatically create a main thread instance instead.
-     *
-     * When false, worker errors will propagate as exceptions.
-     *
-     * Default: true
+     * Unified inference worker instance.
+     * When provided and CPU model is selected, uses Wav2ArkitCpuUnifiedAdapter.
+     * Takes precedence over useWorker setting for the CPU model path.
+     * GPU model (Wav2Vec2) always stays on main thread (WebGPU).
      */
-    fallbackOnError?: boolean;
+    unifiedWorker?: UnifiedInferenceWorker;
 }
 /**
- * Check if the current environment supports VAD Web Workers
- *
- * Requirements:
- * - Worker constructor must exist
- * - Blob URL support (for inline worker script)
- *
- * @returns true if VAD Worker is supported
- */
-declare function supportsVADWorker(): boolean;
-/**
- * Create a Silero VAD instance with automatic implementation selection
- *
- * This factory function automatically selects between:
- * - **SileroVADWorker**: Off-main-thread inference (better for desktop)
- * - **SileroVADInference**: Main thread inference (better for mobile)
- *
- * The selection is based on:
- * 1. Explicit `useWorker` config (if provided)
- * 2. Platform detection (mobile vs desktop)
- * 3. Worker API availability
- *
- * Both implementations share the same interface (SileroVADBackend),
- * so consumers can use either interchangeably.
+ * Create a lip sync instance with automatic GPU/CPU model selection
  *
  * @param config - Factory configuration
- * @returns A SileroVAD instance (either Worker or main thread)
- *
- * @example
- * ```typescript
- * // Auto-detect (recommended)
- * const vad = createSileroVAD({ modelUrl: '/models/silero-vad.onnx' });
- *
- * // Force Worker
- * const vadWorker = createSileroVAD({ modelUrl: '/models/silero-vad.onnx', useWorker: true });
- *
- * // Force main thread
- * const vadMain = createSileroVAD({ modelUrl: '/models/silero-vad.onnx', useWorker: false });
- * ```
+ * @returns A LipSyncBackend instance (either GPU or CPU model)
  */
-declare function createSileroVAD(config: SileroVADFactoryConfig): SileroVADBackend;
+declare function createLipSync(config: CreateLipSyncConfig): LipSyncBackend;
 /**
  * Safari Web Speech API wrapper for iOS speech recognition
@@ -3509,11 +3992,6 @@ declare class InterruptionHandler extends EventEmitter<InterruptionEvents> {
      * @param audioEnergy - Optional RMS energy for logging (default: 0)
      */
     processVADResult(vadProbability: number, audioEnergy?: number): void;
-    /**
-     * @deprecated Use processVADResult() instead. This method uses naive RMS detection.
-     * Process audio samples for VAD (legacy - uses simple RMS)
-     */
-    processAudio(samples: Float32Array | Int16Array): void;
     /**
      * Notify that AI started speaking
      */
@@ -3537,7 +4015,6 @@ declare class InterruptionHandler extends EventEmitter<InterruptionEvents> {
         isSpeaking: boolean;
         speechDurationMs: number;
     };
-    private calculateRMS;
     private onSpeechDetected;
     private onSilenceDetected;
 }
@@ -4713,4 +5190,4 @@ declare class ProceduralLifeLayer {
     private updateBrowNoise;
 }
-export { type AIAdapter, type AIAdapterEvents, AISessionState, ARKIT_BLENDSHAPES, type ActiveSpan, AgentCoreAdapter, type AgentCoreConfig, type AnimationClip, AnimationEvent, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, type AudioSyncConfig, type AudioSyncEvents, AudioSyncManager, type BackendPreference, type BlendWeight, type CTCDecodeResult, CTC_VOCAB, type CacheConfig, type CacheSpanAttributes, ConsoleExporter, type ConversationMessage, ConversationOrchestrator, type ConversationSession, type CreateLipSyncConfig, DEFAULT_ANIMATION_CONFIG, EMOTION_ARKIT_MAP, EMOTION_NAMES, EMOTION_VECTOR_SIZE, type Emotion2VecLabel, type EmotionAnimationMap, type EmotionBlendMode, type EmotionBlendshapeConfig, EmotionController, type EmotionFrame, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, EmotionToBlendshapeMapper, type EmotionWeights, EmphasisDetector, EventEmitter, type FetchWithCacheOptions, type FullFaceFrame, FullFacePipeline, type FullFacePipelineEvents, type FullFacePipelineOptions, INFERENCE_LATENCY_BUCKETS, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, type KaldiFbankOptions, type LAMFrame, LAMPipeline, type LAMPipelineOptions, LAM_BLENDSHAPES, type LifeLayerConfig, type LifeLayerInput, type LifeLayerOutput, type LipSyncBackend, type LipSyncModelInfo, type LipSyncResult, MODEL_LOAD_TIME_BUCKETS, type MessageRole, type MetricData, MetricNames, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, type OrchestratorConfig, type OrchestratorEvents, ProceduralLifeLayer, type QuotaInfo, RingBuffer, type RuntimeBackend, type 
SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SenseVoiceConfig, SenseVoiceInference, type SenseVoiceLanguage, type SenseVoiceModelInfo, type SenseVoiceResult, type SessionConfig, type SessionOptions, type SessionSnapshot, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, SyncedAudioPipeline, type SyncedAudioPipelineEvents, type SyncedAudioPipelineOptions, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TenantConfig, TenantManager, type TenantQuota, type TenantUsage, type TokenRefreshCallback, type Transition, UPPER_FACE_BLENDSHAPES, type UpperFaceBlendshapeName, type UpperFaceBlendshapes, type VADBackend, type VADModelInfo, type VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, type VoiceConfig, WAV2ARKIT_BLENDSHAPES, type Wav2ArkitCpuConfig, Wav2ArkitCpuInference, Wav2Vec2Inference, type Wav2Vec2InferenceConfig, type Wav2Vec2Result, applyCMVN, applyLFR, blendEmotions, calculatePeak, calculateRMS, computeKaldiFbank, configureCacheLimit, configureTelemetry, createEmotionVector, createLipSync, createSessionWithFallback, createSileroVAD, ctcGreedyDecode, fetchWithCache, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getLoadedBackend, getModelCache, getOnnxRuntime, getOnnxRuntimeForPreference, getOptimalWasmThreads, getRecommendedBackend, getSessionOptions, getTelemetry, hasWebGPUApi, isAndroid, isIOS, isIOSSafari, isMobile, isOnnxRuntimeLoaded, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpEmotion, parseCMVNFromMetadata, parseTokensFile, preloadModels, preloadOnnxRuntime, remapWav2ArkitToLam, resolveBackend, resolveLanguageId, resolveTextNormId, shouldEnableWasmProxy, shouldUseCpuLipSync, shouldUseNativeASR, shouldUseServerLipSync, supportsVADWorker, 
symmetrizeBlendshapes };
+export { type AIAdapter, type AIAdapterEvents, AISessionState, ARKIT_BLENDSHAPES, type ActiveSpan, AgentCoreAdapter, type AgentCoreConfig, type AnimationClip, AnimationEvent, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, type AudioSyncConfig, type AudioSyncEvents, AudioSyncManager, type BackendPreference, type BlendWeight, type CTCDecodeResult, CTC_VOCAB, type CacheConfig, type CacheSpanAttributes, ConsoleExporter, type ConversationMessage, ConversationOrchestrator, type ConversationSession, type CreateLipSyncConfig, type CreateSenseVoiceConfig, DEFAULT_ANIMATION_CONFIG, EMOTION_ARKIT_MAP, EMOTION_NAMES, EMOTION_VECTOR_SIZE, type Emotion2VecLabel, type EmotionAnimationMap, type EmotionBlendMode, type EmotionBlendshapeConfig, EmotionController, type EmotionFrame, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, EmotionToBlendshapeMapper, type EmotionWeights, EmphasisDetector, EventEmitter, type FetchWithCacheOptions, type FullFaceFrame, FullFacePipeline, type FullFacePipelineEvents, type FullFacePipelineOptions, INFERENCE_LATENCY_BUCKETS, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, type KaldiFbankOptions, type LAMFrame, LAMPipeline, type LAMPipelineOptions, LAM_BLENDSHAPES, type LifeLayerConfig, type LifeLayerInput, type LifeLayerOutput, type LipSyncBackend, type LipSyncModelInfo, type LipSyncResult, MODEL_LOAD_TIME_BUCKETS, type MessageRole, type MetricData, MetricNames, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, type OrchestratorConfig, type OrchestratorEvents, ProceduralLifeLayer, type QuotaInfo, RingBuffer, type 
RuntimeBackend, type SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SenseVoiceBackend, type SenseVoiceConfig, SenseVoiceInference, type SenseVoiceLanguage, type SenseVoiceModelInfo, type SenseVoiceResult, SenseVoiceUnifiedAdapter, SenseVoiceWorker, type SenseVoiceWorkerConfig, type SessionConfig, type SessionOptions, type SessionSnapshot, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADUnifiedAdapter, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, SyncedAudioPipeline, type SyncedAudioPipelineEvents, type SyncedAudioPipelineOptions, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TenantConfig, TenantManager, type TenantQuota, type TenantUsage, type TokenRefreshCallback, type Transition, UPPER_FACE_BLENDSHAPES, UnifiedInferenceWorker, type UpperFaceBlendshapeName, type UpperFaceBlendshapes, type VADBackend, type VADModelInfo, type VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, type VoiceConfig, WAV2ARKIT_BLENDSHAPES, type Wav2ArkitCpuConfig, Wav2ArkitCpuInference, Wav2ArkitCpuUnifiedAdapter, Wav2ArkitCpuWorker, type Wav2ArkitCpuWorkerConfig, Wav2Vec2Inference, type Wav2Vec2InferenceConfig, type Wav2Vec2Result, applyCMVN, applyLFR, blendEmotions, calculatePeak, calculateRMS, computeKaldiFbank, configureCacheLimit, configureTelemetry, createEmotionVector, createLipSync, createSenseVoice, createSessionWithFallback, createSileroVAD, ctcGreedyDecode, fetchWithCache, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getLoadedBackend, getModelCache, getOnnxRuntime, getOnnxRuntimeForPreference, getOptimalWasmThreads, getRecommendedBackend, getSessionOptions, getTelemetry, hasWebGPUApi, isAndroid, isIOS, isIOSSafari, isMobile, isOnnxRuntimeLoaded, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, 
lerpEmotion, parseCMVNFromMetadata, parseTokensFile, preloadModels, preloadOnnxRuntime, remapWav2ArkitToLam, resolveBackend, resolveLanguageId, resolveTextNormId, shouldEnableWasmProxy, shouldUseCpuLipSync, shouldUseNativeASR, shouldUseServerLipSync, supportsVADWorker, symmetrizeBlendshapes };