npm - @omote/core - Versions diffs - 0.4.7 → 0.5.3 - Mend

@omote/core 0.4.7 → 0.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

package/dist/index.d.ts — CHANGED

@@ -1,6 +1,5 @@
 import { EventEmitter, OmoteEvents, AISessionState, AnimationEvent } from './events/index.js';
 export { BackendEvent, EmotionEvent, GazeEvent, STTFinalEvent, STTPartialEvent, SessionStateEvent, TTSEndEvent, TTSMarkEvent, TTSStartEvent, VisemeEvent } from './events/index.js';
-import { InferenceSession, Tensor, Env } from 'onnxruntime-common';
 export { D as DEFAULT_LOGGING_CONFIG, I as ILogger, e as LOG_LEVEL_PRIORITY, b as LogEntry, L as LogFormatter, a as LogLevel, c as LogSink, d as LoggingConfig, f as configureLogging, i as createLogger, g as getLoggingConfig, n as noopLogger, r as resetLoggingConfig, s as setLogLevel, h as setLoggingEnabled } from './Logger-I_k4sGhM.js';
 export { ARKitToFLAMEMapping, ApiError, AudioChunkEvent, AvatarFormat, Character, CharacterAvatar, CharacterMemory, CharacterPersonality, CharacterSpec, CharacterVoice, CreateCharacterRequest, CreateCharacterResponse, CreateLAMJobRequest, CreateLAMJobResponse, CreateSessionRequest, CreateSessionResponse, GSplatConfig, LAMJob, LAMJobStatus, PROTOCOL_VERSION, PaginatedResponse, PlatformSession, ErrorEvent as ProtocolErrorEvent, ProtocolEvent, ResponseChunkEvent, ResponseEndEvent, ResponseStartEvent, SessionMessage, SessionStatus, isProtocolEvent } from '@omote/types';
@@ -379,7 +378,7 @@ declare function shouldEnableWasmProxy(): boolean;
  */
 declare function isSafari(): boolean;
 /**
- * Recommend using CPU-optimized lip sync model (wav2arkit_cpu)
+ * Recommend using CPU-optimized A2E model (wav2arkit_cpu)
  *
  * All iOS browsers use WebKit and have tight memory limits — the 384MB
  * LAM model causes silent crashes. wav2arkit_cpu uses URL pass-through
@@ -390,7 +389,7 @@ declare function isSafari(): boolean;
  *
  * @returns true if iOS (any browser) or Safari (any platform)
  */
-declare function shouldUseCpuLipSync(): boolean;
+declare function shouldUseCpuA2E(): boolean;
 /**
  * Check if Web Speech API is available in the browser
  *
@@ -415,18 +414,18 @@ declare function shouldUseNativeASR(): boolean;
 /**
  * Recommend using server-side LAM over client-side on iOS
  *
- * On iOS, LAM lip sync via WASM takes ~332ms per second of audio (3.3x over target).
+ * On iOS, LAM A2E via WASM takes ~332ms per second of audio (3.3x over target).
  * Server-side inference with GPU can achieve ~50ms, providing:
- * - Real-time lip sync (under 100ms target)
+ * - Real-time A2E (under 100ms target)
  * - Reduced iOS device thermal/battery impact
  * - Better user experience
  *
- * @returns true if on iOS (should use server-side lip sync)
+ * @returns true if on iOS (should use server-side A2E)
  */
-declare function shouldUseServerLipSync(): boolean;
+declare function shouldUseServerA2E(): boolean;
 /**
- * Common interface for lip sync inference backends
+ * Common interface for audio-to-expression (A2E) inference backends
  *
  * Both Wav2Vec2Inference (GPU, 384MB) and Wav2ArkitCpuInference (CPU, 404MB)
  * implement this interface, allowing SyncedAudioPipeline and LAMPipeline to
@@ -438,19 +437,19 @@ declare function shouldUseServerLipSync(): boolean;
 /**
  * Model loading information returned by load()
  */
-interface LipSyncModelInfo {
+interface A2EModelInfo {
     backend: RuntimeBackend;
     loadTimeMs: number;
     inputNames: string[];
     outputNames: string[];
 }
 /**
- * Result from lip sync inference
+ * Result from A2E inference
  *
  * All implementations must return blendshapes in LAM_BLENDSHAPES order (alphabetical).
  * Models with different native orderings must remap internally before returning.
  */
-interface LipSyncResult {
+interface A2EResult {
     /** Blendshape weights [frames, 52] in LAM_BLENDSHAPES order - 30fps */
     blendshapes: Float32Array[];
     /** Number of blendshape frames */
@@ -459,31 +458,33 @@ interface LipSyncResult {
     inferenceTimeMs: number;
 }
 /**
- * Common interface for lip sync inference engines
+ * Common interface for A2E (audio-to-expression) inference engines
  *
  * Implemented by:
- * - Wav2Vec2Inference (WebGPU/WASM, 384MB, ASR + lip sync)
- * - Wav2ArkitCpuInference (WASM-only, 404MB, lip sync only)
+ * - Wav2Vec2Inference (WebGPU/WASM, 384MB, ASR + A2E)
+ * - Wav2ArkitCpuInference (WASM-only, 404MB, A2E only)
  */
-interface LipSyncBackend {
+interface A2EBackend {
     /** Model identifier for backend-specific tuning (e.g. audio delay) */
     readonly modelId: 'wav2vec2' | 'wav2arkit_cpu';
     /** Current backend type (webgpu, wasm, or null if not loaded) */
     readonly backend: RuntimeBackend | null;
     /** Whether the model is loaded and ready for inference */
     readonly isLoaded: boolean;
+    /** Optimal number of audio samples per inference call (e.g. 16000 = 1s at 16kHz) */
+    readonly chunkSize: number;
     /**
      * Load the ONNX model
      * @returns Model loading information
      */
-    load(): Promise<LipSyncModelInfo>;
+    load(): Promise<A2EModelInfo>;
     /**
      * Run inference on raw audio
      * @param audioSamples - Float32Array of raw audio at 16kHz
      * @param identityIndex - Optional identity index (ignored by CPU model)
-     * @returns Lip sync result with blendshapes in LAM_BLENDSHAPES order
+     * @returns A2E result with blendshapes in LAM_BLENDSHAPES order
      */
-    infer(audioSamples: Float32Array, identityIndex?: number): Promise<LipSyncResult>;
+    infer(audioSamples: Float32Array, identityIndex?: number): Promise<A2EResult>;
     /**
      * Dispose of the model and free resources
      */
@@ -491,542 +492,16 @@ interface LipSyncBackend {
 }
 /**
- * LAMPipeline - Coordinate LAM (Wav2Vec2) inference with frame synchronization
+ * FullFacePipeline - A2E expression pipeline with ExpressionProfile weight scaling
  *
- * Manages the buffering and processing pipeline for LAM lip sync:
- * 1. Accumulates audio samples in a ring buffer
- * 2. Triggers LAM inference when buffer reaches required size (16000 samples @ 16kHz = 1.0s)
- * 3. Queues resulting blendshape frames with precise timestamps
- * 4. Provides frames synchronized to AudioContext clock
+ * Orchestrates full-face animation by:
+ * 1. Scheduling audio for playback immediately (audio-first, never waits for A2E)
+ * 2. Running A2E inference in background (fire-and-forget via A2EProcessor)
+ * 3. Applying per-character ExpressionProfile scaling to raw A2E output
  *
- * Key Design Decisions:
- * - Ring buffer pattern for efficient sample accumulation (no allocation churn)
- * - Frame queue with timestamps for deterministic playback
- * - Timestamp-based frame retrieval (not callback) for renderer flexibility
- *
- * Based on patterns from Chrome Audio Worklet design and Web Audio clock management.
- *
- * @see https://developer.chrome.com/blog/audio-worklet-design-pattern
- * @category Audio
- */
-interface LAMFrame {
-    /** 52 ARKit blendshape weights */
-    frame: Float32Array;
-    /** AudioContext time when this frame should be displayed */
-    timestamp: number;
-}
-interface LAMPipelineOptions {
-    /**
-     * Sample rate in Hz (must match audio playback)
-     * Default: 16000
-     */
-    sampleRate?: number;
-    /**
-     * LAM inference callback
-     * Called each time LAM processes a buffer
-     */
-    onInference?: (frameCount: number) => void;
-    /**
-     * Error callback for inference failures
-     */
-    onError?: (error: Error) => void;
-}
-declare class LAMPipeline {
-    private readonly options;
-    private readonly REQUIRED_SAMPLES;
-    private readonly FRAME_RATE;
-    private buffer;
-    private bufferStartTime;
-    private frameQueue;
-    /**
-     * Last successfully retrieved frame
-     * Used as fallback when no new frame is available to prevent avatar freezing
-     */
-    private lastFrame;
-    constructor(options?: LAMPipelineOptions);
-    /**
-     * Push audio samples into the pipeline
-     *
-     * Accumulates samples and triggers LAM inference when buffer is full.
-     * Multiple calls may be needed to accumulate enough samples.
-     *
-     * @param samples - Float32Array of audio samples
-     * @param timestamp - AudioContext time when these samples start playing
-     * @param lam - LAM inference engine
-     */
-    push(samples: Float32Array, timestamp: number, lam: LipSyncBackend): Promise<void>;
-    /**
-     * Process accumulated buffer through LAM inference
-     */
-    private processBuffer;
-    /**
-     * Get the frame that should be displayed at the current time
-     *
-     * Automatically removes frames that have already been displayed.
-     * This prevents memory leaks from accumulating old frames.
-     *
-     * Discard Window (prevents premature frame discarding):
-     * - WebGPU: 0.5s (LAM inference 20-100ms + RAF jitter + React stalls)
-     * - WASM: 1.0s (LAM inference 50-500ms + higher variability)
-     *
-     * Last-Frame-Hold: Returns last valid frame instead of null to prevent
-     * avatar freezing when between frames (RAF at 60fps vs LAM at 30fps).
-     *
-     * @param currentTime - Current AudioContext time
-     * @param lam - LAM inference engine (optional, for backend detection)
-     * @returns Current frame, or last frame as fallback, or null if no frames yet
-     */
-    getFrameForTime(currentTime: number, lam?: {
-        backend: 'webgpu' | 'wasm' | null;
-    }): Float32Array | null;
-    /**
-     * Get all frames in the queue (for debugging/monitoring)
-     */
-    getQueuedFrames(): LAMFrame[];
-    /**
-     * Get current buffer fill level (0-1)
-     */
-    get fillLevel(): number;
-    /**
-     * Get number of frames queued
-     */
-    get queuedFrameCount(): number;
-    /**
-     * Get buffered audio duration in seconds
-     */
-    get bufferedDuration(): number;
-    /**
-     * Flush remaining buffered audio
-     *
-     * Processes any remaining audio in the buffer, even if less than REQUIRED_SAMPLES.
-     * This ensures the final audio chunk generates blendshape frames.
-     *
-     * Should be called when audio stream ends to prevent losing the last 0-1 seconds.
-     *
-     * @param lam - LAM inference engine
-     */
-    flush(lam: LipSyncBackend): Promise<void>;
-    /**
-     * Adjust all queued frame timestamps by an offset
-     *
-     * Used for synchronization when audio scheduling time differs from
-     * the estimated time used during LAM processing.
-     *
-     * @param offset - Time offset in seconds to add to all timestamps
-     */
-    adjustTimestamps(offset: number): void;
-    /**
-     * Reset the pipeline
-     */
-    reset(): void;
-}
-/**
- * SyncedAudioPipeline - Audio playback + LAM lip sync coordinator
- *
- * Orchestrates the complete pipeline for synchronized audio playback and lip sync:
- * 1. Network chunks → Coalescer → Optimized buffers
- * 2. Audio buffers → Scheduler → Gapless playback (immediate, never blocks)
- * 3. Audio buffers → LAM Pipeline → Blendshape frames (background, fire-and-forget)
- * 4. Frames synchronized to AudioContext clock → Renderer
- *
- * Key Architecture Pattern: Audio-First, LAM-Background
- * - Audio chunks are scheduled for playback immediately (never waits for LAM)
- * - LAM inference runs in background without blocking the audio path
- * - Lip sync starts ~1 second after audio (LAM needs 16000 samples to infer)
- * - Once LAM catches up, frames stay synchronized to AudioContext clock
- *
- * This decoupled design prevents LAM inference (50-300ms) from blocking audio
- * scheduling, which caused audible stuttering when audio arrived as a continuous
- * stream (e.g., single-call TTS from ElevenLabs via AgentCore).
- *
- * @see https://web.dev/articles/audio-scheduling (Web Audio clock patterns)
- * @category Audio
- */
-interface SyncedAudioPipelineOptions {
-    /** Sample rate in Hz (default: 16000) */
-    sampleRate?: number;
-    /** Target chunk duration in ms for coalescing (default: 200) */
-    chunkTargetMs?: number;
-    /** LAM inference engine */
-    lam: LipSyncBackend;
-    /**
-     * Audio playback delay in ms before first audio plays.
-     * Gives LAM inference time to pre-compute blendshapes.
-     * Default: auto-detected from lam.backend (50ms WebGPU, 350ms WASM).
-     */
-    audioDelayMs?: number;
-}
-interface SyncedAudioPipelineEvents {
-    /** New frame ready for display */
-    frame_ready: Float32Array;
-    /** Playback has completed */
-    playback_complete: void;
-    /** First audio chunk scheduled, playback starting */
-    playback_start: number;
-    /** Error occurred */
-    error: Error;
-    /** Index signature for EventEmitter compatibility */
-    [key: string]: unknown;
-}
-declare class SyncedAudioPipeline extends EventEmitter<SyncedAudioPipelineEvents> {
-    private readonly options;
-    private scheduler;
-    private coalescer;
-    private lamPipeline;
-    private playbackStarted;
-    private monitorInterval;
-    private frameAnimationId;
-    constructor(options: SyncedAudioPipelineOptions);
-    /**
-     * Initialize the pipeline
-     */
-    initialize(): Promise<void>;
-    /**
-     * Start a new playback session
-     *
-     * Resets all state and prepares for incoming audio chunks.
-     * Audio will be scheduled immediately as chunks arrive (no buffering).
-     */
-    start(): void;
-    /**
-     * Receive audio chunk from network
-     *
-     * Audio-first design: schedules audio immediately, LAM runs in background.
-     * This prevents LAM inference (50-300ms) from blocking audio scheduling,
-     * which caused audible stuttering with continuous audio streams.
-     *
-     * @param chunk - Uint8Array containing Int16 PCM audio
-     */
-    onAudioChunk(chunk: Uint8Array): Promise<void>;
-    /**
-     * End of audio stream
-     *
-     * Flushes any remaining buffered data.
-     */
-    end(): Promise<void>;
-    /**
-     * Stop playback immediately with smooth fade-out
-     *
-     * Gracefully cancels all audio playback and LAM processing:
-     * - Fades out audio over specified duration (default: 50ms)
-     * - Cancels pending LAM inferences
-     * - Clears all buffers and queues
-     * - Emits 'playback_complete' event
-     *
-     * Use this for interruptions (e.g., user barge-in during AI speech).
-     *
-     * @param fadeOutMs - Fade-out duration in milliseconds (default: 50ms)
-     * @returns Promise that resolves when fade-out completes
-     */
-    stop(fadeOutMs?: number): Promise<void>;
-    /**
-     * Start frame animation loop
-     *
-     * Uses requestAnimationFrame to check for new LAM frames.
-     * Synchronized to AudioContext clock (not visual refresh rate).
-     *
-     * Frame Emission Strategy:
-     * - LAMPipeline uses last-frame-hold to prevent null returns
-     * - Always emit frames (even repeated frames) to maintain smooth animation
-     * - Renderer is responsible for detecting duplicate frames if needed
-     */
-    private startFrameLoop;
-    /**
-     * Start monitoring for playback completion
-     */
-    private startMonitoring;
-    /**
-     * Stop monitoring
-     */
-    private stopMonitoring;
-    /**
-     * Get current pipeline state (for debugging/monitoring)
-     */
-    getState(): {
-        playbackStarted: boolean;
-        coalescerFill: number;
-        lamFill: number;
-        queuedFrames: number;
-        currentTime: number;
-        playbackEndTime: number;
-    };
-    /**
-     * Cleanup resources
-     */
-    dispose(): void;
-}
-/**
- * Emotion to ARKit Blendshape Mapper
- *
- * Converts Emotion2VecInference output to upper face ARKit blendshapes for
- * expressive avatar animation. Maps 4 emotion categories (neutral, happy, angry, sad)
- * to 11 upper face blendshapes (brows, eyes, cheeks).
- *
- * Supports two blend modes:
- * - 'dominant': Uses only the strongest emotion (simpler, more stable)
- * - 'weighted': Blends all emotions by probability (more nuanced, e.g., bittersweet)
- *
- * Also supports energy modulation to scale emotion intensity by audio energy,
- * making expressions stronger during emphasized speech.
- *
- * @example Basic usage
- * ```typescript
- * import { EmotionToBlendshapeMapper } from '@omote/core';
- * import { Emotion2VecInference } from '@omote/core';
- *
- * const emotion = new Emotion2VecInference({ modelUrl: '/models/emotion.onnx' });
- * const mapper = new EmotionToBlendshapeMapper();
- *
- * // Process emotion frame
- * const result = await emotion.infer(audioSamples);
- * const blendshapes = mapper.mapFrame(result.dominant);
- *
- * // Apply to avatar
- * for (const [name, value] of Object.entries(blendshapes)) {
- *   avatar.setBlendshape(name, value);
- * }
- * ```
- *
- * @example Weighted blending for nuanced expressions
- * ```typescript
- * const mapper = new EmotionToBlendshapeMapper({
- *   blendMode: 'weighted',
- *   minBlendProbability: 0.1,
- * });
- *
- * // Frame with mixed emotions: { happy: 0.6, sad: 0.3, neutral: 0.1 }
- * // Result: bittersweet expression (smiling but worried brow)
- * const blendshapes = mapper.mapFrame(emotionFrame);
- * ```
- *
- * @example Energy-modulated emotion
- * ```typescript
- * import { AudioEnergyAnalyzer } from '@omote/core';
- *
- * const energyAnalyzer = new AudioEnergyAnalyzer();
- * const mapper = new EmotionToBlendshapeMapper({ energyModulation: true });
- *
- * // In animation loop
- * function animate(audioChunk: Float32Array, emotionFrame: EmotionFrame) {
- *   const { energy } = energyAnalyzer.analyze(audioChunk);
- *   mapper.mapFrame(emotionFrame, energy); // Louder = stronger emotion
- *   mapper.update(16);
- *   applyToAvatar(mapper.getCurrentBlendshapes());
- * }
- * ```
- *
- * @module animation
- */
-declare const EMOTION2VEC_LABELS: readonly ["neutral", "happy", "angry", "sad"];
-type Emotion2VecLabel = (typeof EMOTION2VEC_LABELS)[number];
-interface EmotionFrame {
-    /** Primary emotion label */
-    emotion: Emotion2VecLabel;
-    /** Confidence for primary emotion (0-1) */
-    confidence: number;
-    /** All emotion probabilities */
-    probabilities: Record<Emotion2VecLabel, number>;
-}
-/**
- * Upper face ARKit blendshape names (11 total)
- *
- * These blendshapes control the upper face (brows, eyes, cheeks) and are
- * driven by emotion detection, complementing the mouth blendshapes from
- * LAM lip sync.
- */
-declare const UPPER_FACE_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "cheekSquintLeft", "cheekSquintRight"];
-type UpperFaceBlendshapeName = (typeof UPPER_FACE_BLENDSHAPES)[number];
-/**
- * Upper face blendshape values (0-1 for each)
- */
-type UpperFaceBlendshapes = Record<UpperFaceBlendshapeName, number>;
-/**
- * Blend mode for combining emotions
- * - 'dominant': Use only the strongest emotion (default, more stable)
- * - 'weighted': Blend all emotions by probability (more nuanced)
- */
-type EmotionBlendMode = 'dominant' | 'weighted';
-/**
- * Emotion to ARKit blendshape mapping
- *
- * Based on Paul Ekman's FACS (Facial Action Coding System) research:
- *
- * - Happy (AU6+AU12): Cheek raise + lip corner pull (Duchenne smile)
- *   Upper face: cheekSquint (AU6) + slight eyeSquint from genuine smile
- *
- * - Angry (AU4+AU5+AU7+AU23): Brow lower + eye wide + lid tighten + lip press
- *   Upper face: browDown (AU4) + eyeWide (AU5) + eyeSquint (AU7) creates the "glare"
- *
- * - Sad (AU1+AU4+AU15): Inner brow raise + brow furrow + lip corner depress
- *   Upper face: browInnerUp (AU1) + browDown (AU4) creates the worried/sad brow
- *
- * - Neutral: All zeros (no expression overlay)
- *
- * @see https://imotions.com/blog/learning/research-fundamentals/facial-action-coding-system/
- * @see https://melindaozel.com/arkit-to-facs-cheat-sheet/
- */
-declare const EMOTION_ARKIT_MAP: Record<Emotion2VecLabel, Partial<UpperFaceBlendshapes>>;
-/**
- * Configuration for EmotionToBlendshapeMapper
- */
-interface EmotionBlendshapeConfig {
-    /**
-     * Smoothing factor for exponential moving average (0-1)
-     * Lower = slower, smoother transitions
-     * Higher = faster, more responsive
-     * @default 0.15
-     */
-    smoothingFactor?: number;
-    /**
-     * Minimum confidence threshold for emotion to take effect
-     * Emotions below this confidence are treated as neutral
-     * @default 0.3
-     */
-    confidenceThreshold?: number;
-    /**
-     * Global intensity multiplier for all blendshapes (0-2)
-     * @default 1.0
-     */
-    intensity?: number;
-    /**
-     * Blend mode for combining emotions
-     * - 'dominant': Use only the strongest emotion (default)
-     * - 'weighted': Blend all emotions by probability
-     * @default 'dominant'
-     */
-    blendMode?: EmotionBlendMode;
-    /**
-     * Minimum probability for an emotion to contribute in weighted blend mode
-     * Emotions with probability below this are ignored
-     * @default 0.1
-     */
-    minBlendProbability?: number;
-    /**
-     * Enable energy modulation - scale emotion intensity by audio energy
-     * When enabled, louder speech produces stronger expressions
-     * @default false
-     */
-    energyModulation?: boolean;
-    /**
-     * Minimum energy scale when energy modulation is enabled (0-1)
-     * At zero audio energy, emotion intensity is scaled by this factor
-     * @default 0.3
-     */
-    minEnergyScale?: number;
-    /**
-     * Maximum energy scale when energy modulation is enabled (0-2)
-     * At maximum audio energy, emotion intensity is scaled by this factor
-     * @default 1.0
-     */
-    maxEnergyScale?: number;
-}
-/**
- * EmotionToBlendshapeMapper
- *
- * Converts emotion detection output to upper face ARKit blendshapes.
- * Provides smooth transitions between emotion states using exponential
- * moving average interpolation.
- *
- * Supports two blend modes:
- * - 'dominant': Uses only the strongest emotion
- * - 'weighted': Blends all emotions by probability for nuanced expressions
- *
- * Also supports energy modulation to scale emotion intensity by audio energy.
- */
-declare class EmotionToBlendshapeMapper {
-    private config;
-    private targetBlendshapes;
-    private currentBlendshapes;
-    private currentEnergy;
-    /**
-     * Create a new EmotionToBlendshapeMapper
-     *
-     * @param config - Optional configuration
-     */
-    constructor(config?: EmotionBlendshapeConfig);
-    /**
-     * Map an emotion frame to target blendshapes
-     *
-     * This sets the target values that the mapper will smoothly interpolate
-     * towards. Call update() each frame to apply smoothing.
-     *
-     * @param frame - Emotion frame from Emotion2VecInference
-     * @param audioEnergy - Optional audio energy (0-1) for energy modulation
-     * @returns Target upper face blendshapes (before smoothing)
-     */
-    mapFrame(frame: EmotionFrame, audioEnergy?: number): UpperFaceBlendshapes;
-    /**
-     * Map using dominant emotion only (original behavior)
-     */
-    private mapFrameDominant;
-    /**
-     * Map using weighted blend of all emotions by probability
-     * Creates more nuanced expressions (e.g., bittersweet = happy + sad)
-     */
-    private mapFrameWeighted;
-    /**
-     * Apply energy modulation to scale emotion intensity by audio energy
-     * Louder speech = stronger expressions
-     */
-    private applyEnergyModulation;
-    /**
-     * Apply smoothing to interpolate current values towards target
-     *
-     * Uses exponential moving average:
-     * current = current + smoothingFactor * (target - current)
-     *
-     * @param _deltaMs - Delta time in milliseconds (reserved for future time-based smoothing)
-     */
-    update(_deltaMs: number): void;
-    /**
-     * Get current smoothed blendshape values
-     *
-     * @returns Current upper face blendshapes (after smoothing)
-     */
-    getCurrentBlendshapes(): UpperFaceBlendshapes;
-    /**
-     * Reset mapper to neutral state
-     *
-     * Sets both target and current blendshapes to zero.
-     */
-    reset(): void;
-    /**
-     * Get current configuration
-     */
-    getConfig(): Required<EmotionBlendshapeConfig>;
-    /**
-     * Update configuration
-     *
-     * @param config - Partial configuration to update
-     */
-    setConfig(config: Partial<EmotionBlendshapeConfig>): void;
-}
-/**
- * FullFacePipeline - Combined LAM lip sync + Emotion upper face pipeline
- *
- * Orchestrates full-face animation by combining:
- * 1. LAM lip sync (52 ARKit blendshapes) via audio-first scheduling
- * 2. Emotion labels (from backend LLM or `setEmotionLabel()`) for upper face
- * 3. AudioEnergyAnalyzer for prosody-driven fallback when no emotion label is set
- *
- * Architecture: Audio-First, LAM-Background (same as SyncedAudioPipeline)
- * - Audio chunks are scheduled for playback immediately (never waits for LAM)
- * - LAM inference runs in background without blocking the audio path
- * - Lip sync starts ~1 second after audio (LAM needs 16000 samples to infer)
- *
- * Merge Strategy:
- * - Lower face (41 blendshapes): 100% from LAM (mouth, jaw, tongue, etc.)
- * - Upper face (11 blendshapes): Emotion overlay with LAM as subtle fallback
- *   Formula: emotion * emotionBlendFactor + lam * lamBlendFactor
- *
- * Emotion Sources (in priority order):
- * 1. `setEmotionLabel()` — explicit label from backend LLM (recommended)
- * 2. Prosody fallback — subtle brow movement from audio energy (automatic)
+ * The A2E model outputs all 52 ARKit blendshapes from audio — brows, eyes, cheeks,
+ * mouth, jaw, everything. ExpressionProfile allows per-character weight scaling
+ * by group (eyes, brows, jaw, mouth, cheeks, nose, tongue) with per-blendshape overrides.
  *
  * @category Audio
  *
@@ -1036,8 +511,7 @@ declare class EmotionToBlendshapeMapper {
  *
  * const pipeline = new FullFacePipeline({
  *   lam,
- *   emotionBlendFactor: 0.8,
- *   lamBlendFactor: 0.2,
+ *   profile: { mouth: 1.2, brows: 0.8 },
  * });
  * await pipeline.initialize();
  *
@@ -1046,11 +520,41 @@ declare class EmotionToBlendshapeMapper {
  * });
  *
  * pipeline.start();
- * pipeline.setEmotionLabel('happy'); // From backend LLM
  * await pipeline.onAudioChunk(audioData);
  * ```
  */
+type BlendshapeGroup = 'eyes' | 'brows' | 'jaw' | 'mouth' | 'cheeks' | 'nose' | 'tongue';
+/**
+ * Per-character weight scaling for A2E blendshape output.
+ *
+ * Group scalers multiply all blendshapes in that group (default 1.0).
+ * Per-blendshape overrides take priority over group scalers.
+ * Final values are clamped to [0, 1].
+ */
+interface ExpressionProfile {
+    /** eyeBlink*, eyeLook*, eyeSquint*, eyeWide* (14 blendshapes) */
+    eyes?: number;
+    /** browDown*, browInnerUp, browOuterUp* (5 blendshapes) */
+    brows?: number;
+    /** jawForward, jawLeft, jawRight, jawOpen (4 blendshapes) */
+    jaw?: number;
+    /** mouth* (23 blendshapes) */
+    mouth?: number;
+    /** cheekPuff, cheekSquint* (3 blendshapes) */
+    cheeks?: number;
+    /** noseSneer* (2 blendshapes) */
+    nose?: number;
+    /** tongueOut (1 blendshape) */
+    tongue?: number;
+    /** Per-blendshape overrides (0-2). Takes priority over group scalers. */
+    overrides?: Partial<Record<string, number>>;
+}
+/**
+ * Map each LAM_BLENDSHAPES entry to its BlendshapeGroup.
+ * Built once at module load from prefix matching.
+ */
+declare const BLENDSHAPE_TO_GROUP: Map<string, BlendshapeGroup>;
 /**
  * Configuration for FullFacePipeline
  */
@@ -1061,37 +565,43 @@ interface FullFacePipelineOptions {
     chunkTargetMs?: number;
     /**
      * Audio playback delay in ms before first audio plays.
-     * Gives LAM inference time to pre-compute blendshapes.
-     * Default: auto-detected from lam.backend (50ms WebGPU, 350ms WASM).
+     * Gives A2E inference time to pre-compute blendshapes before audio
+     * starts, preventing frame drops/desync. Must be ≥ chunkSize
+     * accumulation time + inference latency.
+     *
+     * Default: auto-calculated from chunkSize and backend type.
      */
     audioDelayMs?: number;
-    /** LAM inference engine */
-    lam: LipSyncBackend;
     /**
-     * Emotion blend factor for upper face blendshapes (0-1)
-     * Higher values give more weight to emotion detection
-     * @default 0.8
+     * A2E inference chunk size in samples.
+     * Controls how many samples accumulate before each inference call.
+     * Smaller = lower latency (less delay before first frame), more overhead.
+     * Larger = higher latency, less overhead.
+     *
+     * Default: 16000 (1s) — the model's native window size.
+     * Smaller chunks get zero-padded, causing near-zero blendshape output.
      */
-    emotionBlendFactor?: number;
+    chunkSize?: number;
+    /** A2E inference engine */
+    lam: A2EBackend;
+    /** Per-character expression weight scaling */
+    profile?: ExpressionProfile;
     /**
-     * LAM blend factor for upper face blendshapes (0-1)
-     * Provides subtle fallback from LAM when emotion is weak
-     * @default 0.2
+     * Time in ms with no new inference frames before logging a stale warning.
+     *
+     * Must be larger than the inter-batch gap (chunkSize/sampleRate + inference time).
+     * Default: 2000
      */
-    lamBlendFactor?: number;
+    staleThresholdMs?: number;
 }
 /**
- * Full face frame with merged blendshapes and emotion data
+ * Full face frame with scaled blendshapes
  */
 interface FullFaceFrame {
-    /** Merged 52 ARKit blendshapes (lower face from LAM + upper face from emotion) */
+    /** Scaled 52 ARKit blendshapes (ExpressionProfile applied) */
     blendshapes: Float32Array;
-    /** Original LAM blendshapes (52) */
-    lamBlendshapes: Float32Array;
-    /** Emotion-driven upper face blendshapes (11) */
-    emotionBlendshapes: UpperFaceBlendshapes;
-    /** Raw emotion frame data */
-    emotion: EmotionFrame | null;
+    /** Raw A2E output (52 blendshapes, before profile scaling) */
+    rawBlendshapes: Float32Array;
     /** AudioContext timestamp for this frame */
     timestamp: number;
 }
@@ -1103,8 +613,6 @@ interface FullFacePipelineEvents {
     full_frame_ready: FullFaceFrame;
     /** Raw LAM frame ready (for debugging/monitoring) */
     lam_frame_ready: Float32Array;
-    /** Emotion frame ready (for debugging/monitoring) */
-    emotion_frame_ready: EmotionFrame;
     /** Playback has completed */
     playback_complete: void;
     /** First frame ready, playback starting */
@@ -1115,53 +623,45 @@ interface FullFacePipelineEvents {
     [key: string]: unknown;
 }
 /**
- * FullFacePipeline - Unified LAM + Emotion animation pipeline
+ * FullFacePipeline - A2E animation pipeline with ExpressionProfile scaling
  *
  * Audio-first design matching SyncedAudioPipeline:
- * - Audio is scheduled immediately (never waits for LAM)
- * - LAM runs in background (fire-and-forget)
- * - Emotion from setEmotionLabel() or prosody fallback
+ * - Audio is scheduled immediately (never waits for A2E)
+ * - A2E runs in background (fire-and-forget via A2EProcessor)
+ * - ExpressionProfile scales raw A2E output per-character
  */
 declare class FullFacePipeline extends EventEmitter<FullFacePipelineEvents> {
     private readonly options;
     private scheduler;
     private coalescer;
-    private lamPipeline;
-    private emotionMapper;
-    private energyAnalyzer;
+    private processor;
     private playbackStarted;
     private monitorInterval;
     private frameAnimationId;
-    private lastEmotionFrame;
-    private currentAudioEnergy;
     private lastNewFrameTime;
     private lastKnownLamFrame;
     private staleWarningEmitted;
-    private static readonly STALE_FRAME_THRESHOLD_MS;
-    private emotionBlendFactor;
-    private lamBlendFactor;
+    private readonly staleThresholdMs;
+    private frameLoopCount;
+    private profile;
     constructor(options: FullFacePipelineOptions);
     /**
      * Initialize the pipeline
      */
     initialize(): Promise<void>;
     /**
-     * Set emotion label from backend (e.g., LLM response emotion).
-     *
-     * Converts a natural language emotion label into an EmotionFrame
-     * that drives upper face blendshapes for the duration of the utterance.
-     *
-     * Supported labels: happy, excited, joyful, sad, melancholic, angry,
-     * frustrated, neutral, etc.
-     *
-     * @param label - Emotion label string (case-insensitive)
+     * Update the ExpressionProfile at runtime (e.g., character switch).
      */
-    setEmotionLabel(label: string): void;
+    setProfile(profile: ExpressionProfile): void;
     /**
-     * Clear any set emotion label.
-     * Falls back to prosody-only upper face animation.
+     * Apply ExpressionProfile scaling to raw A2E blendshapes.
+     *
+     * For each blendshape:
+     * 1. If an override exists for the blendshape name, use override as scaler
+     * 2. Otherwise, use the group scaler (default 1.0)
+     * 3. Clamp result to [0, 1]
      */
-    clearEmotionLabel(): void;
+    applyProfile(raw: Float32Array): Float32Array;
     /**
      * Start a new playback session
      *
@@ -1172,29 +672,18 @@ declare class FullFacePipeline extends EventEmitter<FullFacePipelineEvents> {
     /**
      * Receive audio chunk from network
      *
-     * Audio-first design: schedules audio immediately, LAM runs in background.
-     * This prevents LAM inference (50-300ms) from blocking audio scheduling.
+     * Audio-first design: schedules audio immediately, A2E runs in background.
+     * This prevents A2E inference (50-300ms) from blocking audio scheduling.
      *
      * @param chunk - Uint8Array containing Int16 PCM audio
      */
     onAudioChunk(chunk: Uint8Array): Promise<void>;
-    /**
-     * Get emotion frame for current animation.
-     *
-     * Priority:
-     * 1. Explicit emotion label from setEmotionLabel()
-     * 2. Prosody fallback: subtle brow movement from audio energy
-     */
-    private getEmotionFrame;
-    /**
-     * Merge LAM blendshapes with emotion upper face blendshapes
-     */
-    mergeBlendshapes(lamFrame: Float32Array, emotionFrame: EmotionFrame | null, audioEnergy?: number): {
-        merged: Float32Array;
-        emotionBlendshapes: UpperFaceBlendshapes;
-    };
     /**
      * Start frame animation loop
+     *
+     * Polls A2EProcessor at render rate (60fps) for the latest inference frame
+     * matching the current AudioContext time. Between inference batches (~30fps
+     * bursts), getFrameForTime() holds the last frame.
      */
     private startFrameLoop;
     /**
@@ -1219,17 +708,11 @@ declare class FullFacePipeline extends EventEmitter<FullFacePipelineEvents> {
     getState(): {
         playbackStarted: boolean;
         coalescerFill: number;
-        lamFill: number;
-        queuedLAMFrames: number;
-        emotionLabel: "neutral" | "happy" | "angry" | "sad" | null;
-        currentAudioEnergy: number;
+        processorFill: number;
+        queuedFrames: number;
         currentTime: number;
         playbackEndTime: number;
     };
-    /**
-     * Check if an explicit emotion label is currently set
-     */
-    get hasEmotionLabel(): boolean;
     /**
      * Cleanup resources
      */
@@ -1255,13 +738,6 @@ declare class FullFacePipeline extends EventEmitter<FullFacePipelineEvents> {
  * @module inference/onnxLoader
  */
-type OrtModule = {
-    InferenceSession: typeof InferenceSession;
-    Tensor: typeof Tensor;
-    env: Env;
-};
-type SessionOptions = InferenceSession.SessionOptions;
 /**
  * Check if WebGPU is available and likely to work
  *
@@ -1271,74 +747,6 @@ type SessionOptions = InferenceSession.SessionOptions;
  * @returns true if WebGPU is available and working
  */
 declare function isWebGPUAvailable(): Promise<boolean>;
-/**
- * Load ONNX Runtime with the specified backend
- *
- * This lazily loads the appropriate bundle:
- * - 'wasm': Loads onnxruntime-web (WASM-only, smaller)
- * - 'webgpu': Loads onnxruntime-web/webgpu (includes WebGPU + WASM fallback)
- *
- * Once loaded, the same instance is reused for all subsequent calls.
- * If you need to switch backends, you must reload the page.
- *
- * @param backend The backend to load ('webgpu' or 'wasm')
- * @returns The ONNX Runtime module
- */
-declare function getOnnxRuntime(backend: RuntimeBackend): Promise<OrtModule>;
-/**
- * Get the appropriate ONNX Runtime based on user preference
- *
- * This resolves the user's preference against platform capabilities
- * and loads the appropriate bundle.
- *
- * @param preference User's backend preference
- * @returns The ONNX Runtime module and the resolved backend
- */
-declare function getOnnxRuntimeForPreference(preference?: BackendPreference): Promise<{
-    ort: OrtModule;
-    backend: RuntimeBackend;
-}>;
-/**
- * Get session options for creating an inference session
- *
- * This returns optimized session options based on the backend and platform.
- *
- * @param backend The backend being used
- * @returns Session options for InferenceSession.create()
- */
-declare function getSessionOptions(backend: RuntimeBackend): SessionOptions;
-/**
- * Create an inference session with automatic fallback
- *
- * If WebGPU session creation fails, automatically falls back to WASM.
- *
- * @param modelBuffer The model data as ArrayBuffer
- * @param preferredBackend The preferred backend
- * @returns The created session and the backend used
- */
-declare function createSessionWithFallback(modelBuffer: ArrayBuffer, preferredBackend: RuntimeBackend): Promise<{
-    session: InferenceSession;
-    backend: RuntimeBackend;
-}>;
-/**
- * Get the currently loaded backend (if any)
- */
-declare function getLoadedBackend(): RuntimeBackend | null;
-/**
- * Check if ONNX Runtime has been loaded
- */
-declare function isOnnxRuntimeLoaded(): boolean;
-/**
- * Preload ONNX Runtime and compile the WASM binary early
- *
- * Call this before loading heavy resources (Three.js, VRM models) to ensure
- * WASM memory is allocated in a clean JS heap, reducing iOS memory pressure.
- * Uses the singleton pattern — subsequent model loading reuses this instance.
- *
- * @param preference Backend preference (default: 'auto')
- * @returns The resolved backend that was loaded
- */
-declare function preloadOnnxRuntime(preference?: BackendPreference): Promise<RuntimeBackend>;
 /**
  * SenseVoice automatic speech recognition using ONNX Runtime Web
@@ -2094,8 +1502,9 @@ interface Wav2ArkitCpuWorkerConfig {
  *
  * @see Wav2ArkitCpuInference for main-thread version
  */
-declare class Wav2ArkitCpuWorker implements LipSyncBackend {
+declare class Wav2ArkitCpuWorker implements A2EBackend {
     readonly modelId: "wav2arkit_cpu";
+    readonly chunkSize: number;
     private worker;
     private config;
     private isLoading;
@@ -2124,7 +1533,7 @@ declare class Wav2ArkitCpuWorker implements LipSyncBackend {
     /**
      * Load the ONNX model in the worker
      */
-    load(): Promise<LipSyncModelInfo>;
+    load(): Promise<A2EModelInfo>;
     /**
      * Run inference on raw audio
      *
@@ -2134,7 +1543,7 @@ declare class Wav2ArkitCpuWorker implements LipSyncBackend {
      * @param audioSamples - Float32Array of raw audio at 16kHz
      * @param _identityIndex - Ignored (identity 11 is baked into the model)
      */
-    infer(audioSamples: Float32Array, _identityIndex?: number): Promise<LipSyncResult>;
+    infer(audioSamples: Float32Array, _identityIndex?: number): Promise<A2EResult>;
     /**
      * Queue inference to serialize worker calls
      */
@@ -2166,7 +1575,7 @@ declare class Wav2ArkitCpuWorker implements LipSyncBackend {
  * await worker.init();
  *
  * const asr = createSenseVoice({ modelUrl: '...', unifiedWorker: worker });
- * const lam = createLipSync({ gpuModelUrl: '...', cpuModelUrl: '...', unifiedWorker: worker });
+ * const lam = createA2E({ gpuModelUrl: '...', cpuModelUrl: '...', unifiedWorker: worker });
  * const vad = createSileroVAD({ modelUrl: '...', unifiedWorker: worker });
  * ```
  *
@@ -2196,17 +1605,17 @@ declare class UnifiedInferenceWorker {
     }): Promise<SenseVoiceModelInfo>;
     transcribe(audio: Float32Array): Promise<SenseVoiceResult>;
     disposeSenseVoice(): Promise<void>;
-    loadLipSync(config: {
+    loadA2E(config: {
         modelUrl: string;
         externalDataUrl: string | null;
-    }): Promise<LipSyncModelInfo>;
-    inferLipSync(audio: Float32Array): Promise<{
+    }): Promise<A2EModelInfo>;
+    inferA2E(audio: Float32Array): Promise<{
         blendshapes: Float32Array;
         numFrames: number;
         numBlendshapes: number;
         inferenceTimeMs: number;
     }>;
-    disposeLipSync(): Promise<void>;
+    disposeA2E(): Promise<void>;
     loadVAD(config: {
         modelUrl: string;
         sampleRate: number;
@@ -2252,10 +1661,11 @@ declare class SenseVoiceUnifiedAdapter implements SenseVoiceBackend {
 /**
  * Wav2ArkitCpu adapter backed by UnifiedInferenceWorker
  *
- * Implements LipSyncBackend, delegating all inference to the shared worker.
+ * Implements A2EBackend, delegating all inference to the shared worker.
  */
-declare class Wav2ArkitCpuUnifiedAdapter implements LipSyncBackend {
+declare class Wav2ArkitCpuUnifiedAdapter implements A2EBackend {
     readonly modelId: "wav2arkit_cpu";
+    readonly chunkSize: number;
     private worker;
     private config;
     private _isLoaded;
@@ -2263,8 +1673,8 @@ declare class Wav2ArkitCpuUnifiedAdapter implements LipSyncBackend {
     constructor(worker: UnifiedInferenceWorker, config: Wav2ArkitCpuWorkerConfig);
     get isLoaded(): boolean;
     get backend(): RuntimeBackend | null;
-    load(): Promise<LipSyncModelInfo>;
-    infer(audioSamples: Float32Array, _identityIndex?: number): Promise<LipSyncResult>;
+    load(): Promise<A2EModelInfo>;
+    infer(audioSamples: Float32Array, _identityIndex?: number): Promise<A2EResult>;
     dispose(): Promise<void>;
 }
 /**
@@ -2392,116 +1802,6 @@ interface CreateSenseVoiceConfig {
  */
 declare function createSenseVoice(config: CreateSenseVoiceConfig): SenseVoiceBackend;
-/**
- * Kaldi-compatible filterbank (fbank) feature extraction
- *
- * Pure TypeScript implementation matching kaldi-native-fbank parameters
- * used by SenseVoice. No external dependencies.
- *
- * Pipeline: audio → framing → windowing → FFT → power spectrum → mel filterbank → log
- *
- * @module inference/kaldiFbank
- */
-interface KaldiFbankOptions {
-    /** Frame length in ms (default: 25) */
-    frameLengthMs?: number;
-    /** Frame shift in ms (default: 10) */
-    frameShiftMs?: number;
-    /** Low frequency cutoff in Hz (default: 20) */
-    lowFreq?: number;
-    /** High frequency cutoff in Hz (default: sampleRate / 2) */
-    highFreq?: number;
-    /** Dither amount (default: 0 for deterministic output) */
-    dither?: number;
-    /** Preemphasis coefficient (default: 0.97) */
-    preemphasis?: number;
-}
-/**
- * Compute Kaldi-compatible log mel filterbank features
- *
- * @param audio Raw audio samples (float32, [-1, 1] range)
- * @param sampleRate Sample rate in Hz (must be 16000 for SenseVoice)
- * @param numMelBins Number of mel bins (80 for SenseVoice)
- * @param opts Optional parameters
- * @returns Flattened Float32Array of shape [numFrames, numMelBins]
- */
-declare function computeKaldiFbank(audio: Float32Array, sampleRate: number, numMelBins: number, opts?: KaldiFbankOptions): Float32Array;
-/**
- * Apply Low Frame Rate stacking for SenseVoice
- *
- * Concatenates lfrM consecutive frames with stride lfrN.
- * Left-pads with copies of first frame, right-pads last group.
- *
- * @param features Flattened [numFrames, featureDim]
- * @param featureDim Feature dimension per frame (e.g., 80)
- * @param lfrM Number of frames to stack (default: 7)
- * @param lfrN Stride (default: 6)
- * @returns Flattened [numOutputFrames, featureDim * lfrM]
- */
-declare function applyLFR(features: Float32Array, featureDim: number, lfrM?: number, lfrN?: number): Float32Array;
-/**
- * Apply CMVN normalization in-place
- *
- * Formula: normalized[i] = (features[i] + negMean[i % dim]) * invStddev[i % dim]
- *
- * @param features Flattened feature array (modified in-place)
- * @param dim Feature dimension (560 for SenseVoice after LFR)
- * @param negMean Negative mean vector (dim-dimensional)
- * @param invStddev Inverse standard deviation vector (dim-dimensional)
- * @returns The same features array (for chaining)
- */
-declare function applyCMVN(features: Float32Array, dim: number, negMean: Float32Array, invStddev: Float32Array): Float32Array;
-/**
- * Parse CMVN vectors from comma-separated strings (stored in ONNX metadata)
- *
- * The sherpa-onnx SenseVoice export stores neg_mean and inv_stddev
- * as comma-separated float strings in the model's metadata.
- */
-declare function parseCMVNFromMetadata(negMeanStr: string, invStddevStr: string): {
-    negMean: Float32Array;
-    invStddev: Float32Array;
-};
-/**
- * CTC greedy decoder for SenseVoice
- *
- * Decodes CTC logits into text with structured token parsing
- * for language, emotion, and audio event detection.
- *
- * @module inference/ctcDecoder
- */
-interface CTCDecodeResult {
-    /** Decoded text (speech content only) */
-    text: string;
-    /** Detected language (e.g., 'zh', 'en', 'ja', 'ko', 'yue') */
-    language?: string;
-    /** Detected emotion (e.g., 'HAPPY', 'SAD', 'ANGRY', 'NEUTRAL') */
-    emotion?: string;
-    /** Detected audio event (e.g., 'Speech', 'BGM', 'Laughter') */
-    event?: string;
-}
-/** Resolve language string to SenseVoice language ID */
-declare function resolveLanguageId(language: string): number;
-/** Resolve text norm string to SenseVoice text norm ID */
-declare function resolveTextNormId(textNorm: string): number;
-/**
- * Parse tokens.txt into a token ID → string map
- *
- * Format: each line is "token_string token_id"
- * e.g., "<unk> 0", "▁the 3", "s 4"
- */
-declare function parseTokensFile(content: string): Map<number, string>;
-/**
- * CTC greedy decode
- *
- * @param logits Raw logits from model output, flattened [seqLen, vocabSize]
- * @param seqLen Sequence length (time steps)
- * @param vocabSize Vocabulary size
- * @param tokenMap Token ID → string map from tokens.txt
- * @returns Decoded text and structured metadata
- */
-declare function ctcGreedyDecode(logits: Float32Array, seqLen: number, vocabSize: number, tokenMap: Map<number, string>): CTCDecodeResult;
 /**
  * Shared blendshape constants and utilities for lip sync inference
  *
@@ -2521,26 +1821,18 @@ declare const LAM_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browI
 /** Alias for backwards compatibility */
 declare const ARKIT_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
 /**
- * Symmetrize blendshapes by averaging left/right pairs
- * From LAM official postprocessing (models/utils.py)
- * This fixes asymmetric output from the raw model
- */
-declare function symmetrizeBlendshapes(frame: Float32Array): Float32Array;
-/**
- * wav2arkit_cpu model blendshape ordering
+ * Linearly interpolate between two blendshape weight arrays.
  *
- * Indices 0-24 match LAM_BLENDSHAPES, but 25+ diverge:
- * - LAM puts jawRight, mouthClose, mouthDimpleLeft, mouthDimpleRight at 25-28
- * - wav2arkit_cpu puts mouthFrownLeft at 25 and moves those four to 48-51
- */
-declare const WAV2ARKIT_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "jawRight"];
-/**
- * Remap a blendshape frame from wav2arkit_cpu ordering to LAM_BLENDSHAPES ordering
+ * Pure math utility with zero renderer dependency — used by all renderer
+ * adapters (@omote/three, @omote/babylon, @omote/r3f) for smooth frame
+ * transitions.
  *
- * @param frame - Float32Array of 52 blendshape values in wav2arkit_cpu order
- * @returns Float32Array of 52 blendshape values in LAM_BLENDSHAPES order
+ * @param current - Current blendshape weights
+ * @param target  - Target blendshape weights
+ * @param factor  - Interpolation factor (0 = no change, 1 = snap to target). Default: 0.3
+ * @returns Interpolated weights as number[]
  */
-declare function remapWav2ArkitToLam(frame: Float32Array): Float32Array;
+declare function lerpBlendshapes(current: Float32Array | number[], target: Float32Array | number[], factor?: number): number[];
 /**
  * Unified Wav2Vec2 inference engine for Audio-to-Expression + ASR
@@ -2582,6 +1874,12 @@ interface Wav2Vec2InferenceConfig {
     backend?: InferenceBackend;
     /** Number of identity classes (default: 12 for streaming model) */
     numIdentityClasses?: number;
+    /**
+     * Number of audio samples per inference chunk (default: 16000).
+     * The model supports variable chunk sizes. Smaller chunks mean lower latency
+     * but more inference overhead; 8000 (500ms at 16kHz) is recommended for real-time lip sync.
+     */
+    chunkSize?: number;
 }
 interface ModelInfo {
     backend: 'webgpu' | 'wasm';
@@ -2608,7 +1906,7 @@ interface Wav2Vec2Result {
     /** Inference time in ms */
     inferenceTimeMs: number;
 }
-declare class Wav2Vec2Inference implements LipSyncBackend {
+declare class Wav2Vec2Inference implements A2EBackend {
     readonly modelId: "wav2vec2";
     private session;
     private ort;
@@ -2616,6 +1914,7 @@ declare class Wav2Vec2Inference implements LipSyncBackend {
     private _backend;
     private isLoading;
     private numIdentityClasses;
+    readonly chunkSize: number;
     private inferenceQueue;
     private poisoned;
     private static readonly INFERENCE_TIMEOUT_MS;
@@ -2635,11 +1934,10 @@ declare class Wav2Vec2Inference implements LipSyncBackend {
     load(): Promise<ModelInfo>;
     /**
      * Run inference on raw audio
-     * @param audioSamples - Float32Array of raw audio at 16kHz (16000 samples = 1 second)
+     * @param audioSamples - Float32Array of raw audio at 16kHz
      * @param identityIndex - Optional identity index (0-11, default 0 = neutral)
      *
-     * Note: Model expects 1-second chunks (16000 samples) for optimal performance.
-     * Audio will be zero-padded or truncated to 16000 samples.
+     * Audio will be zero-padded or truncated to chunkSize samples.
      */
     infer(audioSamples: Float32Array, identityIndex?: number): Promise<Wav2Vec2Result>;
     /**
@@ -2707,8 +2005,9 @@ interface Wav2ArkitCpuConfig {
     /** Preferred backend (default: 'wasm' — this model is CPU-optimized) */
     backend?: BackendPreference;
 }
-declare class Wav2ArkitCpuInference implements LipSyncBackend {
+declare class Wav2ArkitCpuInference implements A2EBackend {
     readonly modelId: "wav2arkit_cpu";
+    readonly chunkSize: number;
     private session;
     private ort;
     private config;
@@ -2723,7 +2022,7 @@ declare class Wav2ArkitCpuInference implements LipSyncBackend {
     /**
      * Load the ONNX model
      */
-    load(): Promise<LipSyncModelInfo>;
+    load(): Promise<A2EModelInfo>;
     /**
      * Run inference on raw audio
      *
@@ -2733,7 +2032,7 @@ declare class Wav2ArkitCpuInference implements LipSyncBackend {
      * @param audioSamples - Float32Array of raw audio at 16kHz
      * @param _identityIndex - Ignored (identity 11 is baked into the model)
      */
-    infer(audioSamples: Float32Array, _identityIndex?: number): Promise<LipSyncResult>;
+    infer(audioSamples: Float32Array, _identityIndex?: number): Promise<A2EResult>;
     /**
      * Queue inference to serialize ONNX session calls
      */
@@ -2745,7 +2044,7 @@ declare class Wav2ArkitCpuInference implements LipSyncBackend {
 }
 /**
- * Factory function for lip sync with automatic GPU/CPU model selection
+ * Factory function for A2E with automatic GPU/CPU model selection
  *
  * Provides a unified API that automatically selects the optimal model:
  * - Safari (macOS + iOS): Uses Wav2ArkitCpuInference (404MB, WASM)
@@ -2766,20 +2065,20 @@ declare class Wav2ArkitCpuInference implements LipSyncBackend {
  *
  * @example Auto-detect (recommended)
  * ```typescript
- * import { createLipSync } from '@omote/core';
+ * import { createA2E } from '@omote/core';
  *
- * const lam = createLipSync({
+ * const a2e = createA2E({
  *   gpuModelUrl: '/models/unified_wav2vec2_asr_a2e.onnx',
  *   cpuModelUrl: '/models/wav2arkit_cpu.onnx',
  * });
  *
- * await lam.load();
- * const { blendshapes } = await lam.infer(audioSamples);
+ * await a2e.load();
+ * const { blendshapes } = await a2e.infer(audioSamples);
  * ```
  *
  * @example Force CPU model
  * ```typescript
- * const lam = createLipSync({
+ * const a2e = createA2E({
  *   gpuModelUrl: '/models/unified_wav2vec2_asr_a2e.onnx',
  *   cpuModelUrl: '/models/wav2arkit_cpu.onnx',
  *   mode: 'cpu',
@@ -2788,9 +2087,9 @@ declare class Wav2ArkitCpuInference implements LipSyncBackend {
  */
 /**
- * Configuration for the lip sync factory
+ * Configuration for the A2E factory
  */
-interface CreateLipSyncConfig {
+interface CreateA2EConfig {
     /** URL for the GPU model (Wav2Vec2, used on Chrome/Firefox/Edge) */
     gpuModelUrl: string;
     /**
@@ -2804,7 +2103,7 @@ interface CreateLipSyncConfig {
     cpuModelUrl: string;
     /**
      * Model selection mode:
-     * - 'auto': Safari/iOS → CPU, everything else → GPU (default)
+     * - 'auto': Safari/iOS -> CPU, everything else -> GPU (default)
      * - 'gpu': Force GPU model (Wav2Vec2Inference)
      * - 'cpu': Force CPU model (Wav2ArkitCpuInference)
      */
@@ -2838,12 +2137,322 @@ interface CreateLipSyncConfig {
     unifiedWorker?: UnifiedInferenceWorker;
 }
 /**
- * Create a lip sync instance with automatic GPU/CPU model selection
+ * Create an A2E instance with automatic GPU/CPU model selection
  *
  * @param config - Factory configuration
- * @returns A LipSyncBackend instance (either GPU or CPU model)
+ * @returns An A2EBackend instance (either GPU or CPU model)
+ */
+declare function createA2E(config: CreateA2EConfig): A2EBackend;
+/**
+ * A2EProcessor — Engine-agnostic audio-to-expression processor
+ *
+ * The core inference primitive: audio samples in → blendshape frames out.
+ * No mic capture, no audio playback, no Web Audio API.
+ *
+ * This is what Unity/Unreal/Godot/any engine would use directly.
+ * Web-specific concerns (mic, AudioContext, scheduling) live in the
+ * orchestrator and pipeline layers above.
+ *
+ * Two output modes:
+ * - **Pull mode**: `pushAudio(samples, timestamp)` + `getFrameForTime(t)`
+ *   For TTS playback where frames are synced to AudioContext clock.
+ * - **Push mode**: `pushAudio(samples)` + `startDrip()` + `latestFrame`
+ *   For live mic / game loop where frames are consumed at ~30fps.
+ *
+ * @category Inference
+ *
+ * @example Pull mode (TTS playback)
+ * ```typescript
+ * const processor = new A2EProcessor({ backend: a2e });
+ * processor.pushAudio(samples, audioContext.currentTime + delay);
+ * const frame = processor.getFrameForTime(audioContext.currentTime);
+ * ```
+ *
+ * @example Push mode (live mic)
+ * ```typescript
+ * const processor = new A2EProcessor({
+ *   backend: a2e,
+ *   onFrame: (frame) => applyToAvatar(frame),
+ * });
+ * processor.startDrip();
+ * processor.pushAudio(micSamples); // no timestamp → drip mode
+ * ```
+ */
+interface A2EProcessorConfig {
+    /** Inference backend */
+    backend: A2EBackend;
+    /** Sample rate (default: 16000) */
+    sampleRate?: number;
+    /** Samples per inference chunk (default: 16000 = 1s) */
+    chunkSize?: number;
+    /** Callback fired with each blendshape frame (push mode) */
+    onFrame?: (frame: Float32Array) => void;
+    /** Error callback */
+    onError?: (error: Error) => void;
+}
+declare class A2EProcessor {
+    private readonly backend;
+    private readonly sampleRate;
+    private readonly chunkSize;
+    private readonly onFrame?;
+    private readonly onError?;
+    private bufferCapacity;
+    private buffer;
+    private writeOffset;
+    private bufferStartTime;
+    private timestampedQueue;
+    private plainQueue;
+    private _latestFrame;
+    private dripInterval;
+    private lastPulledFrame;
+    private inferenceRunning;
+    private pendingChunks;
+    private getFrameCallCount;
+    private disposed;
+    constructor(config: A2EProcessorConfig);
+    /**
+     * Push audio samples for inference (any source: mic, TTS, file).
+     *
+     * - With `timestamp`: frames stored with timestamps (pull mode)
+     * - Without `timestamp`: frames stored in plain queue (drip/push mode)
+     *
+     * Fire-and-forget: returns immediately, inference runs async.
+     */
+    pushAudio(samples: Float32Array, timestamp?: number): void;
+    /**
+     * Flush remaining buffered audio (pads to chunkSize).
+     * Call at end of stream to process final partial chunk.
+     *
+     * Routes through the serialized pendingChunks pipeline to maintain
+     * correct frame ordering. Without this, flush() could push frames
+     * with the latest timestamp to the queue before drainPendingChunks()
+     * finishes pushing frames with earlier timestamps — causing
+     * getFrameForTime() to see out-of-order timestamps and stall.
+     */
+    flush(): Promise<void>;
+    /**
+     * Reset buffer and frame queues
+     */
+    reset(): void;
+    /**
+     * Get frame synced to external clock (e.g. AudioContext.currentTime).
+     *
+     * Discards frames that are too old, returns the current frame,
+     * or holds last frame as fallback to prevent avatar freezing.
+     *
+     * @param currentTime - Current playback time (seconds)
+     * @returns Blendshape frame, or null if no frames yet
+     */
+    getFrameForTime(currentTime: number): Float32Array | null;
+    /** Latest frame from drip-feed (live mic, game loop) */
+    get latestFrame(): Float32Array | null;
+    /** Start 30fps drip-feed timer (push mode) */
+    startDrip(): void;
+    /** Stop drip-feed timer */
+    stopDrip(): void;
+    /** Number of frames waiting in queue (both modes combined) */
+    get queuedFrameCount(): number;
+    /** Buffer fill level as fraction of chunkSize (0-1) */
+    get fillLevel(): number;
+    /** Dispose resources */
+    dispose(): void;
+    /**
+     * Process pending chunks sequentially.
+     * Fire-and-forget — called from pushAudio() without awaiting.
+     */
+    private drainPendingChunks;
+    private handleError;
+}
+/**
+ * BlendshapeSmoother — Per-channel critically damped spring for 52 ARKit blendshapes
+ *
+ * Eliminates frame gaps between inference batches by smoothly interpolating
+ * blendshape weights using critically damped springs (the game industry standard).
+ *
+ * Each of the 52 blendshape channels has its own spring with position + velocity
+ * state. When a new inference frame arrives, spring targets are updated. Between
+ * frames, springs continue converging toward the last target — no frozen face.
+ *
+ * When inference stalls, `decayToNeutral()` sets all targets to 0, and the
+ * springs smoothly close the mouth / relax the face at a rate set by the halflife.
+ *
+ * Math from Daniel Holden's "Spring-It-On" (Epic Games):
+ * https://theorangeduck.com/page/spring-roll-call
+ *
+ * @category Inference
+ *
+ * @example Basic usage
+ * ```typescript
+ * const smoother = new BlendshapeSmoother({ halflife: 0.06 });
+ *
+ * // In frame loop (60fps):
+ * smoother.setTarget(inferenceFrame);        // when new frame arrives
+ * const smoothed = smoother.update(1/60);    // every render frame
+ * applyToAvatar(smoothed);
+ * ```
  */
-declare function createLipSync(config: CreateLipSyncConfig): LipSyncBackend;
+interface BlendshapeSmootherConfig {
+    /**
+     * Spring halflife in seconds — time for the distance to the target
+     * to reduce by half. Lower = snappier, higher = smoother.
+     *
+     * - 0.04s (40ms): Very snappy, slight jitter on fast transitions
+     * - 0.06s (60ms): Sweet spot for lip sync (default)
+     * - 0.10s (100ms): Very smooth, slight lag on fast consonants
+     * - 0: Bypass mode — passes through raw target values (no smoothing)
+     *
+     * Default: 0.06
+     */
+    halflife?: number;
+}
+declare class BlendshapeSmoother {
+    private readonly halflife; // spring halflife in seconds — presumably resolved from `config.halflife` (TODO confirm)
+    /** Current smoothed blendshape values */
+    private values;
+    /** Per-channel spring velocities */
+    private velocities;
+    /** Current spring targets (from latest inference frame) */
+    private targets;
+    /** Whether any target has been set */
+    private _hasTarget;
+    constructor(config?: BlendshapeSmootherConfig); // `config` is optional; see BlendshapeSmootherConfig for the documented halflife default
+    /** Whether a target frame has been set (false until first setTarget call) */
+    get hasTarget(): boolean;
+    /**
+     * Set new target frame from inference output.
+     * Springs will converge toward these values on subsequent update() calls;
+     * calling this alone does not change the smoothed output.
+     *
+     * NOTE(review): presumably `frame` must hold 52 values to match the frame
+     * returned by update() — confirm.
+     * NOTE(review): not visible here whether `frame` is copied or retained by
+     * reference — confirm before reusing the caller's buffer.
+     *
+     * @param frame - Target blendshape values from the latest inference frame
+     */
+    setTarget(frame: Float32Array): void;
+    /**
+     * Advance all 52 springs by `dt` seconds and return the smoothed frame.
+     *
+     * Call this every render frame (e.g., inside requestAnimationFrame).
+     * Returns the internal values buffer — do NOT mutate the returned array.
+     * Copy it if a frame has to be kept (presumably the same buffer is
+     * rewritten by the next update() call — confirm).
+     *
+     * @param dt - Time step in seconds (e.g., 1/60 for 60fps)
+     * @returns Smoothed blendshape values (Float32Array of 52)
+     */
+    update(dt: number): Float32Array;
+    /**
+     * Decay all spring targets to neutral (0).
+     *
+     * Call when inference stalls (no new frames for threshold duration).
+     * The springs will smoothly close the mouth / relax the face rather than
+     * freezing. The halflife is the time for the distance to the target to
+     * halve, so the face keeps easing toward neutral over several halflives
+     * of subsequent update() calls rather than finishing within one.
+     */
+    decayToNeutral(): void;
+    /**
+     * Reset all state (values, velocities, targets).
+     * Call when starting a new playback session.
+     *
+     * NOTE(review): presumably `hasTarget` also returns to false — confirm.
+     */
+    reset(): void;
+}
+/**
+ * Renderer-agnostic A2E (audio-to-expression) orchestrator
+ *
+ * Manages the mic capture + A2E inference loop independently of any
+ * 3D renderer. Adapter packages (@omote/three, @omote/babylon) wrap this
+ * thinly and pipe `latestWeights` into their renderer-specific blendshape
+ * controllers.
+ *
+ * Internally delegates all buffer accumulation, inference, and frame
+ * drip-feeding to {@link A2EProcessor}. This class only handles mic capture
+ * (getUserMedia, ScriptProcessorNode, resampling).
+ *
+ * @category Inference
+ */
+/**
+ * Progress event emitted during model download / compile.
+ *
+ * `phase` names the stage being reported ('download' or 'compile') and
+ * `progress` carries that stage's numeric progress.
+ * NOTE(review): the scale of `progress` (0–1 vs 0–100) is not visible in this
+ * declaration — confirm before rendering it.
+ */
+interface A2EProgressEvent {
+    phase: 'download' | 'compile';
+    progress: number;
+}
+/**
+ * Configuration for the A2EOrchestrator.
+ *
+ * Only `gpuModelUrl` is required; every other field is optional.
+ */
+interface A2EOrchestratorConfig {
+    /** URL for the GPU model (Wav2Vec2, Chrome/Firefox/Edge) */
+    gpuModelUrl: string;
+    /**
+     * URL for GPU model external data file.
+     * NOTE(review): `false` is also accepted — presumably to disable loading an
+     * external data file; confirm against createA2E.
+     */
+    gpuExternalDataUrl?: string | false;
+    /** URL for the CPU model (wav2arkit_cpu, Safari/iOS) */
+    cpuModelUrl?: string;
+    /** Sample rate in Hz for mic capture (default: 16000) */
+    sampleRate?: number;
+    /** Chunk size in samples for mic capture (default: 16000 = 1s at 16kHz) */
+    chunkSize?: number;
+    /**
+     * Callback fired with new blendshape weights after each inference.
+     * NOTE(review): whether `weights` is a fresh array or a reused buffer is not
+     * visible here — copy it if it must be retained; confirm.
+     */
+    onFrame?: (weights: Float32Array) => void;
+    /** Callback fired during model loading progress (download / compile phases) */
+    onProgress?: (event: A2EProgressEvent) => void;
+    /** Callback fired on error */
+    onError?: (error: Error) => void;
+    /** Callback fired when model is loaded and ready */
+    onReady?: () => void;
+    /** Additional createA2E config options (any subset of CreateA2EConfig) */
+    a2eConfig?: Partial<CreateA2EConfig>;
+}
+/**
+ * Renderer-agnostic A2E orchestrator.
+ *
+ * Manages mic capture + delegates inference to {@link A2EProcessor}.
+ * Adapters read `latestWeights` each frame to apply to their meshes.
+ * The call order shown in the example is construct → load() → start();
+ * stop() ends capture and dispose() releases resources.
+ *
+ * @example Quick start (used by @omote/three and @omote/babylon adapters)
+ * ```typescript
+ * const orchestrator = new A2EOrchestrator({
+ *   gpuModelUrl: '/models/wav2vec2.onnx',
+ *   cpuModelUrl: '/models/wav2arkit_cpu.onnx',
+ *   onFrame: (weights) => controller.update(weights),
+ * });
+ * await orchestrator.load();
+ * await orchestrator.start();
+ * ```
+ */
+declare class A2EOrchestrator {
+    private config;
+    private a2e;
+    private processor;
+    private stream;
+    private audioContext;
+    private scriptProcessor;
+    private nativeSampleRate;
+    private _isReady;
+    private _isStreaming;
+    private _backend;
+    private disposed;
+    constructor(config: A2EOrchestratorConfig);
+    /** Latest blendshape weights from inference (null if none yet) */
+    get latestWeights(): Float32Array | null;
+    /** Whether the model is loaded and ready for inference */
+    get isReady(): boolean;
+    /** Whether mic is active and inference loop is running */
+    get isStreaming(): boolean;
+    /** Current backend type (webgpu, wasm, or null) — presumably null until load() has picked one (confirm) */
+    get backend(): string | null;
+    /**
+     * Load the A2E model and create the processor.
+     *
+     * Asynchronous: the returned promise should be awaited before start(),
+     * as in the class example.
+     * NOTE(review): presumably reports through the config's onProgress /
+     * onReady / onError callbacks — confirm.
+     */
+    load(): Promise<void>;
+    /**
+     * Start mic capture and inference loop.
+     *
+     * NOTE(review): behavior when called before load() has completed (throw
+     * vs. no-op) is not visible in this declaration — confirm.
+     */
+    start(): Promise<void>;
+    /**
+     * Stop mic capture and inference loop.
+     * Synchronous, unlike load(), start() and dispose().
+     */
+    stop(): void;
+    /**
+     * Dispose of all resources.
+     *
+     * Asynchronous — await it before dropping the instance.
+     * NOTE(review): a private `disposed` flag exists, so the instance is
+     * presumably unusable afterwards — confirm.
+     */
+    dispose(): Promise<void>;
+}
 /**
  * Safari Web Speech API wrapper for iOS speech recognition
@@ -5190,4 +4799,4 @@ declare class ProceduralLifeLayer {
     private updateBrowNoise;
 }
-export { type AIAdapter, type AIAdapterEvents, AISessionState, ARKIT_BLENDSHAPES, type ActiveSpan, AgentCoreAdapter, type AgentCoreConfig, type AnimationClip, AnimationEvent, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, type AudioSyncConfig, type AudioSyncEvents, AudioSyncManager, type BackendPreference, type BlendWeight, type CTCDecodeResult, CTC_VOCAB, type CacheConfig, type CacheSpanAttributes, ConsoleExporter, type ConversationMessage, ConversationOrchestrator, type ConversationSession, type CreateLipSyncConfig, type CreateSenseVoiceConfig, DEFAULT_ANIMATION_CONFIG, EMOTION_ARKIT_MAP, EMOTION_NAMES, EMOTION_VECTOR_SIZE, type Emotion2VecLabel, type EmotionAnimationMap, type EmotionBlendMode, type EmotionBlendshapeConfig, EmotionController, type EmotionFrame, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, EmotionToBlendshapeMapper, type EmotionWeights, EmphasisDetector, EventEmitter, type FetchWithCacheOptions, type FullFaceFrame, FullFacePipeline, type FullFacePipelineEvents, type FullFacePipelineOptions, INFERENCE_LATENCY_BUCKETS, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, type KaldiFbankOptions, type LAMFrame, LAMPipeline, type LAMPipelineOptions, LAM_BLENDSHAPES, type LifeLayerConfig, type LifeLayerInput, type LifeLayerOutput, type LipSyncBackend, type LipSyncModelInfo, type LipSyncResult, MODEL_LOAD_TIME_BUCKETS, type MessageRole, type MetricData, MetricNames, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, type OrchestratorConfig, type OrchestratorEvents, ProceduralLifeLayer, type QuotaInfo, RingBuffer, type 
RuntimeBackend, type SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SenseVoiceBackend, type SenseVoiceConfig, SenseVoiceInference, type SenseVoiceLanguage, type SenseVoiceModelInfo, type SenseVoiceResult, SenseVoiceUnifiedAdapter, SenseVoiceWorker, type SenseVoiceWorkerConfig, type SessionConfig, type SessionOptions, type SessionSnapshot, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADUnifiedAdapter, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, SyncedAudioPipeline, type SyncedAudioPipelineEvents, type SyncedAudioPipelineOptions, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TenantConfig, TenantManager, type TenantQuota, type TenantUsage, type TokenRefreshCallback, type Transition, UPPER_FACE_BLENDSHAPES, UnifiedInferenceWorker, type UpperFaceBlendshapeName, type UpperFaceBlendshapes, type VADBackend, type VADModelInfo, type VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, type VoiceConfig, WAV2ARKIT_BLENDSHAPES, type Wav2ArkitCpuConfig, Wav2ArkitCpuInference, Wav2ArkitCpuUnifiedAdapter, Wav2ArkitCpuWorker, type Wav2ArkitCpuWorkerConfig, Wav2Vec2Inference, type Wav2Vec2InferenceConfig, type Wav2Vec2Result, applyCMVN, applyLFR, blendEmotions, calculatePeak, calculateRMS, computeKaldiFbank, configureCacheLimit, configureTelemetry, createEmotionVector, createLipSync, createSenseVoice, createSessionWithFallback, createSileroVAD, ctcGreedyDecode, fetchWithCache, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getLoadedBackend, getModelCache, getOnnxRuntime, getOnnxRuntimeForPreference, getOptimalWasmThreads, getRecommendedBackend, getSessionOptions, getTelemetry, hasWebGPUApi, isAndroid, isIOS, isIOSSafari, isMobile, isOnnxRuntimeLoaded, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, 
lerpEmotion, parseCMVNFromMetadata, parseTokensFile, preloadModels, preloadOnnxRuntime, remapWav2ArkitToLam, resolveBackend, resolveLanguageId, resolveTextNormId, shouldEnableWasmProxy, shouldUseCpuLipSync, shouldUseNativeASR, shouldUseServerLipSync, supportsVADWorker, symmetrizeBlendshapes };
+export { type A2EBackend, type A2EModelInfo, A2EOrchestrator, type A2EOrchestratorConfig, A2EProcessor, type A2EProcessorConfig, type A2EProgressEvent, type A2EResult, type AIAdapter, type AIAdapterEvents, AISessionState, ARKIT_BLENDSHAPES, type ActiveSpan, AgentCoreAdapter, type AgentCoreConfig, type AnimationClip, AnimationEvent, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, type AudioSyncConfig, type AudioSyncEvents, AudioSyncManager, BLENDSHAPE_TO_GROUP, type BackendPreference, type BlendWeight, type BlendshapeGroup, BlendshapeSmoother, type BlendshapeSmootherConfig, CTC_VOCAB, type CacheConfig, type CacheSpanAttributes, ConsoleExporter, type ConversationMessage, ConversationOrchestrator, type ConversationSession, type CreateA2EConfig, type CreateSenseVoiceConfig, DEFAULT_ANIMATION_CONFIG, EMOTION_NAMES, EMOTION_VECTOR_SIZE, type EmotionAnimationMap, EmotionController, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, type EmotionWeights, EmphasisDetector, EventEmitter, type ExpressionProfile, type FetchWithCacheOptions, type FullFaceFrame, FullFacePipeline, type FullFacePipelineEvents, type FullFacePipelineOptions, INFERENCE_LATENCY_BUCKETS, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, LAM_BLENDSHAPES, type LifeLayerConfig, type LifeLayerInput, type LifeLayerOutput, MODEL_LOAD_TIME_BUCKETS, type MessageRole, type MetricData, MetricNames, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, type OrchestratorConfig, type OrchestratorEvents, ProceduralLifeLayer, type QuotaInfo, RingBuffer, type RuntimeBackend, type 
SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SenseVoiceBackend, type SenseVoiceConfig, SenseVoiceInference, type SenseVoiceLanguage, type SenseVoiceModelInfo, type SenseVoiceResult, SenseVoiceUnifiedAdapter, SenseVoiceWorker, type SenseVoiceWorkerConfig, type SessionConfig, type SessionSnapshot, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADUnifiedAdapter, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TenantConfig, TenantManager, type TenantQuota, type TenantUsage, type TokenRefreshCallback, type Transition, UnifiedInferenceWorker, type VADBackend, type VADModelInfo, type VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, type VoiceConfig, type Wav2ArkitCpuConfig, Wav2ArkitCpuInference, Wav2ArkitCpuUnifiedAdapter, Wav2ArkitCpuWorker, type Wav2ArkitCpuWorkerConfig, Wav2Vec2Inference, type Wav2Vec2InferenceConfig, type Wav2Vec2Result, blendEmotions, calculatePeak, calculateRMS, configureCacheLimit, configureTelemetry, createA2E, createEmotionVector, createSenseVoice, createSileroVAD, fetchWithCache, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getModelCache, getOptimalWasmThreads, getRecommendedBackend, getTelemetry, hasWebGPUApi, isAndroid, isIOS, isIOSSafari, isMobile, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpBlendshapes, lerpEmotion, preloadModels, resolveBackend, shouldEnableWasmProxy, shouldUseCpuA2E, shouldUseNativeASR, shouldUseServerA2E, supportsVADWorker };