npm - @omote/core - Versions diffs - 0.9.6 → 0.10.5 - Mend

@omote/core 0.9.6 → 0.10.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28)

package/README.md +1 -1
package/dist/{chunk-Y3DTP5P3.mjs → chunk-VSYYT4HO.mjs} +1 -1
package/dist/{chunk-X5OTUOE6.mjs.map → chunk-VSYYT4HO.mjs.map} +1 -1
package/dist/index.d.mts +268 -1305
package/dist/index.d.ts +268 -1305
package/dist/index.js +6417 -11038
package/dist/index.js.map +1 -1
package/dist/index.mjs +6416 -11037
package/dist/index.mjs.map +1 -1
package/dist/logging/index.js.map +1 -1
package/dist/logging/index.mjs +1 -1
package/package.json +1 -2
package/dist/Logger-BeUI6jG7.d.mts +0 -145
package/dist/Logger-BeUI6jG7.d.ts +0 -145
package/dist/Logger-DSoGAYJu.d.mts +0 -141
package/dist/Logger-DSoGAYJu.d.ts +0 -141
package/dist/chunk-3NDJA3I4.mjs +0 -853
package/dist/chunk-3NDJA3I4.mjs.map +0 -1
package/dist/chunk-CYBTTLG7.mjs +0 -927
package/dist/chunk-CYBTTLG7.mjs.map +0 -1
package/dist/chunk-ESU52TDS.mjs +0 -287
package/dist/chunk-ESU52TDS.mjs.map +0 -1
package/dist/chunk-MXKJOF4I.mjs +0 -38
package/dist/chunk-MXKJOF4I.mjs.map +0 -1
package/dist/chunk-X5OTUOE6.mjs +0 -927
package/dist/chunk-XK22BRG4.mjs +0 -38
package/dist/chunk-XK22BRG4.mjs.map +0 -1
package/dist/chunk-Y3DTP5P3.mjs.map +0 -1

package/dist/index.d.mts CHANGED

@@ -470,7 +470,7 @@ declare function shouldUseServerA2E(): boolean;
 /**
  * Common interface for audio-to-expression (A2E) inference backends
  *
- * Implemented by A2EInference and A2EUnifiedAdapter, allowing PlaybackPipeline
+ * Implemented by A2EUnifiedAdapter, allowing PlaybackPipeline
  * and A2EProcessor to work with either implementation transparently.
  *
  * @category Inference
@@ -488,11 +488,11 @@ interface A2EModelInfo {
 /**
  * Result from A2E inference
  *
- * All implementations must return blendshapes in LAM_BLENDSHAPES order (alphabetical).
+ * All implementations must return blendshapes in ARKIT_BLENDSHAPES order (alphabetical).
  * Models with different native orderings must remap internally before returning.
  */
 interface A2EResult {
-    /** Blendshape weights [frames, 52] in LAM_BLENDSHAPES order - 30fps */
+    /** Blendshape weights [frames, 52] in ARKIT_BLENDSHAPES order - 30fps */
     blendshapes: Float32Array[];
     /** Number of blendshape frames */
     numFrames: number;
@@ -507,10 +507,8 @@ interface A2EResult {
  * pipeline — A2E is the interface abstraction, LAM is the model.
  *
  * Implemented by:
- * - {@link A2EInference} (WebGPU/WASM, 192MB fp16)
- * - A2EUnifiedAdapter (shared unified worker)
+ * - {@link A2EUnifiedAdapter} (shared unified worker)
  *
- * @see {@link A2EInference} for direct usage
  * @see {@link createA2E} for the recommended factory API
  */
 interface A2EBackend {
@@ -531,7 +529,7 @@ interface A2EBackend {
      * Run inference on raw audio
      * @param audioSamples - Float32Array of raw audio at 16kHz
      * @param identityIndex - Optional identity index (ignored by CPU model)
-     * @returns A2E result with blendshapes in LAM_BLENDSHAPES order
+     * @returns A2E result with blendshapes in ARKIT_BLENDSHAPES order
      */
     infer(audioSamples: Float32Array, identityIndex?: number): Promise<A2EResult>;
     /**
@@ -544,7 +542,7 @@ interface A2EBackend {
  * ExpressionProfile - Per-character weight scaling for A2E blendshape output
  *
  * Maps blendshape groups (eyes, brows, jaw, mouth, cheeks, nose, tongue)
- * to weight scalers. Used by PlaybackPipeline, MicLipSync, and VoicePipeline.
+ * to weight scalers. Used by PlaybackPipeline, MicLipSync, and VoiceOrchestrator.
  *
  * @category Audio
  */
@@ -575,7 +573,7 @@ interface ExpressionProfile {
     overrides?: Partial<Record<string, number>>;
 }
 /**
- * Map each LAM_BLENDSHAPES entry to its BlendshapeGroup.
+ * Map each ARKIT_BLENDSHAPES entry to its BlendshapeGroup.
  * Built once at module load from prefix matching.
  */
 declare const BLENDSHAPE_TO_GROUP: Map<string, BlendshapeGroup>;
@@ -678,6 +676,13 @@ declare class PlaybackPipeline extends EventEmitter<PlaybackPipelineEvents> {
     private neutralTransitionFrame;
     private neutralTransitionStart;
     private neutralAnimationId;
+    private static readonly RAMP_IN_HALFLIFE;
+    private static readonly RAMP_IN_DURATION_MS;
+    private rampInSmoother;
+    private rampInActive;
+    private rampInLastTime;
+    private rampInStartTime;
+    private readonly _rampInBuffer;
     private _currentFrame;
     private _currentRawFrame;
     private _emotion;
@@ -691,6 +696,8 @@ declare class PlaybackPipeline extends EventEmitter<PlaybackPipelineEvents> {
     constructor(config: PlaybackPipelineConfig);
     /** Initialize AudioContext (lazy, call after user gesture) */
     initialize(): Promise<void>;
+    /** Eagerly create AudioContext. Call from user gesture for iOS. */
+    warmup(): Promise<void>;
     /** Update ExpressionProfile at runtime */
     setProfile(profile: ExpressionProfile): void;
     /** Set the emotion label to include in emitted frames */
@@ -737,7 +744,7 @@ declare class PlaybackPipeline extends EventEmitter<PlaybackPipelineEvents> {
  * TTSBackend — Streaming text-to-speech backend interface.
  *
  * Any TTS engine (Kokoro, ElevenLabs, etc.) can implement this contract
- * to integrate with TTSPlayback and VoicePipeline.
+ * to integrate with TTSPlayback and VoiceOrchestrator.
  *
  * @category Inference
  */
@@ -781,6 +788,10 @@ interface TTSStreamOptions {
     voice?: string;
     /** Speed multiplier override per-call */
     speed?: number;
+    /** Language override per-call (e.g. 'en-us', 'ja'). Default: derived from voice name. */
+    language?: string;
+    /** When true, emit the entire text as a single chunk (no sentence splitting). */
+    singleShot?: boolean;
 }
 /**
  * A single chunk of TTS audio output
@@ -856,7 +867,11 @@ declare class TTSPlayback extends EventEmitter<TTSPlaybackEvents> {
     speak(text: string, options?: {
         signal?: AbortSignal;
         voice?: string;
+        speed?: number;
+        language?: string;
     }): Promise<void>;
+    /** Eagerly create AudioContext. Call from user gesture for iOS. */
+    warmup(): Promise<void>;
     /** Dispose of all resources. */
     dispose(): Promise<void>;
     private speakWithPrefetch;
@@ -893,34 +908,9 @@ declare class TTSPlayback extends EventEmitter<TTSPlaybackEvents> {
 declare function isWebGPUAvailable(): Promise<boolean>;
 /**
- * SenseVoice automatic speech recognition using ONNX Runtime Web
- *
- * Non-autoregressive CTC-based ASR that is 5x faster than Whisper-Small.
- * Runs entirely in browser via WebGPU or WASM. No transformers.js dependency.
- *
- * Uses the sherpa-onnx SenseVoice export (model.int8.onnx, 239MB int8 quantized).
- * Also provides emotion detection, language identification, and audio event detection
- * from the same forward pass.
+ * SenseVoice type definitions
  *
  * @category Inference
- *
- * @example Basic usage
- * ```typescript
- * import { SenseVoiceInference } from '@omote/core';
- *
- * const asr = new SenseVoiceInference({
- *   modelUrl: '/models/sensevoice/model.int8.onnx',
- *   tokensUrl: '/models/sensevoice/tokens.txt',
- * });
- * await asr.load();
- *
- * const { text, emotion, language } = await asr.transcribe(audioSamples);
- * console.log(text);       // "Hello world"
- * console.log(emotion);    // "NEUTRAL"
- * console.log(language);   // "en"
- * ```
- *
- * @module inference/SenseVoiceInference
  */
 type SenseVoiceLanguage = 'auto' | 'zh' | 'en' | 'ja' | 'ko' | 'yue';
@@ -957,76 +947,49 @@ interface SenseVoiceModelInfo {
     outputNames: string[];
     vocabSize: number;
 }
-declare class SenseVoiceInference {
-    private session;
-    private ort;
-    private config;
-    private _backend;
-    private isLoading;
-    private inferenceQueue;
-    private poisoned;
-    private static readonly INFERENCE_TIMEOUT_MS;
-    private lastLfrFrames;
-    private webgpuShapeWarned;
-    private tokenMap;
-    private negMean;
-    private invStddev;
-    private languageId;
-    private textNormId;
-    constructor(config: SenseVoiceConfig);
-    get backend(): RuntimeBackend | null;
-    get isLoaded(): boolean;
+/**
+ * Configuration for SenseVoice Worker (used by SenseVoiceUnifiedAdapter)
+ */
+interface SenseVoiceWorkerConfig {
+    /** Path or URL to model.int8.onnx (239MB) */
+    modelUrl: string;
+    /** Path or URL to tokens.txt vocabulary file (default: sibling of modelUrl) */
+    tokensUrl?: string;
+    /** Language hint (default: 'auto' for auto-detection) */
+    language?: SenseVoiceLanguage;
+    /** Text normalization: 'with_itn' applies inverse text normalization (default: 'with_itn') */
+    textNorm?: 'with_itn' | 'without_itn';
+}
+/**
+ * Common interface for SenseVoice implementations
+ */
+interface SenseVoiceBackend {
+    /** Whether the model is loaded and ready for inference */
+    readonly isLoaded: boolean;
+    /** Current backend type ('wasm' for worker, full backend for main thread, null if not loaded) */
+    readonly backend: 'wasm' | 'webgpu' | null;
+    /**
+     * Load the ONNX model
+     * @param onProgress - Optional progress callback (fires once at 100% for worker)
+     * @returns Model loading information
+     */
     load(onProgress?: (loaded: number, total: number) => void): Promise<SenseVoiceModelInfo>;
     /**
      * Transcribe audio samples to text
-     *
-     * @param audioSamples Float32Array of audio samples at 16kHz, [-1, 1] range
-     * @returns Transcription result with text, emotion, language, and event
+     * @param audioSamples - Float32Array of audio samples at 16kHz
+     * @returns Transcription result
      */
     transcribe(audioSamples: Float32Array): Promise<SenseVoiceResult>;
-    private queueInference;
+    /**
+     * Dispose of the model and free resources
+     */
     dispose(): Promise<void>;
 }
 /**
- * Silero VAD (Voice Activity Detection) inference
- *
- * Neural network-based VAD running in browser via ONNX Runtime Web.
- * Much more accurate than RMS-based energy detection.
- *
- * Uses lazy loading to conditionally load WebGPU or WASM-only bundle:
- * - iOS: Loads WASM-only bundle (WebGPU crashes due to Safari bugs)
- * - Android/Desktop: Loads WebGPU bundle (with WASM fallback)
+ * Silero VAD type definitions
  *
  * @category Inference
- *
- * @example Basic usage
- * ```typescript
- * import { SileroVADInference } from '@omote/core';
- *
- * const vad = new SileroVADInference({
- *   modelUrl: '/models/silero-vad.onnx'
- * });
- * await vad.load();
- *
- * // Process 32ms chunks (512 samples at 16kHz)
- * const probability = await vad.process(audioChunk);
- * if (probability > 0.5) {
- *   console.log('Speech detected!');
- * }
- * ```
- *
- * @example Streaming with state management
- * ```typescript
- * // State is automatically maintained between process() calls
- * // Call reset() when starting a new audio stream
- * vad.reset();
- *
- * for (const chunk of audioChunks) {
- *   const prob = await vad.process(chunk);
- *   // prob is speech probability [0, 1]
- * }
- * ```
  */
 type VADBackend = BackendPreference;
@@ -1096,117 +1059,6 @@ interface SpeechSegment {
     /** Average probability during segment */
     avgProbability: number;
 }
-/**
- * Silero VAD - Neural network voice activity detection
- *
- * Based on snakers4/silero-vad ONNX model.
- * Processes 32ms chunks (512 samples at 16kHz) with LSTM state.
- *
- * @see https://github.com/snakers4/silero-vad
- */
-declare class SileroVADInference {
-    private session;
-    private ort;
-    private config;
-    private _backend;
-    private isLoading;
-    private state;
-    private context;
-    private readonly chunkSize;
-    private readonly contextSize;
-    private inferenceQueue;
-    private preSpeechBuffer;
-    private wasSpeaking;
-    private srTensor;
-    constructor(config: SileroVADConfig);
-    get backend(): RuntimeBackend | null;
-    get isLoaded(): boolean;
-    get sampleRate(): number;
-    get threshold(): number;
-    /**
-     * Get required chunk size in samples
-     */
-    getChunkSize(): number;
-    /**
-     * Get chunk duration in milliseconds
-     */
-    getChunkDurationMs(): number;
-    /**
-     * Check if WebGPU is available and working
-     * (iOS returns false even if navigator.gpu exists due to ONNX Runtime bugs)
-     */
-    static isWebGPUAvailable: typeof isWebGPUAvailable;
-    /**
-     * Load the ONNX model
-     */
-    load(): Promise<VADModelInfo>;
-    /**
-     * Reset state for new audio stream
-     */
-    reset(): void;
-    /**
-     * Process a single audio chunk
-     *
-     * @param audioChunk - Float32Array of exactly chunkSize samples (512 for 16kHz, 256 for 8kHz)
-     * @returns VAD result with speech probability
-     */
-    process(audioChunk: Float32Array): Promise<VADResult>;
-    /**
-     * Process audio and detect speech segments
-     *
-     * @param audio - Complete audio buffer
-     * @param options - Detection options
-     * @returns Array of speech segments
-     */
-    detectSpeech(audio: Float32Array, options?: {
-        /** Minimum speech duration in ms (default: 250) */
-        minSpeechDurationMs?: number;
-        /** Minimum silence duration to end segment in ms (default: 300) */
-        minSilenceDurationMs?: number;
-        /** Padding to add before/after speech in ms (default: 30) */
-        speechPadMs?: number;
-    }): Promise<SpeechSegment[]>;
-    /**
-     * Queue inference to serialize ONNX session calls
-     */
-    private queueInference;
-    /**
-     * Dispose of the model and free resources
-     */
-    dispose(): Promise<void>;
-}
-/**
- * Silero VAD Web Worker implementation
- *
- * Runs Silero VAD inference in a dedicated Web Worker to prevent main thread blocking.
- * Uses inline worker script (Blob URL pattern) to avoid separate file deployment.
- *
- * Key design decisions:
- * - WASM backend only (WebGPU doesn't work in Workers)
- * - LSTM state serialized as Float32Array (Tensors can't cross worker boundary)
- * - Audio copied (not transferred) to retain main thread access for pre-speech buffer
- * - ONNX Runtime loaded from CDN in worker (no bundler complications)
- *
- * @category Inference
- *
- * @example Basic usage
- * ```typescript
- * import { SileroVADWorker } from '@omote/core';
- *
- * const vad = new SileroVADWorker({
- *   modelUrl: '/models/silero-vad.onnx'
- * });
- * await vad.load();
- *
- * // Process 32ms chunks (512 samples at 16kHz)
- * const result = await vad.process(audioChunk);
- * if (result.isSpeech) {
- *   console.log('Speech detected!', result.probability);
- * }
- * ```
- */
 /**
  * Configuration for Silero VAD Worker
  */
@@ -1219,13 +1071,6 @@ interface VADWorkerConfig {
     threshold?: number;
     /**
      * Number of audio chunks to keep in pre-speech buffer.
-     * When VAD triggers, these chunks are prepended to the speech buffer
-     * to capture the beginning of speech that occurred before detection.
-     *
-     * At 512 samples/chunk and 16kHz:
-     * - 10 chunks = 320ms of pre-speech audio
-     * - 15 chunks = 480ms of pre-speech audio
-     *
      * Default: 10 chunks (320ms)
      */
     preSpeechBufferChunks?: number;
@@ -1241,85 +1086,45 @@ interface VADWorkerModelInfo {
     sampleRate: number;
     chunkSize: number;
 }
 /**
- * Silero VAD Worker - Voice Activity Detection in a Web Worker
- *
- * Runs Silero VAD inference off the main thread to prevent UI blocking.
- * Feature parity with SileroVADInference but runs in dedicated worker.
- *
- * @see SileroVADInference for main-thread version
+ * Common interface for Silero VAD implementations
  */
-declare class SileroVADWorker {
-    private worker;
-    private config;
-    private isLoading;
-    private _isLoaded;
-    private poisoned;
-    private state;
-    private context;
-    private readonly chunkSize;
-    private readonly contextSize;
-    private inferenceQueue;
-    private preSpeechBuffer;
-    private wasSpeaking;
-    private pendingResolvers;
-    private messageId;
-    constructor(config: VADWorkerConfig);
-    get isLoaded(): boolean;
-    /**
-     * Backend type (always 'wasm' for Worker, WebGPU not supported in Workers)
-     */
-    get backend(): 'wasm' | null;
-    get sampleRate(): number;
-    get threshold(): number;
-    /**
-     * Get required chunk size in samples
-     */
-    getChunkSize(): number;
-    /**
-     * Get chunk duration in milliseconds
-     */
-    getChunkDurationMs(): number;
-    /**
-     * Create the worker from inline script
-     */
-    private createWorker;
-    /**
-     * Handle messages from worker
-     */
-    private handleWorkerMessage;
-    /**
-     * Send message to worker and wait for response
-     */
-    private sendMessage;
-    /**
-     * Load the ONNX model in the worker
-     */
-    load(): Promise<VADWorkerModelInfo>;
+interface SileroVADBackend {
+    /** Current backend type (webgpu, wasm, or null if not loaded) */
+    readonly backend: RuntimeBackend | null;
+    /** Whether the model is loaded and ready for inference */
+    readonly isLoaded: boolean;
+    /** Audio sample rate (8000 or 16000 Hz) */
+    readonly sampleRate: number;
+    /** Speech detection threshold (0-1) */
+    readonly threshold: number;
     /**
-     * Reset state for new audio stream
+     * Load the ONNX model
+     * @returns Model loading information
      */
-    reset(): Promise<void>;
+    load(): Promise<VADModelInfo | VADWorkerModelInfo>;
     /**
      * Process a single audio chunk
-     *
-     * @param audioChunk - Float32Array of exactly chunkSize samples (512 for 16kHz, 256 for 8kHz)
+     * @param audioChunk - Float32Array of exactly chunkSize samples
      * @returns VAD result with speech probability
      */
     process(audioChunk: Float32Array): Promise<VADResult>;
     /**
-     * Queue inference to serialize worker calls
+     * Reset state for new audio stream
      */
-    private queueInference;
+    reset(): void | Promise<void>;
     /**
-     * Dispose of the worker and free resources
+     * Dispose of the model and free resources
      */
     dispose(): Promise<void>;
     /**
-     * Check if Web Workers are supported
+     * Get required chunk size in samples
      */
-    static isSupported(): boolean;
+    getChunkSize(): number;
+    /**
+     * Get chunk duration in milliseconds
+     */
+    getChunkDurationMs(): number;
 }
 /**
@@ -1447,43 +1252,33 @@ declare class UnifiedInferenceWorker {
 /** Base config shared across all inference factory functions */
 interface InferenceFactoryConfig {
-    /**
-     * Worker mode:
-     * - 'auto' (default): Use Worker if supported, else main thread
-     * - true: Force Worker (throws if unsupported)
-     * - false: Force main thread
-     */
-    useWorker?: boolean | 'auto';
     /**
      * Unified inference worker instance.
-     * When provided, routes inference through the shared worker,
+     * Routes inference through the shared worker,
      * keeping all inference off the main thread.
-     * Takes precedence over useWorker setting.
      */
     unifiedWorker?: UnifiedInferenceWorker;
 }
 /**
- * Factory function for A2E inference
+ * Factory function for A2E inference via UnifiedInferenceWorker
  *
  * Creates an A2EBackend instance with zero-config defaults (HuggingFace CDN).
- * Supports unified worker mode for iOS off-main-thread inference.
+ * Routes inference through the shared unified worker.
  *
  * @category Inference
  *
- * @example Auto-detect (recommended, zero-config)
+ * @example
  * ```typescript
- * import { createA2E } from '@omote/core';
+ * import { createA2E, UnifiedInferenceWorker } from '@omote/core';
+ *
+ * const worker = new UnifiedInferenceWorker();
+ * await worker.init();
  *
- * const a2e = createA2E(); // uses HF CDN defaults (192MB fp16)
+ * const a2e = createA2E({ unifiedWorker: worker });
  * await a2e.load();
  * const { blendshapes } = await a2e.infer(audioSamples);
  * ```
- *
- * @example Custom model URL
- * ```typescript
- * const a2e = createA2E({ modelUrl: '/models/lam.onnx' });
- * ```
  */
 /**
@@ -1499,13 +1294,13 @@ interface CreateA2EConfig extends InferenceFactoryConfig {
      * Set to `false` to skip external data loading (single-file models only).
      */
     externalDataUrl?: string | false;
-    /** Backend preference (default: 'auto') */
-    backend?: BackendPreference;
     /** Number of identity classes (default: 12) */
     numIdentityClasses?: number;
 }
 /**
- * Create an A2E instance
+ * Create an A2E instance via the unified worker.
+ *
+ * If no `unifiedWorker` is provided, a dedicated worker is created on load().
  *
  * @param config - Factory configuration
  * @returns An A2EBackend instance
@@ -1521,7 +1316,7 @@ declare function createA2E(config?: CreateA2EConfig): A2EBackend;
 /**
  * Generic frame source -- any object that emits 'frame' events with blendshapes.
  *
- * Implemented by PlaybackPipeline, MicLipSync, VoicePipeline, and any custom source.
+ * Implemented by PlaybackPipeline, MicLipSync, and any custom source.
  * Used by OmoteAvatar (all renderer adapters) to receive animation frames.
  */
 interface FrameSource {
@@ -1550,7 +1345,7 @@ interface TranscriptResult {
     inferenceTimeMs?: number;
 }
 /**
- * Consumer's response handler. VoicePipeline calls this with transcribed text.
+ * Consumer's response handler. VoiceOrchestrator calls this with transcribed text.
  * Consumer must stream audio back for playback + lip sync.
  */
 interface ResponseHandler {
@@ -1581,6 +1376,8 @@ interface ResponseHandler {
  */
 interface TTSSpeakerConfig {
+    /** Skip LAM download — audio playback only, no lip sync. Default: false. */
+    audioOnly?: boolean;
     /** Per-character expression weight scaling */
     profile?: ExpressionProfile;
     /** Identity/style index for A2E model (default: 0) */
@@ -1593,8 +1390,8 @@ interface TTSSpeakerConfig {
     neutralTransitionMs?: number;
     /** Pre-built A2E backend (skip internal createA2E). */
     lam?: A2EBackend;
-    /** LAM model config (only when lam not provided) */
-    models?: CreateA2EConfig;
+    /** LAM model config (only when lam not provided). unifiedWorker is supplied by TTSSpeaker. */
+    models?: Omit<CreateA2EConfig, 'unifiedWorker'>;
     /** Shared unified worker (recommended for iOS) */
     unifiedWorker?: UnifiedInferenceWorker;
 }
@@ -1603,6 +1400,7 @@ declare class TTSSpeaker {
     private tts;
     private ownedLam;
     private ownedWorker;
+    private usesSharedWorker;
     private currentAbort;
     private _isSpeaking;
     private _audioOnly;
@@ -1616,11 +1414,8 @@ declare class TTSSpeaker {
     /**
      * Connect a TTS backend.
      *
-     * When config includes `lam`, `unifiedWorker`, or `models`, the full lip sync
-     * pipeline is created (LAM + TTSPlayback + PlaybackPipeline).
-     *
-     * When config is omitted or has none of those, audio-only mode is used:
-     * TTS → AudioScheduler (speakers only, no blendshapes, no LAM download).
+     * By default, the full lip sync pipeline is created (auto-downloads LAM).
+     * Pass `audioOnly: true` for audio-only mode (no blendshapes, no LAM download).
      *
      * @param tts - TTS backend to use for speech synthesis
      * @param config - Optional configuration for A2E, expression profile, etc.
@@ -1636,6 +1431,8 @@ declare class TTSSpeaker {
     speak(text: string, options?: {
         signal?: AbortSignal;
         voice?: string;
+        speed?: number;
+        language?: string;
     }): Promise<void>;
     /** Audio-only speak: TTS → resample → AudioScheduler (no blendshapes). */
     private speakAudioOnly;
@@ -1655,13 +1452,20 @@ declare class TTSSpeaker {
     streamText(options: {
         signal?: AbortSignal;
         voice?: string;
+        speed?: number;
+        language?: string;
     }): Promise<{
         push: (token: string) => void;
         end: () => Promise<void>;
     }>;
     /** streamText in audio-only mode: TTS → AudioScheduler (no blendshapes). */
     private streamTextAudioOnly;
-    /** Abort current speak if any. */
+    /**
+     * Warm up AudioContext for iOS/Safari autoplay policy.
+     * Call from a user gesture handler (click/tap) before speak().
+     */
+    warmup(): Promise<void>;
+    /** Abort current speak if any. Triggers neutral transition on PlaybackPipeline. */
     stop(): void;
     /** Clean teardown of all owned resources. */
     dispose(): Promise<void>;
@@ -1697,11 +1501,13 @@ interface CreateTTSPlayerConfig {
     modelUrl?: string;
     /** Voice data base URL override */
     voiceBaseUrl?: string;
+    /** Shared unified worker (created automatically if not provided) */
+    unifiedWorker?: UnifiedInferenceWorker;
 }
 /**
  * Zero-config TTS player. Speak text through speakers without an avatar.
  *
- * Uses Kokoro TTS (82M q8, ~92MB) with automatic worker selection.
+ * Uses Kokoro TTS (82M q8, ~92MB) with automatic worker creation.
  * No LAM model is downloaded — audio plays directly through AudioScheduler.
  */
 declare function createTTSPlayer(config?: CreateTTSPlayerConfig): TTSPlayer;
@@ -1710,254 +1516,27 @@ declare function createTTSPlayer(config?: CreateTTSPlayerConfig): TTSPlayer;
  */
 declare class TTSPlayer extends TTSSpeaker {
     private backend;
-    constructor(tts: TTSBackend);
+    private ttsWorker;
+    private ttsPlayerUsesSharedWorker;
+    private ttsConfig;
+    constructor(config?: CreateTTSPlayerConfig);
     /** Load TTS model and connect in audio-only mode. */
     load(): Promise<void>;
     /** Whether the TTS model is loaded and ready. */
     get isLoaded(): boolean;
+    dispose(): Promise<void>;
 }
 /**
- * Factory function for SenseVoice ASR with automatic Worker vs main thread selection
- *
- * Provides a unified API that automatically selects the optimal implementation:
- * - Worker supported: Uses SenseVoiceWorker (off-main-thread inference)
- * - Worker unsupported: Uses SenseVoiceInference (main thread)
- *
- * @category Inference
- *
- * @example Auto-detect (recommended)
- * ```typescript
- * import { createSenseVoice } from '@omote/core';
+ * SpeechListener — Standalone listening primitive.
  *
- * const asr = createSenseVoice({
- *   modelUrl: '/models/sensevoice/model.int8.onnx',
- * });
- * await asr.load();
- * const { text, emotion } = await asr.transcribe(audioSamples);
- * ```
+ * Composes: MicrophoneCapture → SileroVAD → SenseVoice ASR → transcript events.
+ * Used independently or alongside TTSSpeaker and VoiceOrchestrator.
  *
- * @example Force worker
- * ```typescript
- * const asr = createSenseVoice({
- *   modelUrl: '/models/sensevoice/model.int8.onnx',
- *   useWorker: true,
- * });
- * ```
+ * Does NOT handle TTS, LAM, or response routing — those belong to TTSSpeaker
+ * and VoiceOrchestrator respectively.
  *
- * @example Force main thread
- * ```typescript
- * const asr = createSenseVoice({
- *   modelUrl: '/models/sensevoice/model.int8.onnx',
- *   useWorker: false,
- * });
- * ```
- */
-/**
- * Common interface for both SenseVoiceInference and SenseVoiceWorker
- */
-interface SenseVoiceBackend {
-    /** Whether the model is loaded and ready for inference */
-    readonly isLoaded: boolean;
-    /** Current backend type ('wasm' for worker, full backend for main thread, null if not loaded) */
-    readonly backend: 'wasm' | 'webgpu' | null;
-    /**
-     * Load the ONNX model
-     * @param onProgress - Optional progress callback (fires once at 100% for worker)
-     * @returns Model loading information
-     */
-    load(onProgress?: (loaded: number, total: number) => void): Promise<SenseVoiceModelInfo>;
-    /**
-     * Transcribe audio samples to text
-     * @param audioSamples - Float32Array of audio samples at 16kHz
-     * @returns Transcription result
-     */
-    transcribe(audioSamples: Float32Array): Promise<SenseVoiceResult>;
-    /**
-     * Dispose of the model and free resources
-     */
-    dispose(): Promise<void>;
-}
-/**
- * Configuration for the SenseVoice factory
- */
-interface CreateSenseVoiceConfig extends InferenceFactoryConfig {
-    /** Path or URL to model.int8.onnx (239MB). Default: HuggingFace CDN */
-    modelUrl?: string;
-    /** Path or URL to tokens.txt vocabulary file (default: sibling of modelUrl) */
-    tokensUrl?: string;
-    /** Language hint (default: 'auto') */
-    language?: SenseVoiceLanguage;
-    /** Text normalization (default: 'with_itn') */
-    textNorm?: 'with_itn' | 'without_itn';
-}
-/**
- * Create a SenseVoice ASR instance with automatic implementation selection
- *
- * @param config - Factory configuration
- * @returns A SenseVoiceBackend instance (either Worker or main thread)
- */
-declare function createSenseVoice(config?: CreateSenseVoiceConfig): SenseVoiceBackend;
-/**
- * Factory function for Silero VAD with automatic Worker vs main thread selection
- *
- * Provides a unified API that automatically selects the optimal implementation:
- * - Desktop browsers: Uses SileroVADWorker (off-main-thread inference)
- * - Mobile devices: Uses SileroVADInference (main thread, avoids memory overhead)
- * - Fallback: Gracefully falls back to main thread if Worker fails
- *
- * @category Inference
- *
- * @example Basic usage (auto-detect)
- * ```typescript
- * import { createSileroVAD } from '@omote/core';
- *
- * const vad = createSileroVAD({
- *   modelUrl: '/models/silero-vad.onnx',
- *   threshold: 0.5,
- * });
- *
- * await vad.load();
- * const result = await vad.process(audioChunk);
- * if (result.isSpeech) {
- *   console.log('Speech detected!', result.probability);
- * }
- * ```
- *
- * @example Force worker usage
- * ```typescript
- * const vad = createSileroVAD({
- *   modelUrl: '/models/silero-vad.onnx',
- *   useWorker: true, // Force Worker even on mobile
- * });
- * ```
- *
- * @example Force main thread
- * ```typescript
- * const vad = createSileroVAD({
- *   modelUrl: '/models/silero-vad.onnx',
- *   useWorker: false, // Force main thread
- * });
- * ```
- */
-/**
- * Common interface for both SileroVADInference and SileroVADWorker
- *
- * This interface defines the shared API that both implementations provide,
- * allowing consumers to use either interchangeably.
- */
-interface SileroVADBackend {
-    /** Current backend type (webgpu, wasm, or null if not loaded) */
-    readonly backend: RuntimeBackend | null;
-    /** Whether the model is loaded and ready for inference */
-    readonly isLoaded: boolean;
-    /** Audio sample rate (8000 or 16000 Hz) */
-    readonly sampleRate: number;
-    /** Speech detection threshold (0-1) */
-    readonly threshold: number;
-    /**
-     * Load the ONNX model
-     * @returns Model loading information
-     */
-    load(): Promise<VADModelInfo | VADWorkerModelInfo>;
-    /**
-     * Process a single audio chunk
-     * @param audioChunk - Float32Array of exactly chunkSize samples
-     * @returns VAD result with speech probability
-     */
-    process(audioChunk: Float32Array): Promise<VADResult>;
-    /**
-     * Reset state for new audio stream
-     */
-    reset(): void | Promise<void>;
-    /**
-     * Dispose of the model and free resources
-     */
-    dispose(): Promise<void>;
-    /**
-     * Get required chunk size in samples
-     */
-    getChunkSize(): number;
-    /**
-     * Get chunk duration in milliseconds
-     */
-    getChunkDurationMs(): number;
-}
-/**
- * Configuration for the Silero VAD factory
- *
- * Extends SileroVADConfig with worker-specific options.
- */
-interface SileroVADFactoryConfig extends Omit<SileroVADConfig, 'modelUrl'>, InferenceFactoryConfig {
-    /** Path or URL to the ONNX model. Default: HuggingFace CDN */
-    modelUrl?: string;
-    /**
-     * Fallback to main thread on worker errors.
-     *
-     * When true (default), if the Worker fails to load or encounters an error,
-     * the factory will automatically create a main thread instance instead.
-     *
-     * When false, worker errors will propagate as exceptions.
-     *
-     * Default: true
-     */
-    fallbackOnError?: boolean;
-}
-/**
- * Check if the current environment supports VAD Web Workers
- *
- * Requirements:
- * - Worker constructor must exist
- * - Blob URL support (for inline worker script)
- *
- * @returns true if VAD Worker is supported
- */
-declare function supportsVADWorker(): boolean;
-/**
- * Create a Silero VAD instance with automatic implementation selection
- *
- * This factory function automatically selects between:
- * - **SileroVADWorker**: Off-main-thread inference (better for desktop)
- * - **SileroVADInference**: Main thread inference (better for mobile)
- *
- * The selection is based on:
- * 1. Explicit `useWorker` config (if provided)
- * 2. Platform detection (mobile vs desktop)
- * 3. Worker API availability
- *
- * Both implementations share the same interface (SileroVADBackend),
- * so consumers can use either interchangeably.
- *
- * @param config - Factory configuration
- * @returns A SileroVAD instance (either Worker or main thread)
- *
- * @example
- * ```typescript
- * // Auto-detect (recommended)
- * const vad = createSileroVAD({ modelUrl: '/models/silero-vad.onnx' });
- *
- * // Force Worker
- * const vadWorker = createSileroVAD({ modelUrl: '/models/silero-vad.onnx', useWorker: true });
- *
- * // Force main thread
- * const vadMain = createSileroVAD({ modelUrl: '/models/silero-vad.onnx', useWorker: false });
- * ```
- */
-declare function createSileroVAD(config?: SileroVADFactoryConfig): SileroVADBackend;
-/**
- * SpeechListener — Standalone listening primitive.
- *
- * Composes: MicrophoneCapture → SileroVAD → SenseVoice ASR → transcript events.
- * Extracted from VoicePipeline's listening half so it can be used independently.
- *
- * Does NOT handle TTS, LAM, or response routing — those belong to TTSSpeaker
- * and VoicePipeline respectively.
- *
- * @category Audio
+ * @category Audio
  */
 interface SpeechListenerConfig {
@@ -1974,6 +1553,7 @@ interface SpeechListenerConfig {
             modelUrl: string;
             tokensUrl?: string;
             language?: string;
+            textNorm?: 'with_itn' | 'without_itn';
         };
         vad: {
             modelUrl: string;
@@ -2028,6 +1608,7 @@ declare class SpeechListener extends EventEmitter<SpeechListenerEvents> {
     private asr;
     private vad;
     private ownedWorker;
+    private usesSharedWorker;
     private mic;
     private omoteEvents;
     private _unsubChunk;
@@ -2157,114 +1738,48 @@ declare class InterruptionHandler extends EventEmitter<InterruptionEvents> {
 }
 /**
- * SenseVoice ASR Web Worker implementation
- *
- * Runs SenseVoice speech recognition in a dedicated Web Worker to prevent
- * main thread blocking. Uses inline worker script (Blob URL pattern) to
- * avoid separate file deployment.
- *
- * Key design decisions:
- * - WASM backend only (WebGPU doesn't work in Workers)
- * - All preprocessing (fbank, LFR, CMVN) and CTC decoding inlined in worker
- * - Audio copied (not transferred) to retain main thread access
- * - ONNX Runtime loaded from CDN in worker (no bundler complications)
- * - iOS: model URL passed as string to ORT (avoids 239MB JS heap allocation)
+ * Factory function for SenseVoice ASR via UnifiedInferenceWorker
  *
  * @category Inference
  *
- * @example Basic usage
+ * @example
  * ```typescript
- * import { SenseVoiceWorker } from '@omote/core';
+ * import { createSenseVoice, UnifiedInferenceWorker } from '@omote/core';
  *
- * const asr = new SenseVoiceWorker({
+ * const worker = new UnifiedInferenceWorker();
+ * await worker.init();
+ *
+ * const asr = createSenseVoice({
  *   modelUrl: '/models/sensevoice/model.int8.onnx',
- *   tokensUrl: '/models/sensevoice/tokens.txt',
+ *   unifiedWorker: worker,
  * });
  * await asr.load();
- *
- * const { text, emotion, language } = await asr.transcribe(audioSamples);
- * console.log(text);       // "Hello world"
- * console.log(emotion);    // "NEUTRAL"
- * console.log(language);   // "en"
+ * const { text, emotion } = await asr.transcribe(audioSamples);
  * ```
  */
 /**
- * Configuration for SenseVoice Worker
+ * Configuration for the SenseVoice factory
  */
-interface SenseVoiceWorkerConfig {
-    /** Path or URL to model.int8.onnx (239MB) */
-    modelUrl: string;
+interface CreateSenseVoiceConfig extends InferenceFactoryConfig {
+    /** Path or URL to model.int8.onnx (239MB). Default: HuggingFace CDN */
+    modelUrl?: string;
     /** Path or URL to tokens.txt vocabulary file (default: sibling of modelUrl) */
     tokensUrl?: string;
-    /** Language hint (default: 'auto' for auto-detection) */
+    /** Language hint (default: 'auto') */
     language?: SenseVoiceLanguage;
-    /** Text normalization: 'with_itn' applies inverse text normalization (default: 'with_itn') */
+    /** Text normalization (default: 'with_itn') */
     textNorm?: 'with_itn' | 'without_itn';
 }
 /**
- * SenseVoice ASR Worker - Speech Recognition in a Web Worker
+ * Create a SenseVoice ASR instance via the unified worker.
  *
- * Runs SenseVoice inference off the main thread to prevent UI blocking.
- * All preprocessing (fbank, LFR, CMVN) and CTC decoding run in the worker.
+ * If no `unifiedWorker` is provided, a dedicated worker is created on load().
  *
- * @see SenseVoiceInference for main-thread version
+ * @param config - Factory configuration
+ * @returns A SenseVoiceBackend instance
  */
-declare class SenseVoiceWorker {
-    private worker;
-    private config;
-    private isLoading;
-    private _isLoaded;
-    private inferenceQueue;
-    private poisoned;
-    private pendingResolvers;
-    private languageId;
-    private textNormId;
-    constructor(config: SenseVoiceWorkerConfig);
-    get isLoaded(): boolean;
-    /**
-     * Backend type (always 'wasm' for Worker, WebGPU not supported in Workers)
-     */
-    get backend(): 'wasm' | null;
-    /**
-     * Create the worker from inline script
-     */
-    private createWorker;
-    /**
-     * Handle messages from worker
-     */
-    private handleWorkerMessage;
-    /**
-     * Send message to worker and wait for response
-     */
-    private sendMessage;
-    /**
-     * Load the ONNX model in the worker
-     *
-     * @param onProgress - Optional progress callback. Fires once at 100% when load completes
-     *   (the worker downloads and loads the model internally, so granular progress is not available).
-     */
-    load(onProgress?: (loaded: number, total: number) => void): Promise<SenseVoiceModelInfo>;
-    /**
-     * Transcribe audio samples to text
-     *
-     * @param audioSamples Float32Array of audio samples at 16kHz, [-1, 1] range
-     * @returns Transcription result with text, emotion, language, and event
-     */
-    transcribe(audioSamples: Float32Array): Promise<SenseVoiceResult>;
-    /**
-     * Queue inference to serialize worker calls
-     */
-    private queueInference;
-    /**
-     * Dispose of the worker and free resources
-     */
-    dispose(): Promise<void>;
-    /**
-     * Check if Web Workers are supported
-     */
-    static isSupported(): boolean;
-}
+declare function createSenseVoice(config?: CreateSenseVoiceConfig): SenseVoiceBackend;
 /**
  * Shared blendshape constants and utilities for lip sync inference
@@ -2298,100 +1813,6 @@ declare const LAM_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browI
  */
 declare function lerpBlendshapes(current: Float32Array | number[], target: Float32Array | number[], factor?: number): number[];
-/**
- * A2E inference engine for Audio-to-Expression (LAM model)
- *
- * Runs entirely in the browser using WebGPU or WASM.
- * Takes raw 16kHz audio and outputs 52 ARKit blendshapes for lip sync.
- * Uses the LAM (Large Animation Model) — see {@link A2EBackend} for the interface.
- *
- * @see {@link createA2E} for the recommended zero-config factory
- * @see {@link A2EBackend} for the common interface
- * @category Inference
- *
- * @example Basic usage
- * ```typescript
- * import { A2EInference } from '@omote/core';
- *
- * const a2e = new A2EInference({ modelUrl: '/models/lam.onnx' });
- * await a2e.load();
- *
- * // Process 1 second of audio (16kHz = 16000 samples)
- * const result = await a2e.infer(audioSamples);
- * console.log('Blendshapes:', result.blendshapes); // [30, 52] for 30fps
- * ```
- */
-interface A2EInferenceConfig {
-    /** Path or URL to the ONNX model */
-    modelUrl: string;
-    /**
-     * Path or URL to external model data file (.onnx.data weights).
-     * Default: `${modelUrl}.data` (e.g., /models/model.onnx.data)
-     *
-     * Set to `false` to skip external data loading (single-file models only).
-     */
-    externalDataUrl?: string | false;
-    /** Preferred backend (auto will try WebGPU first, fallback to WASM) */
-    backend?: BackendPreference;
-    /** Number of identity classes (default: 12 for streaming model) */
-    numIdentityClasses?: number;
-    /**
-     * Number of audio samples per inference chunk (default: 16000).
-     * Model supports variable chunk sizes. Smaller chunks = lower latency,
-     * more inference overhead. 8000 (500ms) is recommended for real-time lip sync.
-     */
-    chunkSize?: number;
-}
-declare class A2EInference implements A2EBackend {
-    readonly modelId: "a2e";
-    private session;
-    private ort;
-    private config;
-    private _backend;
-    private isLoading;
-    private numIdentityClasses;
-    readonly chunkSize: number;
-    private inferenceQueue;
-    private poisoned;
-    private static readonly INFERENCE_TIMEOUT_MS;
-    constructor(config: A2EInferenceConfig);
-    /**
-     * Check if WebGPU is available and working
-     * (iOS returns false even if navigator.gpu exists due to ONNX Runtime bugs)
-     */
-    static isWebGPUAvailable: typeof isWebGPUAvailable;
-    get backend(): 'webgpu' | 'wasm' | null;
-    get isLoaded(): boolean;
-    /** True if inference timed out and the session is permanently unusable */
-    get isSessionPoisoned(): boolean;
-    /**
-     * Load the ONNX model
-     */
-    load(): Promise<A2EModelInfo>;
-    /**
-     * Run inference on raw audio
-     * @param audioSamples - Float32Array of raw audio at 16kHz
-     * @param identityIndex - Optional identity index (0-11, default 0 = neutral)
-     *
-     * Audio will be zero-padded or truncated to chunkSize samples.
-     */
-    infer(audioSamples: Float32Array, identityIndex?: number): Promise<A2EResult>;
-    /**
-     * Queue inference to serialize ONNX session calls
-     */
-    private queueInference;
-    /**
-     * Get blendshape value by name for a specific frame
-     */
-    getBlendshape(blendshapes: Float32Array, name: typeof LAM_BLENDSHAPES[number]): number;
-    /**
-     * Dispose of the model and free resources
-     */
-    dispose(): Promise<void>;
-}
 /**
  * Default and user-configurable model URLs for all ONNX models
  *
@@ -2427,7 +1848,7 @@ type ModelUrlKey = 'lam' | 'senseVoice' | 'sileroVad' | 'kokoroTTS' | 'kokoroVoi
  * Resolved model URLs — user overrides take priority, HuggingFace CDN is fallback.
  *
  * All SDK factories (`createA2E`, `createSenseVoice`, `createSileroVAD`) and
- * orchestrators (`VoicePipeline`) read from this object. Call
+ * orchestrators (`VoiceOrchestrator`) read from this object. Call
  * {@link configureModelUrls} before constructing any pipelines to point
  * models at your own CDN.
  */
@@ -2697,6 +2118,44 @@ declare class BlendshapeSmoother {
     reset(): void;
 }
+/**
+ * Factory function for Silero VAD via UnifiedInferenceWorker
+ *
+ * @category Inference
+ *
+ * @example
+ * ```typescript
+ * import { createSileroVAD, UnifiedInferenceWorker } from '@omote/core';
+ *
+ * const worker = new UnifiedInferenceWorker();
+ * await worker.init();
+ *
+ * const vad = createSileroVAD({
+ *   modelUrl: '/models/silero-vad.onnx',
+ *   unifiedWorker: worker,
+ * });
+ * await vad.load();
+ * const result = await vad.process(audioChunk);
+ * ```
+ */
+/**
+ * Configuration for the Silero VAD factory
+ */
+interface SileroVADFactoryConfig extends Omit<SileroVADConfig, 'modelUrl'>, InferenceFactoryConfig {
+    /** Path or URL to the ONNX model. Default: HuggingFace CDN */
+    modelUrl?: string;
+}
+/**
+ * Create a Silero VAD instance via the unified worker.
+ *
+ * If no `unifiedWorker` is provided, a dedicated worker is created on load().
+ *
+ * @param config - Factory configuration
+ * @returns A SileroVADBackend instance
+ */
+declare function createSileroVAD(config?: SileroVADFactoryConfig): SileroVADBackend;
 /**
  * SenseVoice adapter backed by UnifiedInferenceWorker
  *
@@ -2755,34 +2214,9 @@ declare class A2EUnifiedAdapter implements A2EBackend {
 }
 /**
- * Kokoro TTS inference using ONNX Runtime Web
- *
- * Pure ONNX pipeline for browser-based text-to-speech. No transformers.js dependency.
- * Uses eSpeak-NG WASM for phonemization and Kokoro-82M (q8, 92MB) for synthesis.
- *
- * Pipeline: Text → Normalize → Phonemize (eSpeak WASM) → Tokenize → Voice Style → ONNX → Audio
+ * Kokoro TTS type definitions
  *
  * @category Inference
- *
- * @example Basic usage
- * ```typescript
- * import { KokoroTTSInference } from '@omote/core';
- *
- * const tts = new KokoroTTSInference({ defaultVoice: 'af_heart' });
- * await tts.load();
- *
- * const { audio, duration } = await tts.synthesize("Hello world");
- * // audio: Float32Array @ 24kHz
- * ```
- *
- * @example Streaming (sentence-by-sentence)
- * ```typescript
- * for await (const chunk of tts.stream("First sentence. Second sentence.")) {
- *   playbackPipeline.feedBuffer(chunk.audio);
- * }
- * ```
- *
- * @module inference/KokoroTTSInference
  */
 interface KokoroTTSConfig {
@@ -2796,6 +2230,8 @@ interface KokoroTTSConfig {
     backend?: BackendPreference;
     /** Speech speed multiplier (default: 1.0) */
     speed?: number;
+    /** Eagerly load phonemizer + default voice during load() instead of first speak(). Default: true. */
+    eagerLoad?: boolean;
 }
 interface KokoroTTSResult {
     /** Audio samples at 24kHz */
@@ -2834,67 +2270,6 @@ interface SynthesizeOptions {
  * Returns trimmed text on success, throws on invalid input.
  */
 declare function validateTTSInput(text: unknown, voiceName: string, speed: number, availableVoices?: string[]): string;
-declare class KokoroTTSInference implements TTSBackend {
-    private readonly config;
-    private readonly modelUrl;
-    private readonly voiceBaseUrl;
-    private ort;
-    private session;
-    private _backend;
-    private isLoading;
-    private poisoned;
-    private inferenceQueue;
-    private phonemizerReady;
-    private defaultVoiceLoaded;
-    /** Cached voice data (voice name → Float32Array) */
-    private loadedVoices;
-    constructor(config?: KokoroTTSConfig);
-    get isLoaded(): boolean;
-    get sampleRate(): number;
-    /**
-     * Load the ONNX model, phonemizer WASM, and default voice.
-     * Safe to call multiple times (no-ops after first successful load).
-     */
-    load(): Promise<KokoroTTSModelInfo>;
-    /**
-     * Lazily initialize phonemizer and default voice on first use.
-     * Moves 100-200ms of main-thread blocking out of load() into first synthesis.
-     */
-    private ensureReady;
-    /**
-     * Synthesize speech from text (one-shot, full audio output).
-     *
-     * @param text - Input text to synthesize
-     * @param options - Voice and speed overrides
-     * @returns Audio Float32Array at 24kHz with duration
-     */
-    synthesize(text: string, options?: SynthesizeOptions): Promise<KokoroTTSResult>;
-    /**
-     * Stream synthesis sentence-by-sentence (async generator).
-     * Splits text on sentence boundaries and yields audio for each.
-     *
-     * Compatible with both `SynthesizeOptions` (legacy) and `TTSStreamOptions` (TTSBackend).
-     *
-     * @param text - Input text (can be multiple sentences)
-     * @param options - Voice, speed, and abort signal overrides
-     */
-    stream(text: string, options?: SynthesizeOptions & TTSStreamOptions): AsyncGenerator<KokoroStreamChunk & TTSChunk>;
-    /**
-     * Preload a voice (fetches and caches the .bin file).
-     */
-    preloadVoice(voiceName: string): Promise<void>;
-    /**
-     * List available voice names.
-     */
-    listVoices(): string[];
-    /**
-     * Release the ONNX session and clear cached voices.
-     */
-    dispose(): Promise<void>;
-    private ensureVoice;
-    private queueInference;
-    private runInference;
-}
 /**
  * Kokoro TTS adapter backed by UnifiedInferenceWorker
@@ -2910,6 +2285,7 @@ declare class KokoroTTSUnifiedAdapter implements TTSBackend {
     private readonly modelUrl;
     private readonly voiceBaseUrl;
     private _isLoaded;
+    private _backend;
     private loadedGeneration;
     /** Per-adapter inference queue — ensures sequential state updates. */
     private inferenceQueue;
@@ -3131,148 +2507,61 @@ declare class SafariSpeechRecognition {
     /**
      * Remove an error callback
      */
-    offError(callback: SpeechErrorCallback): void;
-    /**
-     * Start listening for speech
-     *
-     * On iOS Safari, this will trigger the microphone permission prompt
-     * if not already granted.
-     */
-    start(): Promise<void>;
-    /**
-     * Stop listening and return the final transcript
-     */
-    stop(): Promise<SpeechRecognitionResult>;
-    /**
-     * Abort recognition without waiting for final result
-     */
-    abort(): void;
-    /**
-     * NOT SUPPORTED: Transcribe audio buffer
-     *
-     * Safari Speech API does not support transcribing pre-recorded audio.
-     * It only works with live microphone input.
-     *
-     * For batch transcription on iOS, use server-side Whisper or a cloud ASR service.
-     *
-     * @throws Error always - this method is not supported
-     */
-    transcribe(_audio: Float32Array): Promise<SpeechRecognitionResult>;
-    /**
-     * Dispose of recognition resources
-     */
-    dispose(): void;
-    /**
-     * Set up event handlers for the recognition instance
-     */
-    private setupEventHandlers;
-    /**
-     * Emit result to all registered callbacks
-     */
-    private emitResult;
-    /**
-     * Emit error to all registered callbacks
-     */
-    private emitError;
-}
-/**
- * Kokoro TTS Web Worker implementation
- *
- * Moves the heavy ONNX `session.run()` to a dedicated Web Worker to prevent
- * main thread blocking (~1-2s per sentence on WASM). Phonemizer, tokenizer,
- * and voice logic stay on the main thread (fast, <10ms combined).
- *
- * Architecture:
- * ```
- * Main Thread (KokoroTTSWorker):        Worker (WORKER_SCRIPT):
- *   stream(text) →
- *     splitSentences(text)
- *     for each sentence:
- *       phonemize(sentence)  → phonemes
- *       tokenize(phonemes)   → tokens
- *       ensureVoice()        → style
- *       postMessage(tokens, style, speed)  ──→  session.run(feeds)
- *       await result                       ←──  postMessage(audio)
- *       yield {audio, text, phonemes, duration}
- * ```
- *
- * @category Inference
- *
- * @example Basic usage
- * ```typescript
- * import { KokoroTTSWorker } from '@omote/core';
- *
- * const tts = new KokoroTTSWorker({ defaultVoice: 'af_heart' });
- * await tts.load();
- *
- * for await (const chunk of tts.stream("Hello world!")) {
- *   playbackPipeline.feedBuffer(chunk.audio);
- * }
- * ```
- *
- * @module inference/KokoroTTSWorker
- */
-/**
- * Kokoro TTS Worker — off-main-thread ONNX inference for non-blocking TTS.
- *
- * Phonemizer/tokenizer/voice logic run on the main thread (fast, <10ms).
- * Only the heavy ONNX `session.run()` is delegated to the worker.
- *
- * Implements the same TTSBackend interface as KokoroTTSInference.
- *
- * @see KokoroTTSInference for main-thread version
- */
-declare class KokoroTTSWorker implements TTSBackend {
-    private readonly config;
-    private readonly modelUrl;
-    private readonly voiceBaseUrl;
-    private worker;
-    private _isLoaded;
-    private isLoading;
-    private poisoned;
-    /** Serializes all worker calls (stream sentence chunks + synthesize) */
-    private inferenceQueue;
-    /** Cached voice data (voice name → Float32Array) */
-    private loadedVoices;
-    /** Pending message handlers */
-    private pendingResolvers;
-    constructor(config?: KokoroTTSConfig);
-    get isLoaded(): boolean;
-    get sampleRate(): number;
-    load(): Promise<KokoroTTSModelInfo>;
-    synthesize(text: string, options?: SynthesizeOptions): Promise<KokoroTTSResult>;
-    stream(text: string, options?: SynthesizeOptions & TTSStreamOptions): AsyncGenerator<KokoroStreamChunk & TTSChunk>;
-    preloadVoice(voiceName: string): Promise<void>;
-    listVoices(): string[];
-    dispose(): Promise<void>;
-    static isSupported(): boolean;
-    private ensureVoice;
-    private createWorker;
-    private handleWorkerMessage;
-    private sendMessage;
+    offError(callback: SpeechErrorCallback): void;
     /**
-     * Queue worker inference through the serialization queue.
-     * Sends pre-computed tokens + style to worker, returns audio.
+     * Start listening for speech
+     *
+     * On iOS Safari, this will trigger the microphone permission prompt
+     * if not already granted.
      */
-    private runWorkerInference;
+    start(): Promise<void>;
+    /**
+     * Stop listening and return the final transcript
+     */
+    stop(): Promise<SpeechRecognitionResult>;
+    /**
+     * Abort recognition without waiting for final result
+     */
+    abort(): void;
+    /**
+     * NOT SUPPORTED: Transcribe audio buffer
+     *
+     * Safari Speech API does not support transcribing pre-recorded audio.
+     * It only works with live microphone input.
+     *
+     * For batch transcription on iOS, use server-side Whisper or a cloud ASR service.
+     *
+     * @throws Error always - this method is not supported
+     */
+    transcribe(_audio: Float32Array): Promise<SpeechRecognitionResult>;
+    /**
+     * Dispose of recognition resources
+     */
+    dispose(): void;
+    /**
+     * Set up event handlers for the recognition instance
+     */
+    private setupEventHandlers;
+    /**
+     * Emit result to all registered callbacks
+     */
+    private emitResult;
     /**
-     * One-shot synthesis (phonemize + tokenize + worker inference).
+     * Emit error to all registered callbacks
      */
-    private queueInference;
+    private emitError;
 }
 /**
- * Factory function for Kokoro TTS with automatic Worker vs main thread selection
+ * Factory function for Kokoro TTS via UnifiedInferenceWorker
  *
- * Provides a unified API that automatically selects the optimal implementation:
- * - Desktop: Uses KokoroTTSWorker (off-main-thread inference, no render hitching)
- * - iOS: Uses KokoroTTSInference (main thread, shared ORT instance to avoid OOM)
+ * When called without a `unifiedWorker`, a dedicated worker is created
+ * automatically on the first `load()` call. Pass a shared worker when using
+ * VoiceOrchestrator or multiple models to avoid extra WASM instances.
  *
  * @category Inference
  *
- * @example Auto-detect (recommended)
+ * @example Standalone (auto-creates worker)
  * ```typescript
  * import { createKokoroTTS } from '@omote/core';
  *
@@ -3284,14 +2573,9 @@ declare class KokoroTTSWorker implements TTSBackend {
  * }
  * ```
  *
- * @example Force worker
+ * @example With shared worker
  * ```typescript
- * const tts = createKokoroTTS({ defaultVoice: 'af_heart', useWorker: true });
- * ```
- *
- * @example Force main thread
- * ```typescript
- * const tts = createKokoroTTS({ defaultVoice: 'af_heart', useWorker: false });
+ * const tts = createKokoroTTS({ defaultVoice: 'af_heart', unifiedWorker: worker });
  * ```
  */
@@ -3301,10 +2585,12 @@ declare class KokoroTTSWorker implements TTSBackend {
 interface CreateKokoroTTSConfig extends KokoroTTSConfig, InferenceFactoryConfig {
 }
 /**
- * Create a Kokoro TTS instance with automatic implementation selection.
+ * Create a Kokoro TTS instance via the unified worker.
+ *
+ * If no `unifiedWorker` is provided, a dedicated worker is created on load().
  *
  * @param config - Factory configuration
- * @returns A TTSBackend instance (either Worker or main thread)
+ * @returns A TTSBackend instance
  */
 declare function createKokoroTTS(config?: CreateKokoroTTSConfig): TTSBackend;
@@ -3353,7 +2639,7 @@ declare function listVoices(): string[];
  * ElevenLabs TTS Backend — Cloud text-to-speech via ElevenLabs REST API.
  *
  * Implements the TTSBackend interface so it can be used anywhere Kokoro TTS is used
- * (TTSPlayback, TTSSpeaker, VoicePipeline, PlaybackPipeline, etc.)
+ * (TTSPlayback, TTSSpeaker, VoiceOrchestrator, PlaybackPipeline, etc.)
  *
  * Zero external dependencies — uses fetch() directly.
  *
@@ -3431,141 +2717,6 @@ declare class ElevenLabsTTSBackend implements TTSBackend {
     private getHttpErrorMessage;
 }
-/**
- * AWS Polly TTS Backend — Cloud text-to-speech via consumer-provided AWS SDK call.
- *
- * Implements the TTSBackend interface. Keeps @omote/core free of AWS SDK dependencies
- * by delegating the actual Polly API call to a consumer-provided function.
- *
- * @category Inference
- *
- * @example Basic usage with AWS SDK v3
- * ```typescript
- * import { PollyTTSBackend } from '@omote/core';
- * import { PollyClient, SynthesizeSpeechCommand } from '@aws-sdk/client-polly';
- *
- * const polly = new PollyClient({ region: 'us-east-1' });
- *
- * const tts = new PollyTTSBackend({
- *   synthesizeFn: async (text, voice, sampleRate) => {
- *     const cmd = new SynthesizeSpeechCommand({
- *       Text: text,
- *       VoiceId: voice,
- *       Engine: 'neural',
- *       OutputFormat: 'pcm',
- *       SampleRate: String(sampleRate),
- *     });
- *     const result = await polly.send(cmd);
- *     const stream = result.AudioStream;
- *     // Convert stream to ArrayBuffer (Node or browser)
- *     const chunks: Uint8Array[] = [];
- *     for await (const chunk of stream as AsyncIterable<Uint8Array>) {
- *       chunks.push(chunk);
- *     }
- *     const totalLength = chunks.reduce((sum, c) => sum + c.length, 0);
- *     const merged = new Uint8Array(totalLength);
- *     let offset = 0;
- *     for (const chunk of chunks) {
- *       merged.set(chunk, offset);
- *       offset += chunk.length;
- *     }
- *     return {
- *       audio: merged.buffer,
- *       contentType: result.ContentType ?? 'audio/pcm',
- *     };
- *   },
- * });
- *
- * await tts.load();
- * for await (const chunk of tts.stream("Hello world!")) {
- *   playbackPipeline.feedBuffer(chunk.audio);
- * }
- * ```
- */
-/**
- * Result from the consumer-provided synthesize function.
- */
-interface PollySynthesizeResult {
-    /** Raw PCM audio bytes (Int16 LE) */
-    audio: ArrayBuffer;
-    /** Content type from Polly response (e.g., 'audio/pcm') */
-    contentType: string;
-}
-/**
- * Configuration for PollyTTSBackend.
- *
- * The `synthesizeFn` callback lets consumers use their own AWS SDK setup
- * (credentials, region, SDK version) without @omote/core depending on `@aws-sdk/client-polly`.
- */
-interface PollyConfig {
-    /**
-     * Consumer-provided function that calls AWS Polly.
-     * Must return PCM audio (Int16 LE) at the requested sample rate.
-     *
-     * @param text - Text to synthesize
-     * @param voice - Polly voice ID (e.g., 'Joanna')
-     * @param sampleRate - Requested output sample rate (e.g., 16000)
-     * @returns PCM audio buffer and content type
-     */
-    synthesizeFn: (text: string, voice: string, sampleRate: number) => Promise<PollySynthesizeResult>;
-    /** Polly voice ID (default: 'Joanna') */
-    voice?: string;
-    /** Output sample rate in Hz (default: 16000) */
-    sampleRate?: number;
-    /** Polly engine type (default: 'neural') */
-    engine?: 'neural' | 'standard' | 'generative' | 'long-form';
-}
-declare class PollyTTSBackend implements TTSBackend {
-    private readonly synthesizeFn;
-    private readonly voice;
-    private readonly _sampleRate;
-    private readonly engine;
-    private _isLoaded;
-    constructor(config: PollyConfig);
-    get sampleRate(): number;
-    get isLoaded(): boolean;
-    /**
-     * No-op for cloud TTS (no model to load).
-     * Marks backend as ready.
-     */
-    load(): Promise<void>;
-    /**
-     * Synthesize audio via consumer's Polly function.
-     *
-     * Polly's SynthesizeSpeech is request/response (not streaming for PCM),
-     * so this yields a single chunk per call. For long text, consider splitting
-     * into sentences on the consumer side.
-     */
-    stream(text: string, options?: TTSStreamOptions): AsyncGenerator<TTSChunk>;
-    dispose(): Promise<void>;
-}
-/**
- * ORT CDN configuration
- *
- * Allows consumers to override the CDN base URL used for loading
- * ONNX Runtime WASM/WebGPU binaries. By default, ORT loads from
- * its bundled CDN path. Use {@link configureOrtCdn} to point at
- * a self-hosted or enterprise CDN.
- *
- * @category Inference
- */
-/**
- * Override the CDN base URL for ONNX Runtime WASM/WebGPU binaries.
- *
- * Must be an HTTPS URL or a relative path (starts with `/` or `./`).
- * Call this once at app startup, before loading any models.
- *
- * @param cdnPath - HTTPS URL or relative path to ORT binaries directory
- * @throws If cdnPath is not HTTPS or a relative path
- */
-declare function configureOrtCdn(cdnPath: string): void;
-/**
- * Get the current ORT CDN base URL override, or null if using defaults.
- */
-declare function getOrtCdnBase(): string | null;
 /**
  * Emotion - Helper for creating emotion vectors for avatar animation
  *
@@ -4111,7 +3262,7 @@ declare const MetricNames: {
     readonly CACHE_QUOTA_WARNING: "omote.cache.quota_warning";
     /** Counter: Cache eviction (LRU) */
     readonly CACHE_EVICTION: "omote.cache.eviction";
-    /** Histogram: VoicePipeline turn latency (speech end → transcript ready, excludes playback) */
+    /** Histogram: Voice turn latency (speech end → transcript ready, excludes playback) */
     readonly VOICE_TURN_LATENCY: "omote.voice.turn.latency";
     /** Histogram: ASR transcription latency in ms */
     readonly VOICE_TRANSCRIPTION_LATENCY: "omote.voice.transcription.latency";
@@ -4959,7 +4110,7 @@ declare class ProceduralLifeLayer {
      */
     update(delta: number, input?: LifeLayerInput): LifeLayerOutput;
     /**
-     * Write life layer output directly to a Float32Array[52] in LAM_BLENDSHAPES order.
+     * Write life layer output directly to a Float32Array[52] in ARKIT_BLENDSHAPES order.
      *
      * Includes micro-jitter (0.4% amplitude simplex noise on all channels) to
      * break uncanny stillness on undriven channels.
@@ -5294,7 +4445,7 @@ declare class FaceCompositor {
     /**
      * Compose a single output frame from the 5-stage signal chain.
      *
-     * @param base - A2E raw output (Float32Array[52], LAM_BLENDSHAPES order)
+     * @param base - A2E raw output (Float32Array[52], ARKIT_BLENDSHAPES order)
      * @param input - Per-frame input (deltaTime, emotion, life layer params)
      * @param target - Optional pre-allocated output buffer (avoids per-frame allocation).
      *   When omitted, an internal buffer is used (valid until next compose() call).
@@ -5576,216 +4727,6 @@ declare class MicLipSync extends EventEmitter<MicLipSyncEvents> {
     private setState;
 }
-/**
- * VoicePipeline - Full conversational agent loop
- *
- * Composes: MicrophoneCapture → SileroVAD → SenseVoice ASR → PlaybackPipeline (A2E)
- *
- * State machine: idle → loading → ready → listening → thinking → speaking → listening → ...
- *
- * The consumer provides an `onResponse` callback that receives transcribed text
- * and streams audio back for playback + lip sync. VoicePipeline is backend-agnostic.
- *
- * @category Orchestration
- */
-/** Shared config options for all VoicePipeline modes */
-interface VoicePipelineBaseConfig {
-    /** Pre-built backends — skip internal factory creation. Takes precedence over `models`. */
-    backends?: {
-        asr: SenseVoiceBackend;
-        lam: A2EBackend;
-        vad: SileroVADBackend;
-        tts?: TTSBackend;
-    };
-    /** External unified worker (reuse across pipelines). Takes precedence over internal creation. */
-    unifiedWorker?: UnifiedInferenceWorker;
-    /** URLs and options for model loading. Required if `backends` not provided. */
-    models?: {
-        senseVoice: {
-            modelUrl: string;
-            tokensUrl?: string;
-            language?: string;
-        };
-        lam: {
-            modelUrl: string;
-            externalDataUrl?: string | false;
-            backend?: 'auto' | 'webgpu' | 'wasm';
-        };
-        vad: {
-            modelUrl: string;
-            threshold?: number;
-            preSpeechBufferChunks?: number;
-        };
-    };
-    /** Per-character expression weight scaling */
-    profile?: ExpressionProfile;
-    /** Identity/style index for A2E model (default: 0) */
-    identityIndex?: number;
-    /** Base silence timeout in ms (default: 500) */
-    silenceTimeoutMs?: number;
-    /** Extended silence timeout for long utterances (default: 700) */
-    silenceTimeoutExtendedMs?: number;
-    /** Enable adaptive timeout based on speech duration (default: true) */
-    adaptiveTimeout?: boolean;
-    /** Minimum audio duration in seconds (default: 0.3) */
-    minAudioDurationSec?: number;
-    /** Minimum audio energy (default: 0.02) */
-    minAudioEnergy?: number;
-    /** Enable audio normalization for quiet audio (default: true) */
-    normalizeAudio?: boolean;
-    /** Progressive transcription interval — desktop (default: 500ms) */
-    progressiveIntervalMs?: number;
-    /** Progressive transcription interval — iOS (default: 800ms) */
-    progressiveIntervalIosMs?: number;
-    /** Coverage threshold to use progressive result (default: 0.8) */
-    progressiveCoverageThreshold?: number;
-    /** Minimum samples before progressive transcription starts (default: 8000) */
-    progressiveMinSamples?: number;
-    /** Timeout for individual SenseVoice.transcribe() calls (default: 10000ms) */
-    transcriptionTimeoutMs?: number;
-    /** Enable barge-in detection (default: true) */
-    interruptionEnabled?: boolean;
-    /** Minimum speech duration for interruption (default: 200ms) */
-    interruptionMinSpeechMs?: number;
-    /** Audio playback delay (default: auto-detected) */
-    audioDelayMs?: number;
-    /** Coalescer target duration (default: 200ms) */
-    chunkTargetMs?: number;
-    /** Enable neutral transition on playback complete (default: true) */
-    neutralTransitionEnabled?: boolean;
-    /** Duration of neutral fade-out (default: 250ms) */
-    neutralTransitionMs?: number;
-}
-/** Cloud TTS mode: consumer handles response + audio streaming */
-interface VoicePipelineCloudConfig extends VoicePipelineBaseConfig {
-    mode: 'cloud';
-    /** Consumer's response handler (streams audio back) */
-    onResponse: ResponseHandler;
-}
-/** Local TTS mode: SDK handles synthesis internally via TTSBackend */
-interface VoicePipelineLocalConfig extends VoicePipelineBaseConfig {
-    mode: 'local';
-    /**
-     * TTS backend (e.g., KokoroTTSInference). Provide either `tts` or `ttsConfig`.
-     *
-     * When `tts` is provided, VoicePipeline uses it as-is. On iOS, this means
-     * inference runs on the main thread (may cause UI freezes).
-     *
-     * Prefer `ttsConfig` for automatic unified worker integration on iOS.
-     */
-    tts?: TTSBackend;
-    /**
-     * Kokoro TTS configuration. When provided, VoicePipeline creates the TTS
-     * internally and passes the unified worker on iOS for off-main-thread inference.
-     *
-     * Takes precedence over `tts` if both are provided.
-     */
-    ttsConfig?: {
-        defaultVoice?: string;
-        speed?: number;
-        modelUrl?: string;
-        voiceBaseUrl?: string;
-    };
-    /** Optional text transform (e.g., LLM call). Receives transcript, returns response text. */
-    onTranscript?: (text: string) => string | Promise<string>;
-}
-type VoicePipelineConfig = VoicePipelineCloudConfig | VoicePipelineLocalConfig;
-interface VoicePipelineEvents {
-    'state': VoicePipelineState;
-    'loading:progress': LoadingProgress;
-    'transcript': TranscriptResult;
-    'frame': FullFaceFrame;
-    'frame:raw': Float32Array;
-    'speech:start': void;
-    'speech:end': {
-        durationMs: number;
-    };
-    'playback:start': {
-        time: number;
-    };
-    'playback:complete': void;
-    'interruption': void;
-    'audio:level': {
-        rms: number;
-        peak: number;
-    };
-    'error': Error;
-    [key: string]: unknown;
-}
-declare class VoicePipeline extends EventEmitter<VoicePipelineEvents> {
-    private readonly config;
-    private readonly isLocalMode;
-    private _state;
-    private stopped;
-    private epoch;
-    private _sessionId;
-    private asr;
-    private lam;
-    private vad;
-    private unifiedWorker;
-    private playback;
-    private interruption;
-    private omoteEvents;
-    private mic;
-    private static readonly MAX_AUDIO_BUFFER_SAMPLES;
-    private audioBuffer;
-    private audioBufferSamples;
-    private speechStartTime;
-    private silenceTimer;
-    private isSpeaking;
-    private progressiveTimer;
-    private progressivePromise;
-    private lastProgressiveResult;
-    private lastProgressiveSamples;
-    private asrErrorCount;
-    private progressiveErrorCount;
-    private responseAbortController;
-    private _unsubChunk;
-    private _unsubLevel;
-    private _currentFrame;
-    /** Current pipeline state */
-    get state(): VoicePipelineState;
-    /** Latest blendshape frame */
-    get currentFrame(): Float32Array | null;
-    /** Whether user is currently speaking */
-    get isSpeechActive(): boolean;
-    /** Session ID (generated on start(), null before) */
-    get sessionId(): string | null;
-    constructor(config: VoicePipelineConfig);
-    loadModels(): Promise<void>;
-    /**
-     * Load from pre-built backends (dependency injection path).
-     * Loads any backends that aren't loaded yet.
-     */
-    private loadFromBackends;
-    /**
-     * Load from factories (original path). Loads SenseVoice, LAM, and VAD in parallel.
-     */
-    private loadFromFactories;
-    start(): Promise<void>;
-    stop(): void;
-    setProfile(profile: ExpressionProfile): void;
-    dispose(): Promise<void>;
-    private processAudioChunk;
-    private getSilenceTimeout;
-    private onSilenceDetected;
-    private processEndOfSpeech;
-    private callResponseHandler;
-    /** Cloud mode: delegate to consumer's onResponse handler */
-    private handleCloudResponse;
-    /** Local mode: synthesize text with TTSBackend, stream to PlaybackPipeline */
-    private handleLocalResponse;
-    private handleInterruption;
-    private startProgressiveTranscription;
-    private stopProgressiveTranscription;
-    private transcribeWithTimeout;
-    private normalizeAudio;
-    private setState;
-    private emitProgress;
-    private clearSilenceTimer;
-}
 /**
  * VoiceOrchestrator — Shared voice wiring for OmoteAvatar adapters.
  *
@@ -5803,6 +4744,11 @@ interface VoiceOrchestratorBaseConfig {
     listener?: SpeechListenerConfig;
     interruptionEnabled?: boolean;
     profile?: ExpressionProfile;
+    onStateChange?: (state: ConversationalState) => void;
+    onLoadingProgress?: (progress: LoadingProgress) => void;
+    onError?: (error: Error) => void;
+    onTranscriptEvent?: (result: TranscriptResult) => void;
+    onInterruption?: () => void;
 }
 interface VoiceOrchestratorLocalConfig extends VoiceOrchestratorBaseConfig {
     mode?: 'local';
@@ -5816,12 +4762,23 @@ interface VoiceOrchestratorCloudConfig extends VoiceOrchestratorBaseConfig {
     lam?: {
         modelUrl?: string;
         externalDataUrl?: string | false;
+        unifiedWorker?: UnifiedInferenceWorker;
     };
+    identityIndex?: number;
+    neutralTransitionEnabled?: boolean;
 }
 type VoiceOrchestratorConfig = VoiceOrchestratorLocalConfig | VoiceOrchestratorCloudConfig;
 interface VoiceOrchestratorEvents {
     'state': ConversationalState;
     'transcript': TranscriptResult;
+    'interruption': void;
+    'loading:progress': LoadingProgress;
+    'error': Error;
+    'audio:level': {
+        rms: number;
+        peak: number;
+    };
+    'playback:complete': void;
     [key: string]: unknown;
 }
 declare class VoiceOrchestrator extends EventEmitter<VoiceOrchestratorEvents> {
@@ -5830,6 +4787,8 @@ declare class VoiceOrchestrator extends EventEmitter<VoiceOrchestratorEvents> {
     private ttsSpeaker;
     private playbackPipeline;
     private ownedLam;
+    private ownedWorker;
+    private usesSharedWorker;
     private transcriptUnsub;
     private audioChunkUnsub;
     private connectEpoch;
@@ -5853,10 +4812,14 @@ declare class VoiceOrchestrator extends EventEmitter<VoiceOrchestratorEvents> {
     speak(text: string, options?: {
         signal?: AbortSignal;
         voice?: string;
+        speed?: number;
+        language?: string;
     }): Promise<void>;
     streamText(options?: {
         signal?: AbortSignal;
         voice?: string;
+        speed?: number;
+        language?: string;
     }): Promise<{
         push: (token: string) => void;
         end: () => Promise<void>;
@@ -5868,4 +4831,4 @@ declare class VoiceOrchestrator extends EventEmitter<VoiceOrchestratorEvents> {
     private setState;
 }
-export { type A2EBackend, A2EInference, type A2EInferenceConfig, type A2EModelInfo, A2EProcessor, type A2EProcessorConfig, type A2EResult, A2EUnifiedAdapter, ALL_AUS, ARKIT_BLENDSHAPES, type AUActivation, AU_TO_ARKIT, type ActiveSpan, type AnimationClip, type AnimationController, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationSource, type AnimationSourceOptions, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, BLENDSHAPE_TO_GROUP, type BackendPreference, type BlendWeight, type BlendshapeGroup, BlendshapeSmoother, type BlendshapeSmootherConfig, type BoneFilterConfig, type CacheConfig, type CacheSpanAttributes, CharacterController, type CharacterControllerConfig, type CharacterProfile, type CharacterUpdateInput, type CharacterUpdateOutput, ConsoleExporter, type ConversationalState, type CreateA2EConfig, type CreateKokoroTTSConfig, type CreateSenseVoiceConfig, type CreateTTSPlayerConfig, DEFAULT_ANIMATION_CONFIG, DEFAULT_BONE_FILTER, DEFAULT_MODEL_URLS, EMOTION_NAMES, EMOTION_TO_AU, EMOTION_VECTOR_SIZE, EXPLICIT_EMOTION_COUNT, type ElevenLabsConfig, ElevenLabsTTSBackend, type EmotionAnimationMap, EmotionController, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, EmotionResolver, type EmotionWeights, EmphasisDetector, type ErrorType, ErrorTypes, EventEmitter, type ExpressionProfile, FaceCompositor, type FaceCompositorConfig, type FaceCompositorInput, type FaceCompositorOutput, type FetchWithCacheOptions, type FrameSource, type FullFaceFrame, HF_CDN_URLS, INFERENCE_LATENCY_BUCKETS, type InferenceFactoryConfig, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, KOKORO_VOICES, type KokoroStreamChunk, type KokoroTTSConfig, KokoroTTSInference, type KokoroTTSModelInfo, type 
KokoroTTSResult, KokoroTTSUnifiedAdapter, KokoroTTSWorker, type KokoroVoiceName, LAM_BLENDSHAPES, type LifeLayerConfig, type LifeLayerInput, type LifeLayerOutput, type LoadingProgress, MIXAMO_PREFIX, MODEL_LOAD_TIME_BUCKETS, type MetricData, MetricNames, MicLipSync, type MicLipSyncConfig, type MicLipSyncEvents, type MicLipSyncFrame, type MicLipSyncState, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, type ModelUrlKey, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, PRESERVE_POSITION_BONES, PlaybackPipeline, type PlaybackPipelineConfig, type PlaybackPipelineEvents, type PlaybackState, type PollyConfig, type PollySynthesizeResult, PollyTTSBackend, ProceduralLifeLayer, type Quat, type QuotaInfo, type ResolvedEmotion, type ResponseHandler, RingBuffer, type RuntimeBackend, type SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SenseVoiceBackend, type SenseVoiceConfig, SenseVoiceInference, type SenseVoiceLanguage, type SenseVoiceModelInfo, type SenseVoiceResult, SenseVoiceUnifiedAdapter, SenseVoiceWorker, type SenseVoiceWorkerConfig, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADUnifiedAdapter, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, SpeechListener, type SpeechListenerConfig, type SpeechListenerEvents, type SpeechListenerState, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, type SynthesizeOptions, type TTSBackend, type TTSChunk, TTSPlayback, type TTSPlaybackConfig, type TTSPlaybackEvents, TTSPlayer, TTSSpeaker, type TTSSpeakerConfig, type TTSStreamOptions, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TrackDescriptor, type TranscriptResult, type Transition, UnifiedInferenceWorker, type VADBackend, type VADModelInfo, type VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, type Vec3, VoiceOrchestrator, 
type VoiceOrchestratorCloudConfig, type VoiceOrchestratorConfig, type VoiceOrchestratorEvents, type VoiceOrchestratorLocalConfig, VoicePipeline, type VoicePipelineCloudConfig, type VoicePipelineConfig, type VoicePipelineEvents, type VoicePipelineLocalConfig, type VoicePipelineState, A2EInference as Wav2Vec2Inference, type WorkerHealthState, analyzeTextEmotion, applyProfile, blendEmotions, calculatePeak, calculateRMS, configureCacheLimit, configureModelUrls, configureOrtCdn, configureTelemetry, createA2E, createEmotionVector, createKokoroTTS, createSenseVoice, createSileroVAD, createTTSPlayer, fetchWithCache, float32ToPcm16, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getModelCache, getOptimalWasmThreads, getOrtCdnBase, getRecommendedBackend, getTelemetry, hasWebGPUApi, int16ToFloat32, isAndroid, isIOS, isIOSSafari, isMobile, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpBlendshapes, lerpEmotion, listVoices as listKokoroVoices, parseEmotionTags, pcm16ToFloat32, preloadModels, resampleLinear, resetModelUrls, resolveBackend, resolveEmotion, shouldEnableWasmProxy, shouldKeepTrack, shouldUseNativeASR, shouldUseServerA2E, stripMixamoPrefix, supportsVADWorker, ttsToPlaybackFormat, validateTTSInput };
+export { type A2EBackend, type A2EModelInfo, A2EProcessor, type A2EProcessorConfig, type A2EResult, A2EUnifiedAdapter, ALL_AUS, ARKIT_BLENDSHAPES, type AUActivation, AU_TO_ARKIT, type ActiveSpan, type AnimationClip, type AnimationController, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationSource, type AnimationSourceOptions, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, BLENDSHAPE_TO_GROUP, type BackendPreference, type BlendWeight, type BlendshapeGroup, BlendshapeSmoother, type BlendshapeSmootherConfig, type BoneFilterConfig, type CacheConfig, type CacheSpanAttributes, CharacterController, type CharacterControllerConfig, type CharacterProfile, type CharacterUpdateInput, type CharacterUpdateOutput, ConsoleExporter, type ConversationalState, type CreateA2EConfig, type CreateKokoroTTSConfig, type CreateSenseVoiceConfig, type CreateTTSPlayerConfig, DEFAULT_ANIMATION_CONFIG, DEFAULT_BONE_FILTER, DEFAULT_MODEL_URLS, EMOTION_NAMES, EMOTION_TO_AU, EMOTION_VECTOR_SIZE, EXPLICIT_EMOTION_COUNT, type ElevenLabsConfig, ElevenLabsTTSBackend, type EmotionAnimationMap, EmotionController, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, EmotionResolver, type EmotionWeights, EmphasisDetector, type ErrorType, ErrorTypes, EventEmitter, type ExpressionProfile, FaceCompositor, type FaceCompositorConfig, type FaceCompositorInput, type FaceCompositorOutput, type FetchWithCacheOptions, type FrameSource, type FullFaceFrame, HF_CDN_URLS, INFERENCE_LATENCY_BUCKETS, type InferenceFactoryConfig, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, KOKORO_VOICES, type KokoroStreamChunk, type KokoroTTSConfig, type KokoroTTSModelInfo, type KokoroTTSResult, KokoroTTSUnifiedAdapter, type KokoroVoiceName, 
LAM_BLENDSHAPES, type LifeLayerConfig, type LifeLayerInput, type LifeLayerOutput, type LoadingProgress, MIXAMO_PREFIX, MODEL_LOAD_TIME_BUCKETS, type MetricData, MetricNames, MicLipSync, type MicLipSyncConfig, type MicLipSyncEvents, type MicLipSyncFrame, type MicLipSyncState, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, type ModelUrlKey, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, PRESERVE_POSITION_BONES, PlaybackPipeline, type PlaybackPipelineConfig, type PlaybackPipelineEvents, type PlaybackState, ProceduralLifeLayer, type Quat, type QuotaInfo, type ResolvedEmotion, type ResponseHandler, RingBuffer, type RuntimeBackend, type SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SenseVoiceBackend, type SenseVoiceConfig, type SenseVoiceLanguage, type SenseVoiceModelInfo, type SenseVoiceResult, SenseVoiceUnifiedAdapter, type SenseVoiceWorkerConfig, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADUnifiedAdapter, type SpanAttributes, type SpanData, type SpeechErrorCallback, SpeechListener, type SpeechListenerConfig, type SpeechListenerEvents, type SpeechListenerState, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, type SynthesizeOptions, type TTSBackend, type TTSChunk, TTSPlayback, type TTSPlaybackConfig, type TTSPlaybackEvents, TTSPlayer, TTSSpeaker, type TTSSpeakerConfig, type TTSStreamOptions, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TrackDescriptor, type TranscriptResult, type Transition, UnifiedInferenceWorker, type VADBackend, type VADModelInfo, type VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, type Vec3, VoiceOrchestrator, type VoiceOrchestratorCloudConfig, type VoiceOrchestratorConfig, type VoiceOrchestratorEvents, type VoiceOrchestratorLocalConfig, type VoicePipelineState, type WorkerHealthState, analyzeTextEmotion, applyProfile, 
blendEmotions, calculatePeak, calculateRMS, configureCacheLimit, configureModelUrls, configureTelemetry, createA2E, createEmotionVector, createKokoroTTS, createSenseVoice, createSileroVAD, createTTSPlayer, fetchWithCache, float32ToPcm16, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getModelCache, getOptimalWasmThreads, getRecommendedBackend, getTelemetry, hasWebGPUApi, int16ToFloat32, isAndroid, isIOS, isIOSSafari, isMobile, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpBlendshapes, lerpEmotion, listVoices as listKokoroVoices, parseEmotionTags, pcm16ToFloat32, preloadModels, resampleLinear, resetModelUrls, resolveBackend, resolveEmotion, shouldEnableWasmProxy, shouldKeepTrack, shouldUseNativeASR, shouldUseServerA2E, stripMixamoPrefix, ttsToPlaybackFormat, validateTTSInput };