@omote/core 0.7.1 → 0.9.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -1,6 +1,6 @@
1
1
  import { EventEmitter, OmoteEvents } from './events/index.js';
2
2
  export { AnimationEvent, BackendEvent, EmotionEvent, GazeEvent, STTFinalEvent, STTPartialEvent, SessionStateEvent, TTSEndEvent, TTSMarkEvent, TTSStartEvent, VisemeEvent } from './events/index.js';
3
- export { D as DEFAULT_LOGGING_CONFIG, I as ILogger, a as LOG_LEVEL_PRIORITY, b as LogEntry, L as LogFormatter, c as LogLevel, d as LogSink, e as LoggingConfig, g as configureLogging, h as createLogger, i as getLoggingConfig, n as noopLogger, r as resetLoggingConfig, s as setLogLevel, k as setLoggingEnabled } from './Logger-DSoGAYJu.js';
3
+ export { C as Clock, D as DEFAULT_LOGGING_CONFIG, E as ErrorCode, a as ErrorCodes, I as ILogger, b as LOG_LEVEL_PRIORITY, c as LogEntry, L as LogFormatter, d as LogLevel, e as LogSink, f as LoggingConfig, h as configureClock, i as configureLogging, j as createLogger, l as getClock, m as getLoggingConfig, o as noopLogger, r as resetLoggingConfig, s as setLogLevel, p as setLoggingEnabled } from './ErrorCodes-AX3ADZri.js';
4
4
  export { ARKitToFLAMEMapping, ApiError, AudioChunkEvent, AvatarFormat, Character, CharacterAvatar, CharacterMemory, CharacterPersonality, CharacterSpec, CharacterVoice, CreateCharacterRequest, CreateCharacterResponse, CreateLAMJobRequest, CreateLAMJobResponse, CreateSessionRequest, CreateSessionResponse, GSplatConfig, LAMJob, LAMJobStatus, PROTOCOL_VERSION, PaginatedResponse, PlatformSession, ErrorEvent as ProtocolErrorEvent, ProtocolEvent, ResponseChunkEvent, ResponseEndEvent, ResponseStartEvent, SessionMessage, SessionStatus, isProtocolEvent } from '@omote/types';
5
5
 
6
6
  /**
@@ -163,6 +163,8 @@ interface AudioSchedulerOptions {
163
163
  * Default: 0.05 (50ms) for WebGPU, increase to 0.3-0.5 for WASM on iOS.
164
164
  */
165
165
  initialLookaheadSec?: number;
166
+ /** Error callback for critical scheduling issues */
167
+ onError?: (error: Error) => void;
166
168
  }
167
169
  declare class AudioScheduler {
168
170
  private readonly options;
@@ -171,6 +173,8 @@ declare class AudioScheduler {
171
173
  private scheduledSources;
172
174
  private isPlaying;
173
175
  constructor(options?: AudioSchedulerOptions);
176
+ /** Configured sample rate (default: 16000). */
177
+ get sampleRate(): number;
174
178
  /**
175
179
  * Initialize AudioContext with specified sample rate
176
180
  *
@@ -429,19 +433,6 @@ declare function shouldEnableWasmProxy(): boolean;
429
433
  * @returns true if running in Safari on any platform
430
434
  */
431
435
  declare function isSafari(): boolean;
432
- /**
433
- * Recommend using CPU-optimized A2E model (wav2arkit_cpu)
434
- *
435
- * All iOS browsers use WebKit and have tight memory limits — the 192MB fp16
436
- * LAM model causes silent crashes. wav2arkit_cpu uses URL pass-through
437
- * (ORT fetches the 402MB weights directly into WASM, no JS heap copy).
438
- *
439
- * macOS Safari also needs this due to ONNX Runtime JSEP/ASYNCIFY bugs
440
- * that crash WebKit's JIT compiler.
441
- *
442
- * @returns true if iOS (any browser) or Safari (any platform)
443
- */
444
- declare function shouldUseCpuA2E(): boolean;
445
436
  /**
446
437
  * Check if Web Speech API is available in the browser
447
438
  *
@@ -479,9 +470,8 @@ declare function shouldUseServerA2E(): boolean;
479
470
  /**
480
471
  * Common interface for audio-to-expression (A2E) inference backends
481
472
  *
482
- * Both Wav2Vec2Inference (GPU, 192MB fp16) and Wav2ArkitCpuInference (CPU, 404MB)
483
- * implement this interface, allowing FullFacePipeline and A2EProcessor to
484
- * work with either model transparently.
473
+ * Implemented by A2EInference and A2EUnifiedAdapter, allowing PlaybackPipeline
474
+ * and A2EProcessor to work with either implementation transparently.
485
475
  *
486
476
  * @category Inference
487
477
  */
@@ -510,15 +500,22 @@ interface A2EResult {
510
500
  inferenceTimeMs: number;
511
501
  }
512
502
  /**
513
- * Common interface for A2E (audio-to-expression) inference engines
503
+ * Common interface for A2E (audio-to-expression) inference engines.
504
+ *
505
+ * A2E is the SDK term for audio-to-expression inference. The underlying model
506
+ * is called **LAM** (Large Animation Model). "A2E" and "LAM" refer to the same
507
+ * pipeline — A2E is the interface abstraction, LAM is the model.
514
508
  *
515
509
  * Implemented by:
516
- * - Wav2Vec2Inference (WebGPU/WASM, 192MB fp16, A2E)
517
- * - Wav2ArkitCpuInference (WASM-only, 404MB, A2E only)
510
+ * - {@link A2EInference} (WebGPU/WASM, 192MB fp16)
511
+ * - A2EUnifiedAdapter (shared unified worker)
512
+ *
513
+ * @see {@link A2EInference} for direct usage
514
+ * @see {@link createA2E} for the recommended factory API
518
515
  */
519
516
  interface A2EBackend {
520
- /** Model identifier for backend-specific tuning (e.g. audio delay) */
521
- readonly modelId: 'wav2vec2' | 'wav2arkit_cpu';
517
+ /** Model identifier */
518
+ readonly modelId: 'a2e';
522
519
  /** Current backend type (webgpu, wasm, or null if not loaded) */
523
520
  readonly backend: RuntimeBackend | null;
524
521
  /** Whether the model is loaded and ready for inference */
@@ -590,7 +587,7 @@ declare const BLENDSHAPE_TO_GROUP: Map<string, BlendshapeGroup>;
590
587
  * 2. Otherwise, use the group scaler (default 1.0)
591
588
  * 3. Clamp result to [0, 1]
592
589
  */
593
- declare function applyProfile(raw: Float32Array, profile: ExpressionProfile): Float32Array;
590
+ declare function applyProfile(raw: Float32Array, profile: ExpressionProfile, out?: Float32Array): Float32Array;
594
591
 
595
592
  /**
596
593
  * PlaybackPipeline - Audio playback + A2E lip sync with ExpressionProfile scaling
@@ -616,7 +613,7 @@ interface PlaybackPipelineConfig {
616
613
  audioDelayMs?: number;
617
614
  /** A2E inference chunk size in samples (default: 16000) */
618
615
  chunkSize?: number;
619
- /** Identity/style index for Wav2Vec2 (default: 0) */
616
+ /** Identity/style index for A2E model (default: 0) */
620
617
  identityIndex?: number;
621
618
  /** Per-character expression weight scaling */
622
619
  profile?: ExpressionProfile;
@@ -637,6 +634,8 @@ interface FullFaceFrame {
637
634
  rawBlendshapes: Float32Array;
638
635
  /** AudioContext timestamp for this frame */
639
636
  timestamp: number;
637
+ /** Emotion label for this frame (from SenseVoice, text heuristics, or LLM tags) */
638
+ emotion?: string;
640
639
  }
641
640
  interface PlaybackPipelineEvents {
642
641
  /** New frame ready for display (scaled by ExpressionProfile) */
@@ -655,10 +654,6 @@ interface PlaybackPipelineEvents {
655
654
  'error': Error;
656
655
  /** State changed */
657
656
  'state': PlaybackState;
658
- 'full_frame_ready': FullFaceFrame;
659
- 'lam_frame_ready': Float32Array;
660
- 'playback_complete': void;
661
- 'playback_start': number;
662
657
  [key: string]: unknown;
663
658
  }
664
659
  declare class PlaybackPipeline extends EventEmitter<PlaybackPipelineEvents> {
@@ -676,6 +671,7 @@ declare class PlaybackPipeline extends EventEmitter<PlaybackPipelineEvents> {
676
671
  private staleWarningEmitted;
677
672
  private readonly staleThresholdMs;
678
673
  private frameLoopCount;
674
+ private sessionStartTime;
679
675
  private profile;
680
676
  private readonly neutralTransitionEnabled;
681
677
  private readonly neutralTransitionMs;
@@ -684,6 +680,8 @@ declare class PlaybackPipeline extends EventEmitter<PlaybackPipelineEvents> {
684
680
  private neutralAnimationId;
685
681
  private _currentFrame;
686
682
  private _currentRawFrame;
683
+ private _emotion;
684
+ private readonly _profileBuffer;
687
685
  /** Current pipeline state */
688
686
  get state(): PlaybackState;
689
687
  /** Current scaled blendshapes (updated in-place for perf) */
@@ -695,6 +693,8 @@ declare class PlaybackPipeline extends EventEmitter<PlaybackPipelineEvents> {
695
693
  initialize(): Promise<void>;
696
694
  /** Update ExpressionProfile at runtime */
697
695
  setProfile(profile: ExpressionProfile): void;
696
+ /** Set the emotion label to include in emitted frames */
697
+ setEmotion(emotion: string | null): void;
698
698
  /**
699
699
  * Start a new playback session.
700
700
  * Idempotent — calling during playback resets cleanly without emitting
@@ -733,201 +733,6 @@ declare class PlaybackPipeline extends EventEmitter<PlaybackPipelineEvents> {
733
733
  private setState;
734
734
  }
735
735
 
736
- /**
737
- * FullFacePipeline - A2E expression pipeline with ExpressionProfile weight scaling
738
- *
739
- * Orchestrates full-face animation by:
740
- * 1. Scheduling audio for playback immediately (audio-first, never waits for A2E)
741
- * 2. Running A2E inference in background (fire-and-forget via A2EProcessor)
742
- * 3. Applying per-character ExpressionProfile scaling to raw A2E output
743
- *
744
- * The A2E model outputs all 52 ARKit blendshapes from audio — brows, eyes, cheeks,
745
- * mouth, jaw, everything. ExpressionProfile allows per-character weight scaling
746
- * by group (eyes, brows, jaw, mouth, cheeks, nose, tongue) with per-blendshape overrides.
747
- *
748
- * @deprecated Use {@link PlaybackPipeline} from `@omote/core` instead. PlaybackPipeline
749
- * is a superset with sync mode (`feedBuffer`), state tracking, and opt-in neutral transition.
750
- * FullFacePipeline will continue to work but is no longer actively developed.
751
- *
752
- * @category Audio
753
- *
754
- * @example Basic usage
755
- * ```typescript
756
- * import { FullFacePipeline } from '@omote/core';
757
- *
758
- * const pipeline = new FullFacePipeline({
759
- * lam,
760
- * profile: { mouth: 1.2, brows: 0.8 },
761
- * });
762
- * await pipeline.initialize();
763
- *
764
- * pipeline.on('full_frame_ready', (frame) => {
765
- * applyToAvatar(frame.blendshapes);
766
- * });
767
- *
768
- * pipeline.start();
769
- * await pipeline.onAudioChunk(audioData);
770
- * ```
771
- */
772
-
773
- /**
774
- * Configuration for FullFacePipeline
775
- */
776
- interface FullFacePipelineOptions {
777
- /** Sample rate in Hz (default: 16000) */
778
- sampleRate?: number;
779
- /** Target chunk duration in ms for coalescing (default: 200) */
780
- chunkTargetMs?: number;
781
- /**
782
- * Audio playback delay in ms before first audio plays.
783
- * Gives A2E inference time to pre-compute blendshapes before audio
784
- * starts, preventing frame drops/desync. Must be ≥ chunkSize
785
- * accumulation time + inference latency.
786
- *
787
- * Default: auto-calculated from chunkSize and backend type.
788
- */
789
- audioDelayMs?: number;
790
- /**
791
- * A2E inference chunk size in samples.
792
- * Controls how many samples accumulate before each inference call.
793
- * Smaller = lower latency (less delay before first frame), more overhead.
794
- * Larger = higher latency, less overhead.
795
- *
796
- * Default: 16000 (1s) — the model's native window size.
797
- * Smaller chunks get zero-padded, causing near-zero blendshape output.
798
- */
799
- chunkSize?: number;
800
- /** A2E inference engine */
801
- lam: A2EBackend;
802
- /**
803
- * Identity/style index for the A2E model (default: 0).
804
- *
805
- * The LAM model uses a 12-class one-hot identity vector as style conditioning.
806
- * Different indices produce different expression intensity across face regions.
807
- * Only affects Wav2Vec2Inference (GPU). Wav2ArkitCpuInference has identity 11 baked in.
808
- */
809
- identityIndex?: number;
810
- /** Per-character expression weight scaling */
811
- profile?: ExpressionProfile;
812
- /**
813
- * Time in ms with no new inference frames before logging a stale warning.
814
- *
815
- * Must be larger than the inter-batch gap (chunkSize/sampleRate + inference time).
816
- * Default: 2000
817
- */
818
- staleThresholdMs?: number;
819
- }
820
- /**
821
- * Events emitted by FullFacePipeline
822
- */
823
- interface FullFacePipelineEvents {
824
- /** New merged frame ready for display */
825
- full_frame_ready: FullFaceFrame;
826
- /** Raw LAM frame ready (for debugging/monitoring) */
827
- lam_frame_ready: Float32Array;
828
- /** Playback has completed */
829
- playback_complete: void;
830
- /** First frame ready, playback starting */
831
- playback_start: number;
832
- /** Error occurred */
833
- error: Error;
834
- /** Index signature for EventEmitter compatibility */
835
- [key: string]: unknown;
836
- }
837
- /**
838
- * FullFacePipeline - A2E animation pipeline with ExpressionProfile scaling
839
- *
840
- * Audio-first design matching SyncedAudioPipeline:
841
- * - Audio is scheduled immediately (never waits for A2E)
842
- * - A2E runs in background (fire-and-forget via A2EProcessor)
843
- * - ExpressionProfile scales raw A2E output per-character
844
- */
845
- declare class FullFacePipeline extends EventEmitter<FullFacePipelineEvents> {
846
- private readonly options;
847
- private scheduler;
848
- private coalescer;
849
- private processor;
850
- private playbackStarted;
851
- private monitorInterval;
852
- private frameAnimationId;
853
- private lastNewFrameTime;
854
- private lastKnownLamFrame;
855
- private staleWarningEmitted;
856
- private readonly staleThresholdMs;
857
- private frameLoopCount;
858
- private profile;
859
- constructor(options: FullFacePipelineOptions);
860
- /**
861
- * Initialize the pipeline
862
- */
863
- initialize(): Promise<void>;
864
- /**
865
- * Update the ExpressionProfile at runtime (e.g., character switch).
866
- */
867
- setProfile(profile: ExpressionProfile): void;
868
- /**
869
- * Apply ExpressionProfile scaling to raw A2E blendshapes.
870
- *
871
- * Delegates to the standalone applyProfile() utility from expressionProfile.ts.
872
- */
873
- applyProfile(raw: Float32Array): Float32Array;
874
- /**
875
- * Start a new playback session
876
- *
877
- * Resets all state and prepares for incoming audio chunks.
878
- * Audio will be scheduled immediately as chunks arrive (no buffering).
879
- */
880
- start(): void;
881
- /**
882
- * Receive audio chunk from network
883
- *
884
- * Audio-first design: schedules audio immediately, A2E runs in background.
885
- * This prevents A2E inference (50-300ms) from blocking audio scheduling.
886
- *
887
- * @param chunk - Uint8Array containing Int16 PCM audio
888
- */
889
- onAudioChunk(chunk: Uint8Array): Promise<void>;
890
- /**
891
- * Start frame animation loop
892
- *
893
- * Polls A2EProcessor at render rate (60fps) for the latest inference frame
894
- * matching the current AudioContext time. Between inference batches (~30fps
895
- * bursts), getFrameForTime() holds the last frame.
896
- */
897
- private startFrameLoop;
898
- /**
899
- * End of audio stream
900
- */
901
- end(): Promise<void>;
902
- /**
903
- * Stop playback immediately with smooth fade-out
904
- */
905
- stop(fadeOutMs?: number): Promise<void>;
906
- /**
907
- * Start monitoring for playback completion
908
- */
909
- private startMonitoring;
910
- /**
911
- * Stop monitoring
912
- */
913
- private stopMonitoring;
914
- /**
915
- * Get current pipeline state (for debugging/monitoring)
916
- */
917
- getState(): {
918
- playbackStarted: boolean;
919
- coalescerFill: number;
920
- processorFill: number;
921
- queuedFrames: number;
922
- currentTime: number;
923
- playbackEndTime: number;
924
- };
925
- /**
926
- * Cleanup resources
927
- */
928
- dispose(): void;
929
- }
930
-
931
736
  /**
932
737
  * TTSBackend — Streaming text-to-speech backend interface.
933
738
  *
@@ -1007,7 +812,7 @@ interface TTSPlaybackConfig {
1007
812
  profile?: ExpressionProfile;
1008
813
  /** Prefetch next sentence while current plays. Default: true */
1009
814
  prefetch?: boolean;
1010
- /** Identity/style index for Wav2Vec2 (default: 0) */
815
+ /** Identity/style index for A2E model (default: 0) */
1011
816
  identityIndex?: number;
1012
817
  /** Audio playback delay in ms */
1013
818
  audioDelayMs?: number;
@@ -1027,6 +832,8 @@ interface TTSPlaybackEvents {
1027
832
  };
1028
833
  /** Playback completed */
1029
834
  'playback:complete': void;
835
+ /** Playback stopped (user-initiated) */
836
+ 'playback:stop': void;
1030
837
  /** Error */
1031
838
  'error': Error;
1032
839
  [key: string]: unknown;
@@ -1056,87 +863,6 @@ declare class TTSPlayback extends EventEmitter<TTSPlaybackEvents> {
1056
863
  private speakSequential;
1057
864
  }
1058
865
 
1059
- /**
1060
- * Interruption Handler
1061
- *
1062
- * VAD-based barge-in detection for AI conversations:
1063
- * - Monitors VAD probability for user speech
1064
- * - Detects when user interrupts AI response
1065
- * - Triggers interruption callbacks
1066
- */
1067
-
1068
- interface InterruptionEvents {
1069
- [key: string]: unknown;
1070
- 'speech.detected': {
1071
- rms: number;
1072
- };
1073
- 'speech.ended': {
1074
- durationMs: number;
1075
- };
1076
- 'interruption.triggered': {
1077
- rms: number;
1078
- durationMs: number;
1079
- };
1080
- }
1081
- /**
1082
- * Interruption handler configuration
1083
- *
1084
- * Industry standards applied:
1085
- * - vadThreshold: 0.5 (Silero VAD default)
1086
- * - minSpeechDurationMs: 200ms (Google/Amazon barge-in standard)
1087
- * - silenceTimeoutMs: 500ms (OpenAI Realtime API standard)
1088
- */
1089
- interface InterruptionConfig {
1090
- /** VAD probability threshold for speech detection (default: 0.5, Silero standard) */
1091
- vadThreshold?: number;
1092
- /** Minimum speech duration to trigger interruption (default: 200ms, Google/Amazon standard) */
1093
- minSpeechDurationMs?: number;
1094
- /** Silence duration to end speech (default: 500ms, OpenAI standard) */
1095
- silenceTimeoutMs?: number;
1096
- /** Enable interruption detection (default: true) */
1097
- enabled?: boolean;
1098
- }
1099
- declare class InterruptionHandler extends EventEmitter<InterruptionEvents> {
1100
- private config;
1101
- private isSpeaking;
1102
- private speechStartTime;
1103
- private lastSpeechTime;
1104
- private silenceTimer;
1105
- private aiIsSpeaking;
1106
- private interruptionTriggeredThisSession;
1107
- constructor(config?: InterruptionConfig);
1108
- /**
1109
- * Process raw audio energy for interruption detection (no VAD required).
1110
- * Used during speaking state when the unified worker is busy with TTS.
1111
- * Echo-cancelled mic input means energy above threshold = user speech.
1112
- *
1113
- * @param rms - RMS energy of audio chunk (0-1)
1114
- * @param energyThreshold - Minimum energy to consider speech (default: 0.02)
1115
- */
1116
- processAudioEnergy(rms: number, energyThreshold?: number): void;
1117
- /**
1118
- * Process VAD result for interruption detection
1119
- * @param vadProbability - Speech probability from VAD (0-1)
1120
- * @param audioEnergy - Optional RMS energy for logging (default: 0)
1121
- */
1122
- processVADResult(vadProbability: number, audioEnergy?: number): void;
1123
- /** Notify that AI started/stopped speaking */
1124
- setAISpeaking(speaking: boolean): void;
1125
- /** Enable/disable interruption detection */
1126
- setEnabled(enabled: boolean): void;
1127
- /** Update configuration */
1128
- updateConfig(config: Partial<InterruptionConfig>): void;
1129
- /** Reset state */
1130
- reset(): void;
1131
- /** Get current state */
1132
- getState(): {
1133
- isSpeaking: boolean;
1134
- speechDurationMs: number;
1135
- };
1136
- private onSpeechDetected;
1137
- private onSilenceDetected;
1138
- }
1139
-
1140
866
  /**
1141
867
  * Lazy ONNX Runtime loader with conditional WebGPU/WASM bundle loading
1142
868
  *
@@ -1240,6 +966,8 @@ declare class SenseVoiceInference {
1240
966
  private inferenceQueue;
1241
967
  private poisoned;
1242
968
  private static readonly INFERENCE_TIMEOUT_MS;
969
+ private lastLfrFrames;
970
+ private webgpuShapeWarned;
1243
971
  private tokenMap;
1244
972
  private negMean;
1245
973
  private invStddev;
@@ -1261,157 +989,47 @@ declare class SenseVoiceInference {
1261
989
  }
1262
990
 
1263
991
  /**
1264
- * SenseVoice ASR Web Worker implementation
992
+ * Silero VAD (Voice Activity Detection) inference
1265
993
  *
1266
- * Runs SenseVoice speech recognition in a dedicated Web Worker to prevent
1267
- * main thread blocking. Uses inline worker script (Blob URL pattern) to
1268
- * avoid separate file deployment.
994
+ * Neural network-based VAD running in browser via ONNX Runtime Web.
995
+ * Much more accurate than RMS-based energy detection.
1269
996
  *
1270
- * Key design decisions:
1271
- * - WASM backend only (WebGPU doesn't work in Workers)
1272
- * - All preprocessing (fbank, LFR, CMVN) and CTC decoding inlined in worker
1273
- * - Audio copied (not transferred) to retain main thread access
1274
- * - ONNX Runtime loaded from CDN in worker (no bundler complications)
1275
- * - iOS: model URL passed as string to ORT (avoids 239MB JS heap allocation)
997
+ * Uses lazy loading to conditionally load WebGPU or WASM-only bundle:
998
+ * - iOS: Loads WASM-only bundle (WebGPU crashes due to Safari bugs)
999
+ * - Android/Desktop: Loads WebGPU bundle (with WASM fallback)
1276
1000
  *
1277
1001
  * @category Inference
1278
1002
  *
1279
1003
  * @example Basic usage
1280
1004
  * ```typescript
1281
- * import { SenseVoiceWorker } from '@omote/core';
1005
+ * import { SileroVADInference } from '@omote/core';
1282
1006
  *
1283
- * const asr = new SenseVoiceWorker({
1284
- * modelUrl: '/models/sensevoice/model.int8.onnx',
1285
- * tokensUrl: '/models/sensevoice/tokens.txt',
1007
+ * const vad = new SileroVADInference({
1008
+ * modelUrl: '/models/silero-vad.onnx'
1286
1009
  * });
1287
- * await asr.load();
1010
+ * await vad.load();
1288
1011
  *
1289
- * const { text, emotion, language } = await asr.transcribe(audioSamples);
1290
- * console.log(text); // "Hello world"
1291
- * console.log(emotion); // "NEUTRAL"
1292
- * console.log(language); // "en"
1012
+ * // Process 32ms chunks (512 samples at 16kHz)
1013
+ * const probability = await vad.process(audioChunk);
1014
+ * if (probability > 0.5) {
1015
+ * console.log('Speech detected!');
1016
+ * }
1017
+ * ```
1018
+ *
1019
+ * @example Streaming with state management
1020
+ * ```typescript
1021
+ * // State is automatically maintained between process() calls
1022
+ * // Call reset() when starting a new audio stream
1023
+ * vad.reset();
1024
+ *
1025
+ * for (const chunk of audioChunks) {
1026
+ * const prob = await vad.process(chunk);
1027
+ * // prob is speech probability [0, 1]
1028
+ * }
1293
1029
  * ```
1294
1030
  */
1295
1031
 
1296
- /**
1297
- * Configuration for SenseVoice Worker
1298
- */
1299
- interface SenseVoiceWorkerConfig {
1300
- /** Path or URL to model.int8.onnx (239MB) */
1301
- modelUrl: string;
1302
- /** Path or URL to tokens.txt vocabulary file (default: sibling of modelUrl) */
1303
- tokensUrl?: string;
1304
- /** Language hint (default: 'auto' for auto-detection) */
1305
- language?: SenseVoiceLanguage;
1306
- /** Text normalization: 'with_itn' applies inverse text normalization (default: 'with_itn') */
1307
- textNorm?: 'with_itn' | 'without_itn';
1308
- }
1309
- /**
1310
- * SenseVoice ASR Worker - Speech Recognition in a Web Worker
1311
- *
1312
- * Runs SenseVoice inference off the main thread to prevent UI blocking.
1313
- * All preprocessing (fbank, LFR, CMVN) and CTC decoding run in the worker.
1314
- *
1315
- * @see SenseVoiceInference for main-thread version
1316
- */
1317
- declare class SenseVoiceWorker {
1318
- private worker;
1319
- private config;
1320
- private isLoading;
1321
- private _isLoaded;
1322
- private inferenceQueue;
1323
- private poisoned;
1324
- private pendingResolvers;
1325
- private languageId;
1326
- private textNormId;
1327
- constructor(config: SenseVoiceWorkerConfig);
1328
- get isLoaded(): boolean;
1329
- /**
1330
- * Backend type (always 'wasm' for Worker, WebGPU not supported in Workers)
1331
- */
1332
- get backend(): 'wasm' | null;
1333
- /**
1334
- * Create the worker from inline script
1335
- */
1336
- private createWorker;
1337
- /**
1338
- * Handle messages from worker
1339
- */
1340
- private handleWorkerMessage;
1341
- /**
1342
- * Send message to worker and wait for response
1343
- */
1344
- private sendMessage;
1345
- /**
1346
- * Load the ONNX model in the worker
1347
- *
1348
- * @param onProgress - Optional progress callback. Fires once at 100% when load completes
1349
- * (the worker downloads and loads the model internally, so granular progress is not available).
1350
- */
1351
- load(onProgress?: (loaded: number, total: number) => void): Promise<SenseVoiceModelInfo>;
1352
- /**
1353
- * Transcribe audio samples to text
1354
- *
1355
- * @param audioSamples Float32Array of audio samples at 16kHz, [-1, 1] range
1356
- * @returns Transcription result with text, emotion, language, and event
1357
- */
1358
- transcribe(audioSamples: Float32Array): Promise<SenseVoiceResult>;
1359
- /**
1360
- * Queue inference to serialize worker calls
1361
- */
1362
- private queueInference;
1363
- /**
1364
- * Dispose of the worker and free resources
1365
- */
1366
- dispose(): Promise<void>;
1367
- /**
1368
- * Check if Web Workers are supported
1369
- */
1370
- static isSupported(): boolean;
1371
- }
1372
-
1373
- /**
1374
- * Silero VAD (Voice Activity Detection) inference
1375
- *
1376
- * Neural network-based VAD running in browser via ONNX Runtime Web.
1377
- * Much more accurate than RMS-based energy detection.
1378
- *
1379
- * Uses lazy loading to conditionally load WebGPU or WASM-only bundle:
1380
- * - iOS: Loads WASM-only bundle (WebGPU crashes due to Safari bugs)
1381
- * - Android/Desktop: Loads WebGPU bundle (with WASM fallback)
1382
- *
1383
- * @category Inference
1384
- *
1385
- * @example Basic usage
1386
- * ```typescript
1387
- * import { SileroVADInference } from '@omote/core';
1388
- *
1389
- * const vad = new SileroVADInference({
1390
- * modelUrl: '/models/silero-vad.onnx'
1391
- * });
1392
- * await vad.load();
1393
- *
1394
- * // Process 32ms chunks (512 samples at 16kHz)
1395
- * const probability = await vad.process(audioChunk);
1396
- * if (probability > 0.5) {
1397
- * console.log('Speech detected!');
1398
- * }
1399
- * ```
1400
- *
1401
- * @example Streaming with state management
1402
- * ```typescript
1403
- * // State is automatically maintained between process() calls
1404
- * // Call reset() when starting a new audio stream
1405
- * vad.reset();
1406
- *
1407
- * for (const chunk of audioChunks) {
1408
- * const prob = await vad.process(chunk);
1409
- * // prob is speech probability [0, 1]
1410
- * }
1411
- * ```
1412
- */
1413
-
1414
- type VADBackend = BackendPreference;
1032
+ type VADBackend = BackendPreference;
1415
1033
  /**
1416
1034
  * Configuration for Silero VAD
1417
1035
  */
@@ -1705,15 +1323,16 @@ declare class SileroVADWorker {
1705
1323
  }
1706
1324
 
1707
1325
  /**
1708
- * Unified Inference Worker — single Web Worker hosting all WASM models
1326
+ * Unified Inference Worker — single Web Worker hosting all ONNX models
1709
1327
  *
1710
- * Solves the multi-worker ORT problem: three per-model workers each load their
1711
- * own ORT WASM instance (~40MB each). On iOS this exceeds the ~1-1.5GB tab
1712
- * limit, forcing main-thread fallback which blocks the render loop.
1328
+ * Runs all model loading and inference off the main thread, preventing
1329
+ * InferenceSession.create() from blocking the renderer (5-30s).
1713
1330
  *
1714
- * This worker hosts SenseVoice + Wav2ArkitCpu + Silero VAD in a single
1715
- * ORT WASM instance. Same total model memory (~643MB), but inference runs
1716
- * off-main-thread. Works on iOS because there's only one ORT instance.
1331
+ * Uses WebGPU when available (Chrome/Edge 113+), falls back to WASM.
1332
+ * On iOS, uses a single WASM instance to stay within the ~1-1.5GB tab limit.
1333
+ *
1334
+ * This worker hosts SenseVoice + A2E + Silero VAD + Kokoro TTS in a single
1335
+ * ORT instance. Same total model memory, but inference runs off-main-thread.
1717
1336
  *
1718
1337
  * Consumer usage:
1719
1338
  * ```typescript
@@ -1721,7 +1340,7 @@ declare class SileroVADWorker {
1721
1340
  * await worker.init();
1722
1341
  *
1723
1342
  * const asr = createSenseVoice({ modelUrl: '...', unifiedWorker: worker });
1724
- * const lam = createA2E({ gpuModelUrl: '...', cpuModelUrl: '...', unifiedWorker: worker });
1343
+ * const lam = createA2E({ modelUrl: '...', unifiedWorker: worker });
1725
1344
  * const vad = createSileroVAD({ modelUrl: '...', unifiedWorker: worker });
1726
1345
  * ```
1727
1346
  *
@@ -1731,10 +1350,11 @@ declare class SileroVADWorker {
1731
1350
  /** Health state of the unified worker */
1732
1351
  type WorkerHealthState = 'healthy' | 'unhealthy' | 'recovering';
1733
1352
  /**
1734
- * Unified Inference Worker — single Web Worker for all WASM models
1353
+ * Unified Inference Worker — single Web Worker for all ONNX models
1735
1354
  *
1736
- * Hosts SenseVoice, Wav2ArkitCpu, and Silero VAD in one ORT instance.
1737
- * Eliminates the multi-worker memory problem on iOS.
1355
+ * Hosts SenseVoice, A2E (LAM), Kokoro TTS, and Silero VAD in one ORT instance.
1356
+ * Uses WebGPU on Chrome/Edge 113+, falls back to WASM on Safari/iOS/Firefox.
1357
+ * All model loading and inference runs off the main thread.
1738
1358
  */
1739
1359
  declare class UnifiedInferenceWorker {
1740
1360
  private worker;
@@ -1744,6 +1364,7 @@ declare class UnifiedInferenceWorker {
1744
1364
  private consecutiveFailures;
1745
1365
  private _generation;
1746
1366
  private recovering;
1367
+ private _workerBackend;
1747
1368
  /**
1748
1369
  * Initialize the worker (load ORT WASM from CDN)
1749
1370
  */
@@ -1756,17 +1377,6 @@ declare class UnifiedInferenceWorker {
1756
1377
  }): Promise<SenseVoiceModelInfo>;
1757
1378
  transcribe(audio: Float32Array): Promise<SenseVoiceResult>;
1758
1379
  disposeSenseVoice(): Promise<void>;
1759
- loadA2E(config: {
1760
- modelUrl: string;
1761
- externalDataUrl: string | null;
1762
- }): Promise<A2EModelInfo>;
1763
- inferA2E(audio: Float32Array): Promise<{
1764
- blendshapes: Float32Array;
1765
- numFrames: number;
1766
- numBlendshapes: number;
1767
- inferenceTimeMs: number;
1768
- }>;
1769
- disposeA2E(): Promise<void>;
1770
1380
  loadLAM(config: {
1771
1381
  modelUrl: string;
1772
1382
  externalDataUrl: string | null;
@@ -1807,6 +1417,8 @@ declare class UnifiedInferenceWorker {
1807
1417
  get health(): WorkerHealthState;
1808
1418
  /** Generation counter — increments on worker recovery. Adapters compare to detect stale sessions. */
1809
1419
  get workerGeneration(): number;
1420
+ /** The ORT backend the worker is using ('webgpu' on Chrome/Edge, 'wasm' on Safari/iOS/Firefox) */
1421
+ get backend(): 'wasm' | 'webgpu';
1810
1422
  /** Check if Web Workers are supported */
1811
1423
  static isSupported(): boolean;
1812
1424
  private assertReady;
@@ -1852,143 +1464,865 @@ interface InferenceFactoryConfig {
1852
1464
  }
1853
1465
 
1854
1466
  /**
1855
- * Factory function for SenseVoice ASR with automatic Worker vs main thread selection
1467
+ * Factory function for A2E inference
1856
1468
  *
1857
- * Provides a unified API that automatically selects the optimal implementation:
1858
- * - Worker supported: Uses SenseVoiceWorker (off-main-thread inference)
1859
- * - Worker unsupported: Uses SenseVoiceInference (main thread)
1469
+ * Creates an A2EBackend instance with zero-config defaults (HuggingFace CDN).
1470
+ * Supports unified worker mode for iOS off-main-thread inference.
1860
1471
  *
1861
1472
  * @category Inference
1862
1473
  *
1863
- * @example Auto-detect (recommended)
1474
+ * @example Auto-detect (recommended, zero-config)
1864
1475
  * ```typescript
1865
- * import { createSenseVoice } from '@omote/core';
1866
- *
1867
- * const asr = createSenseVoice({
1868
- * modelUrl: '/models/sensevoice/model.int8.onnx',
1869
- * });
1870
- * await asr.load();
1871
- * const { text, emotion } = await asr.transcribe(audioSamples);
1872
- * ```
1476
+ * import { createA2E } from '@omote/core';
1873
1477
  *
1874
- * @example Force worker
1875
- * ```typescript
1876
- * const asr = createSenseVoice({
1877
- * modelUrl: '/models/sensevoice/model.int8.onnx',
1878
- * useWorker: true,
1879
- * });
1478
+ * const a2e = createA2E(); // uses HF CDN defaults (192MB fp16)
1479
+ * await a2e.load();
1480
+ * const { blendshapes } = await a2e.infer(audioSamples);
1880
1481
  * ```
1881
1482
  *
1882
- * @example Force main thread
1483
+ * @example Custom model URL
1883
1484
  * ```typescript
1884
- * const asr = createSenseVoice({
1885
- * modelUrl: '/models/sensevoice/model.int8.onnx',
1886
- * useWorker: false,
1887
- * });
1485
+ * const a2e = createA2E({ modelUrl: '/models/lam.onnx' });
1888
1486
  * ```
1889
1487
  */
1890
1488
 
1891
1489
  /**
1892
- * Common interface for both SenseVoiceInference and SenseVoiceWorker
1490
+ * Configuration for the A2E factory
1893
1491
  */
1894
- interface SenseVoiceBackend {
1895
- /** Whether the model is loaded and ready for inference */
1896
- readonly isLoaded: boolean;
1897
- /** Current backend type ('wasm' for worker, full backend for main thread, null if not loaded) */
1898
- readonly backend: 'wasm' | 'webgpu' | null;
1899
- /**
1900
- * Load the ONNX model
1901
- * @param onProgress - Optional progress callback (fires once at 100% for worker)
1902
- * @returns Model loading information
1903
- */
1904
- load(onProgress?: (loaded: number, total: number) => void): Promise<SenseVoiceModelInfo>;
1905
- /**
1906
- * Transcribe audio samples to text
1907
- * @param audioSamples - Float32Array of audio samples at 16kHz
1908
- * @returns Transcription result
1909
- */
1910
- transcribe(audioSamples: Float32Array): Promise<SenseVoiceResult>;
1492
+ interface CreateA2EConfig extends InferenceFactoryConfig {
1493
+ /** URL for the ONNX model. Default: HuggingFace CDN */
1494
+ modelUrl?: string;
1911
1495
  /**
1912
- * Dispose of the model and free resources
1496
+ * URL for external model data file (.onnx.data weights).
1497
+ * Default: `${modelUrl}.data`
1498
+ *
1499
+ * Set to `false` to skip external data loading (single-file models only).
1913
1500
  */
1914
- dispose(): Promise<void>;
1915
- }
1916
- /**
1917
- * Configuration for the SenseVoice factory
1918
- */
1919
- interface CreateSenseVoiceConfig extends InferenceFactoryConfig {
1920
- /** Path or URL to model.int8.onnx (239MB). Default: HuggingFace CDN */
1921
- modelUrl?: string;
1922
- /** Path or URL to tokens.txt vocabulary file (default: sibling of modelUrl) */
1923
- tokensUrl?: string;
1924
- /** Language hint (default: 'auto') */
1925
- language?: SenseVoiceLanguage;
1926
- /** Text normalization (default: 'with_itn') */
1927
- textNorm?: 'with_itn' | 'without_itn';
1501
+ externalDataUrl?: string | false;
1502
+ /** Backend preference (default: 'auto') */
1503
+ backend?: BackendPreference;
1504
+ /** Number of identity classes (default: 12) */
1505
+ numIdentityClasses?: number;
1928
1506
  }
1929
1507
  /**
1930
- * Create a SenseVoice ASR instance with automatic implementation selection
1508
+ * Create an A2E instance
1931
1509
  *
1932
1510
  * @param config - Factory configuration
1933
- * @returns A SenseVoiceBackend instance (either Worker or main thread)
1511
+ * @returns An A2EBackend instance
1934
1512
  */
1935
- declare function createSenseVoice(config?: CreateSenseVoiceConfig): SenseVoiceBackend;
1513
+ declare function createA2E(config?: CreateA2EConfig): A2EBackend;
1936
1514
 
1937
1515
  /**
1938
- * Shared blendshape constants and utilities for lip sync inference
1939
- *
1940
- * Contains LAM_BLENDSHAPES (canonical ordering), symmetrization, and
1941
- * index remapping used by both Wav2Vec2Inference and Wav2ArkitCpuInference.
1516
+ * Shared types for orchestration layer
1942
1517
  *
1943
- * This module is the single source of truth for blendshape ordering to
1944
- * avoid circular dependencies between inference classes.
1518
+ * @category Orchestration
1519
+ */
1520
+
1521
+ /**
1522
+ * Generic frame source -- any object that emits 'frame' events with blendshapes.
1945
1523
  *
1946
- * @category Inference
1524
+ * Implemented by PlaybackPipeline, MicLipSync, VoicePipeline, and any custom source.
1525
+ * Used by OmoteAvatar (all renderer adapters) to receive animation frames.
1947
1526
  */
1527
+ interface FrameSource {
1528
+ on(event: 'frame', callback: (frame: {
1529
+ blendshapes: Float32Array;
1530
+ emotion?: string;
1531
+ }) => void): void;
1532
+ off?(event: 'frame', callback: (frame: {
1533
+ blendshapes: Float32Array;
1534
+ emotion?: string;
1535
+ }) => void): void;
1536
+ }
1537
+ type VoicePipelineState = 'idle' | 'loading' | 'ready' | 'listening' | 'thinking' | 'speaking' | 'error';
1538
+ interface LoadingProgress {
1539
+ currentModel: string;
1540
+ progress: number;
1541
+ totalModels: number;
1542
+ modelsLoaded: number;
1543
+ }
1544
+ interface TranscriptResult {
1545
+ text: string;
1546
+ emotion?: string;
1547
+ language?: string;
1548
+ event?: string;
1549
+ isFinal: boolean;
1550
+ inferenceTimeMs?: number;
1551
+ }
1948
1552
  /**
1949
- * LAM model blendshape names in order (52 total)
1950
- * NOTE: This is alphabetical ordering used by LAM, different from standard ARKit order
1553
+ * Consumer's response handler. VoicePipeline calls this with transcribed text.
1554
+ * Consumer must stream audio back for playback + lip sync.
1951
1555
  */
1952
- declare const LAM_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
1953
- /** Alias for backwards compatibility */
1954
- declare const ARKIT_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
1556
+ interface ResponseHandler {
1557
+ (params: {
1558
+ text: string;
1559
+ emotion?: string;
1560
+ event?: string;
1561
+ /** Set avatar emotion during response streaming (e.g., from LLM emotion_update messages) */
1562
+ setEmotion?: (emotion: string) => void;
1563
+ /** Stream audio chunks to pipeline for playback + lip sync */
1564
+ send: (chunk: Uint8Array) => Promise<void>;
1565
+ /** Call when all audio has been sent */
1566
+ done: () => Promise<void>;
1567
+ /** Aborted on interruption or stop() */
1568
+ signal: AbortSignal;
1569
+ /** Session ID for backend correlation */
1570
+ sessionId: string;
1571
+ }): Promise<void>;
1572
+ }
1573
+
1955
1574
  /**
1956
- * Linearly interpolate between two blendshape weight arrays.
1575
+ * TTSSpeaker — Shared helper for OmoteAvatar TTS integration.
1957
1576
  *
1958
- * Pure math utility with zero renderer dependency used by all renderer
1959
- * adapters (@omote/three, @omote/babylon, @omote/r3f) for smooth frame
1960
- * transitions.
1577
+ * Encapsulates createA2E + TTSPlayback lifecycle so that renderer adapters
1578
+ * (Three.js, Babylon.js) and the R3F hook can delegate with ~15 lines each.
1961
1579
  *
1962
- * @param current - Current blendshape weights
1963
- * @param target - Target blendshape weights
1580
+ * @category Audio
1581
+ */
1582
+
1583
+ interface TTSSpeakerConfig {
1584
+ /** Per-character expression weight scaling */
1585
+ profile?: ExpressionProfile;
1586
+ /** Identity/style index for A2E model (default: 0) */
1587
+ identityIndex?: number;
1588
+ /** Audio playback delay in ms */
1589
+ audioDelayMs?: number;
1590
+ /** Enable neutral transition on playback complete */
1591
+ neutralTransitionEnabled?: boolean;
1592
+ /** Duration of neutral fade-out in ms */
1593
+ neutralTransitionMs?: number;
1594
+ /** Pre-built A2E backend (skip internal createA2E). */
1595
+ lam?: A2EBackend;
1596
+ /** LAM model config (only when lam not provided) */
1597
+ models?: CreateA2EConfig;
1598
+ /** Shared unified worker (recommended for iOS) */
1599
+ unifiedWorker?: UnifiedInferenceWorker;
1600
+ }
1601
+ declare class TTSSpeaker {
1602
+ private ttsPlayback;
1603
+ private tts;
1604
+ private ownedLam;
1605
+ private ownedWorker;
1606
+ private currentAbort;
1607
+ private _isSpeaking;
1608
+ private _audioOnly;
1609
+ private scheduler;
1610
+ /** Whether the speaker is currently playing audio. */
1611
+ get isSpeaking(): boolean;
1612
+ /** Whether this speaker is in audio-only mode (no lip sync). */
1613
+ get audioOnly(): boolean;
1614
+ /** The internal TTSPlayback (implements FrameSource). Null before connect() is called, and always null in audio-only mode. */
1615
+ get frameSource(): FrameSource | null;
1616
+ /**
1617
+ * Connect a TTS backend.
1618
+ *
1619
+ * When config includes `lam`, `unifiedWorker`, or `models`, the full lip sync
1620
+ * pipeline is created (LAM + TTSPlayback + PlaybackPipeline).
1621
+ *
1622
+ * When config is omitted or has none of those, audio-only mode is used:
1623
+ * TTS → AudioScheduler (speakers only, no blendshapes, no LAM download).
1624
+ *
1625
+ * @param tts - TTS backend to use for speech synthesis
1626
+ * @param config - Optional configuration for A2E, expression profile, etc.
1627
+ */
1628
+ connect(tts: TTSBackend, config?: TTSSpeakerConfig): Promise<void>;
1629
+ /**
1630
+ * Synthesize and play text, with lip sync unless the speaker is in audio-only mode.
1631
+ * Auto-aborts previous speak if still in progress.
1632
+ *
1633
+ * @param text - Text to synthesize and play
1634
+ * @param options - Optional voice override and abort signal
1635
+ */
1636
+ speak(text: string, options?: {
1637
+ signal?: AbortSignal;
1638
+ voice?: string;
1639
+ }): Promise<void>;
1640
+ /** Audio-only speak: TTS → resample → AudioScheduler (no blendshapes). */
1641
+ private speakAudioOnly;
1642
+ /** Poll scheduler until all audio has played. */
1643
+ private waitForSchedulerComplete;
1644
+ /**
1645
+ * Stream text token-by-token with automatic sentence buffering.
1646
+ * Designed for LLM token-by-token output. Sentences are detected at
1647
+ * boundary characters (.!?\n) with a minimum length threshold, then
1648
+ * synthesized and played with lip sync.
1649
+ *
1650
+ * Auto-aborts previous speak/streamText if still in progress.
1651
+ *
1652
+ * @param options - Options object (required); its `voice` override and abort `signal` fields are both optional, so `{}` is valid
1653
+ * @returns Sink with push() and end() methods
1654
+ */
1655
+ streamText(options: {
1656
+ signal?: AbortSignal;
1657
+ voice?: string;
1658
+ }): Promise<{
1659
+ push: (token: string) => void;
1660
+ end: () => Promise<void>;
1661
+ }>;
1662
+ /** streamText in audio-only mode: TTS → AudioScheduler (no blendshapes). */
1663
+ private streamTextAudioOnly;
1664
+ /** Abort current speak if any. */
1665
+ stop(): void;
1666
+ /** Clean teardown of all owned resources. */
1667
+ dispose(): Promise<void>;
1668
+ }
1669
+
1670
+ /**
1671
+ * createTTSPlayer — Zero-config TTS player for audio-only playback.
1672
+ *
1673
+ * Speaks text through speakers without an avatar. No LAM download, no lip sync.
1674
+ *
1675
+ * @example
1676
+ * ```typescript
1677
+ * import { createTTSPlayer } from '@omote/core';
1678
+ *
1679
+ * const player = createTTSPlayer();
1680
+ * await player.load();
1681
+ * await player.speak("Hello world!");
1682
+ *
1683
+ * // Streaming:
1684
+ * const stream = await player.streamText({});
1685
+ * stream.push("Hello ");
1686
+ * stream.push("world!");
1687
+ * await stream.end();
1688
+ * ```
1689
+ *
1690
+ * @category Audio
1691
+ */
1692
+
1693
+ interface CreateTTSPlayerConfig {
1694
+ /** Voice to use (default: 'af_heart') */
1695
+ voice?: string;
1696
+ /** Model URL override */
1697
+ modelUrl?: string;
1698
+ /** Voice data base URL override */
1699
+ voiceBaseUrl?: string;
1700
+ }
1701
+ /**
1702
+ * Zero-config TTS player. Speak text through speakers without an avatar.
1703
+ *
1704
+ * Uses Kokoro TTS (82M q8, ~92MB) with automatic worker selection.
1705
+ * No LAM model is downloaded — audio plays directly through AudioScheduler.
1706
+ */
1707
+ declare function createTTSPlayer(config?: CreateTTSPlayerConfig): TTSPlayer;
1708
+ /**
1709
+ * Thin wrapper: TTSSpeaker in audio-only mode + delegated load().
1710
+ */
1711
+ declare class TTSPlayer extends TTSSpeaker {
1712
+ private backend;
1713
+ constructor(tts: TTSBackend);
1714
+ /** Load TTS model and connect in audio-only mode. */
1715
+ load(): Promise<void>;
1716
+ /** Whether the TTS model is loaded and ready. */
1717
+ get isLoaded(): boolean;
1718
+ }
1719
+
1720
+ /**
1721
+ * Factory function for SenseVoice ASR with automatic Worker vs main thread selection
1722
+ *
1723
+ * Provides a unified API that automatically selects the optimal implementation:
1724
+ * - Worker supported: Uses SenseVoiceWorker (off-main-thread inference)
1725
+ * - Worker unsupported: Uses SenseVoiceInference (main thread)
1726
+ *
1727
+ * @category Inference
1728
+ *
1729
+ * @example Auto-detect (recommended)
1730
+ * ```typescript
1731
+ * import { createSenseVoice } from '@omote/core';
1732
+ *
1733
+ * const asr = createSenseVoice({
1734
+ * modelUrl: '/models/sensevoice/model.int8.onnx',
1735
+ * });
1736
+ * await asr.load();
1737
+ * const { text, emotion } = await asr.transcribe(audioSamples);
1738
+ * ```
1739
+ *
1740
+ * @example Force worker
1741
+ * ```typescript
1742
+ * const asr = createSenseVoice({
1743
+ * modelUrl: '/models/sensevoice/model.int8.onnx',
1744
+ * useWorker: true,
1745
+ * });
1746
+ * ```
1747
+ *
1748
+ * @example Force main thread
1749
+ * ```typescript
1750
+ * const asr = createSenseVoice({
1751
+ * modelUrl: '/models/sensevoice/model.int8.onnx',
1752
+ * useWorker: false,
1753
+ * });
1754
+ * ```
1755
+ */
1756
+
1757
+ /**
1758
+ * Common interface for both SenseVoiceInference and SenseVoiceWorker
1759
+ */
1760
+ interface SenseVoiceBackend {
1761
+ /** Whether the model is loaded and ready for inference */
1762
+ readonly isLoaded: boolean;
1763
+ /** Current backend type ('wasm' for worker, full backend for main thread, null if not loaded) */
1764
+ readonly backend: 'wasm' | 'webgpu' | null;
1765
+ /**
1766
+ * Load the ONNX model
1767
+ * @param onProgress - Optional progress callback (fires once at 100% for worker)
1768
+ * @returns Model loading information
1769
+ */
1770
+ load(onProgress?: (loaded: number, total: number) => void): Promise<SenseVoiceModelInfo>;
1771
+ /**
1772
+ * Transcribe audio samples to text
1773
+ * @param audioSamples - Float32Array of audio samples at 16kHz
1774
+ * @returns Transcription result
1775
+ */
1776
+ transcribe(audioSamples: Float32Array): Promise<SenseVoiceResult>;
1777
+ /**
1778
+ * Dispose of the model and free resources
1779
+ */
1780
+ dispose(): Promise<void>;
1781
+ }
1782
+ /**
1783
+ * Configuration for the SenseVoice factory
1784
+ */
1785
+ interface CreateSenseVoiceConfig extends InferenceFactoryConfig {
1786
+ /** Path or URL to model.int8.onnx (239MB). Default: HuggingFace CDN */
1787
+ modelUrl?: string;
1788
+ /** Path or URL to tokens.txt vocabulary file (default: sibling of modelUrl) */
1789
+ tokensUrl?: string;
1790
+ /** Language hint (default: 'auto') */
1791
+ language?: SenseVoiceLanguage;
1792
+ /** Text normalization (default: 'with_itn') */
1793
+ textNorm?: 'with_itn' | 'without_itn';
1794
+ }
1795
+ /**
1796
+ * Create a SenseVoice ASR instance with automatic implementation selection
1797
+ *
1798
+ * @param config - Factory configuration
1799
+ * @returns A SenseVoiceBackend instance (either Worker or main thread)
1800
+ */
1801
+ declare function createSenseVoice(config?: CreateSenseVoiceConfig): SenseVoiceBackend;
1802
+
1803
+ /**
1804
+ * Factory function for Silero VAD with automatic Worker vs main thread selection
1805
+ *
1806
+ * Provides a unified API that automatically selects the optimal implementation:
1807
+ * - Desktop browsers: Uses SileroVADWorker (off-main-thread inference)
1808
+ * - Mobile devices: Uses SileroVADInference (main thread, avoids memory overhead)
1809
+ * - Fallback: Gracefully falls back to main thread if Worker fails
1810
+ *
1811
+ * @category Inference
1812
+ *
1813
+ * @example Basic usage (auto-detect)
1814
+ * ```typescript
1815
+ * import { createSileroVAD } from '@omote/core';
1816
+ *
1817
+ * const vad = createSileroVAD({
1818
+ * modelUrl: '/models/silero-vad.onnx',
1819
+ * threshold: 0.5,
1820
+ * });
1821
+ *
1822
+ * await vad.load();
1823
+ * const result = await vad.process(audioChunk);
1824
+ * if (result.isSpeech) {
1825
+ * console.log('Speech detected!', result.probability);
1826
+ * }
1827
+ * ```
1828
+ *
1829
+ * @example Force worker usage
1830
+ * ```typescript
1831
+ * const vad = createSileroVAD({
1832
+ * modelUrl: '/models/silero-vad.onnx',
1833
+ * useWorker: true, // Force Worker even on mobile
1834
+ * });
1835
+ * ```
1836
+ *
1837
+ * @example Force main thread
1838
+ * ```typescript
1839
+ * const vad = createSileroVAD({
1840
+ * modelUrl: '/models/silero-vad.onnx',
1841
+ * useWorker: false, // Force main thread
1842
+ * });
1843
+ * ```
1844
+ */
1845
+
1846
+ /**
1847
+ * Common interface for both SileroVADInference and SileroVADWorker
1848
+ *
1849
+ * This interface defines the shared API that both implementations provide,
1850
+ * allowing consumers to use either interchangeably.
1851
+ */
1852
+ interface SileroVADBackend {
1853
+ /** Current backend type (webgpu, wasm, or null if not loaded) */
1854
+ readonly backend: RuntimeBackend | null;
1855
+ /** Whether the model is loaded and ready for inference */
1856
+ readonly isLoaded: boolean;
1857
+ /** Audio sample rate (8000 or 16000 Hz) */
1858
+ readonly sampleRate: number;
1859
+ /** Speech detection threshold (0-1) */
1860
+ readonly threshold: number;
1861
+ /**
1862
+ * Load the ONNX model
1863
+ * @returns Model loading information
1864
+ */
1865
+ load(): Promise<VADModelInfo | VADWorkerModelInfo>;
1866
+ /**
1867
+ * Process a single audio chunk
1868
+ * @param audioChunk - Float32Array of exactly chunkSize samples
1869
+ * @returns VAD result with speech probability
1870
+ */
1871
+ process(audioChunk: Float32Array): Promise<VADResult>;
1872
+ /**
1873
+ * Reset state for new audio stream
1874
+ */
1875
+ reset(): void | Promise<void>;
1876
+ /**
1877
+ * Dispose of the model and free resources
1878
+ */
1879
+ dispose(): Promise<void>;
1880
+ /**
1881
+ * Get required chunk size in samples
1882
+ */
1883
+ getChunkSize(): number;
1884
+ /**
1885
+ * Get chunk duration in milliseconds
1886
+ */
1887
+ getChunkDurationMs(): number;
1888
+ }
1889
+ /**
1890
+ * Configuration for the Silero VAD factory
1891
+ *
1892
+ * Extends SileroVADConfig with worker-specific options.
1893
+ */
1894
+ interface SileroVADFactoryConfig extends Omit<SileroVADConfig, 'modelUrl'>, InferenceFactoryConfig {
1895
+ /** Path or URL to the ONNX model. Default: HuggingFace CDN */
1896
+ modelUrl?: string;
1897
+ /**
1898
+ * Fallback to main thread on worker errors.
1899
+ *
1900
+ * When true (default), if the Worker fails to load or encounters an error,
1901
+ * the factory will automatically create a main thread instance instead.
1902
+ *
1903
+ * When false, worker errors will propagate as exceptions.
1904
+ *
1905
+ * Default: true
1906
+ */
1907
+ fallbackOnError?: boolean;
1908
+ }
1909
+ /**
1910
+ * Check if the current environment supports VAD Web Workers
1911
+ *
1912
+ * Requirements:
1913
+ * - Worker constructor must exist
1914
+ * - Blob URL support (for inline worker script)
1915
+ *
1916
+ * @returns true if VAD Worker is supported
1917
+ */
1918
+ declare function supportsVADWorker(): boolean;
1919
+ /**
1920
+ * Create a Silero VAD instance with automatic implementation selection
1921
+ *
1922
+ * This factory function automatically selects between:
1923
+ * - **SileroVADWorker**: Off-main-thread inference (better for desktop)
1924
+ * - **SileroVADInference**: Main thread inference (better for mobile)
1925
+ *
1926
+ * The selection is based on:
1927
+ * 1. Explicit `useWorker` config (if provided)
1928
+ * 2. Platform detection (mobile vs desktop)
1929
+ * 3. Worker API availability
1930
+ *
1931
+ * Both implementations share the same interface (SileroVADBackend),
1932
+ * so consumers can use either interchangeably.
1933
+ *
1934
+ * @param config - Factory configuration
1935
+ * @returns A SileroVAD instance (either Worker or main thread)
1936
+ *
1937
+ * @example
1938
+ * ```typescript
1939
+ * // Auto-detect (recommended)
1940
+ * const vad = createSileroVAD({ modelUrl: '/models/silero-vad.onnx' });
1941
+ *
1942
+ * // Force Worker
1943
+ * const vadWorker = createSileroVAD({ modelUrl: '/models/silero-vad.onnx', useWorker: true });
1944
+ *
1945
+ * // Force main thread
1946
+ * const vadMain = createSileroVAD({ modelUrl: '/models/silero-vad.onnx', useWorker: false });
1947
+ * ```
1948
+ */
1949
+ declare function createSileroVAD(config?: SileroVADFactoryConfig): SileroVADBackend;
1950
+
1951
+ /**
1952
+ * SpeechListener — Standalone listening primitive.
1953
+ *
1954
+ * Composes: MicrophoneCapture → SileroVAD → SenseVoice ASR → transcript events.
1955
+ * Extracted from VoicePipeline's listening half so it can be used independently.
1956
+ *
1957
+ * Does NOT handle TTS, LAM, or response routing — those belong to TTSSpeaker
1958
+ * and VoicePipeline respectively.
1959
+ *
1960
+ * @category Audio
1961
+ */
1962
+
1963
+ interface SpeechListenerConfig {
1964
+ /** Pre-built backends — skip internal factory creation. */
1965
+ backends?: {
1966
+ asr: SenseVoiceBackend;
1967
+ vad: SileroVADBackend;
1968
+ };
1969
+ /** External unified worker (reuse across pipelines). */
1970
+ unifiedWorker?: UnifiedInferenceWorker;
1971
+ /** URLs and options for model loading (when backends not provided). */
1972
+ models?: {
1973
+ senseVoice: {
1974
+ modelUrl: string;
1975
+ tokensUrl?: string;
1976
+ language?: string;
1977
+ };
1978
+ vad: {
1979
+ modelUrl: string;
1980
+ threshold?: number;
1981
+ preSpeechBufferChunks?: number;
1982
+ };
1983
+ };
1984
+ /** Base silence timeout in ms (default: 500) */
1985
+ silenceTimeoutMs?: number;
1986
+ /** Extended silence timeout for long utterances (default: 700) */
1987
+ silenceTimeoutExtendedMs?: number;
1988
+ /** Enable adaptive timeout based on speech duration (default: true) */
1989
+ adaptiveTimeout?: boolean;
1990
+ /** Minimum audio duration in seconds (default: 0.3) */
1991
+ minAudioDurationSec?: number;
1992
+ /** Minimum audio energy (default: 0.02) */
1993
+ minAudioEnergy?: number;
1994
+ /** Enable audio normalization for quiet audio (default: true) */
1995
+ normalizeAudio?: boolean;
1996
+ /** Progressive transcription interval — desktop (default: 500ms) */
1997
+ progressiveIntervalMs?: number;
1998
+ /** Progressive transcription interval — iOS (default: 800ms) */
1999
+ progressiveIntervalIosMs?: number;
2000
+ /** Coverage threshold to use progressive result (default: 0.8) */
2001
+ progressiveCoverageThreshold?: number;
2002
+ /** Minimum samples before progressive transcription starts (default: 8000) */
2003
+ progressiveMinSamples?: number;
2004
+ /** Timeout for individual transcribe() calls (default: 10000ms) */
2005
+ transcriptionTimeoutMs?: number;
2006
+ }
2007
+ type SpeechListenerState = 'idle' | 'loading' | 'ready' | 'listening' | 'processing' | 'paused';
2008
+ interface SpeechListenerEvents {
2009
+ 'state': SpeechListenerState;
2010
+ 'loading:progress': LoadingProgress;
2011
+ 'transcript': TranscriptResult;
2012
+ 'speech:start': void;
2013
+ 'speech:end': {
2014
+ durationMs: number;
2015
+ };
2016
+ 'audio:level': {
2017
+ rms: number;
2018
+ peak: number;
2019
+ };
2020
+ 'audio:chunk': Float32Array;
2021
+ 'error': Error;
2022
+ [key: string]: unknown;
2023
+ }
2024
+ declare class SpeechListener extends EventEmitter<SpeechListenerEvents> {
2025
+ private readonly config;
2026
+ private _state;
2027
+ private epoch;
2028
+ private asr;
2029
+ private vad;
2030
+ private ownedWorker;
2031
+ private mic;
2032
+ private omoteEvents;
2033
+ private _unsubChunk;
2034
+ private _unsubLevel;
2035
+ private static readonly MAX_AUDIO_BUFFER_SAMPLES;
2036
+ private audioBuffer;
2037
+ private audioBufferSamples;
2038
+ private speechStartTime;
2039
+ private silenceTimer;
2040
+ private isSpeechActive;
2041
+ private progressiveTimer;
2042
+ private progressivePromise;
2043
+ private lastProgressiveResult;
2044
+ private lastProgressiveSamples;
2045
+ private asrErrorCount;
2046
+ private progressiveErrorCount;
2047
+ /** Current listener state */
2048
+ get state(): SpeechListenerState;
2049
+ constructor(config?: SpeechListenerConfig);
2050
+ /**
2051
+ * Load ASR + VAD models. Only loads speech recognition models,
2052
+ * NOT TTS or LAM (those belong to TTSSpeaker).
2053
+ */
2054
+ loadModels(): Promise<void>;
2055
+ /** Start listening — activates mic + VAD. */
2056
+ start(): Promise<void>;
2057
+ /** Stop listening — deactivates mic, clears buffers. */
2058
+ stop(): void;
2059
+ /** Pause VAD/ASR but keep mic active for audio:chunk events (for interruption detection). */
2060
+ pause(): void;
2061
+ /** Resume VAD/ASR from paused state. */
2062
+ resume(): void;
2063
+ /** Dispose all resources. */
2064
+ dispose(): Promise<void>;
2065
+ private processAudioChunk;
2066
+ private getSilenceTimeout;
2067
+ private onSilenceDetected;
2068
+ private processEndOfSpeech;
2069
+ private startProgressiveTranscription;
2070
+ private stopProgressiveTranscription;
2071
+ private transcribeWithTimeout;
2072
+ private normalizeAudio;
2073
+ private setState;
2074
+ private emitProgress;
2075
+ private clearSilenceTimer;
2076
+ }
2077
+
2078
+ /**
2079
+ * Interruption Handler
2080
+ *
2081
+ * VAD-based barge-in detection for AI conversations:
2082
+ * - Monitors VAD probability for user speech
2083
+ * - Detects when user interrupts AI response
2084
+ * - Triggers interruption callbacks
2085
+ */
2086
+
2087
+ interface InterruptionEvents {
2088
+ [key: string]: unknown;
2089
+ 'speech.detected': {
2090
+ rms: number;
2091
+ };
2092
+ 'speech.ended': {
2093
+ durationMs: number;
2094
+ };
2095
+ 'interruption.triggered': {
2096
+ rms: number;
2097
+ durationMs: number;
2098
+ };
2099
+ }
2100
+ /**
2101
+ * Interruption handler configuration
2102
+ *
2103
+ * Industry standards applied:
2104
+ * - vadThreshold: 0.5 (Silero VAD default)
2105
+ * - minSpeechDurationMs: 200ms (Google/Amazon barge-in standard)
2106
+ * - silenceTimeoutMs: 500ms (OpenAI Realtime API standard)
2107
+ */
2108
+ interface InterruptionConfig {
2109
+ /** VAD probability threshold for speech detection (default: 0.5, Silero standard) */
2110
+ vadThreshold?: number;
2111
+ /** Minimum speech duration to trigger interruption (default: 200ms, Google/Amazon standard) */
2112
+ minSpeechDurationMs?: number;
2113
+ /** Silence duration to end speech (default: 500ms, OpenAI standard) */
2114
+ silenceTimeoutMs?: number;
2115
+ /** Enable interruption detection (default: true) */
2116
+ enabled?: boolean;
2117
+ }
2118
+ declare class InterruptionHandler extends EventEmitter<InterruptionEvents> {
2119
+ private config;
2120
+ private isSpeaking;
2121
+ private speechStartTime;
2122
+ private lastSpeechTime;
2123
+ private silenceTimer;
2124
+ private aiIsSpeaking;
2125
+ private interruptionTriggeredThisSession;
2126
+ constructor(config?: InterruptionConfig);
2127
+ /**
2128
+ * Process raw audio energy for interruption detection (no VAD required).
2129
+ * Used during speaking state when the unified worker is busy with TTS.
2130
+ * Echo-cancelled mic input means energy above threshold = user speech.
2131
+ *
2132
+ * @param rms - RMS energy of audio chunk (0-1)
2133
+ * @param energyThreshold - Minimum energy to consider speech (default: 0.02)
2134
+ */
2135
+ processAudioEnergy(rms: number, energyThreshold?: number): void;
2136
+ /**
2137
+ * Process VAD result for interruption detection
2138
+ * @param vadProbability - Speech probability from VAD (0-1)
2139
+ * @param audioEnergy - Optional RMS energy for logging (default: 0)
2140
+ */
2141
+ processVADResult(vadProbability: number, audioEnergy?: number): void;
2142
+ /** Notify that AI started/stopped speaking */
2143
+ setAISpeaking(speaking: boolean): void;
2144
+ /** Enable/disable interruption detection */
2145
+ setEnabled(enabled: boolean): void;
2146
+ /** Update configuration */
2147
+ updateConfig(config: Partial<InterruptionConfig>): void;
2148
+ /** Reset state */
2149
+ reset(): void;
2150
+ /** Get current state */
2151
+ getState(): {
2152
+ isSpeaking: boolean;
2153
+ speechDurationMs: number;
2154
+ };
2155
+ private onSpeechDetected;
2156
+ private onSilenceDetected;
2157
+ }
2158
+
2159
+ /**
2160
+ * SenseVoice ASR Web Worker implementation
2161
+ *
2162
+ * Runs SenseVoice speech recognition in a dedicated Web Worker to prevent
2163
+ * main thread blocking. Uses inline worker script (Blob URL pattern) to
2164
+ * avoid separate file deployment.
2165
+ *
2166
+ * Key design decisions:
2167
+ * - WASM backend only (WebGPU doesn't work in Workers)
2168
+ * - All preprocessing (fbank, LFR, CMVN) and CTC decoding inlined in worker
2169
+ * - Audio copied (not transferred) to retain main thread access
2170
+ * - ONNX Runtime loaded from CDN in worker (no bundler complications)
2171
+ * - iOS: model URL passed as string to ORT (avoids 239MB JS heap allocation)
2172
+ *
2173
+ * @category Inference
2174
+ *
2175
+ * @example Basic usage
2176
+ * ```typescript
2177
+ * import { SenseVoiceWorker } from '@omote/core';
2178
+ *
2179
+ * const asr = new SenseVoiceWorker({
2180
+ * modelUrl: '/models/sensevoice/model.int8.onnx',
2181
+ * tokensUrl: '/models/sensevoice/tokens.txt',
2182
+ * });
2183
+ * await asr.load();
2184
+ *
2185
+ * const { text, emotion, language } = await asr.transcribe(audioSamples);
2186
+ * console.log(text); // "Hello world"
2187
+ * console.log(emotion); // "NEUTRAL"
2188
+ * console.log(language); // "en"
2189
+ * ```
2190
+ */
2191
+
2192
+ /**
2193
+ * Configuration for SenseVoice Worker
2194
+ */
2195
+ interface SenseVoiceWorkerConfig {
2196
+ /** Path or URL to model.int8.onnx (239MB) */
2197
+ modelUrl: string;
2198
+ /** Path or URL to tokens.txt vocabulary file (default: sibling of modelUrl) */
2199
+ tokensUrl?: string;
2200
+ /** Language hint (default: 'auto' for auto-detection) */
2201
+ language?: SenseVoiceLanguage;
2202
+ /** Text normalization: 'with_itn' applies inverse text normalization (default: 'with_itn') */
2203
+ textNorm?: 'with_itn' | 'without_itn';
2204
+ }
2205
+ /**
2206
+ * SenseVoice ASR Worker - Speech Recognition in a Web Worker
2207
+ *
2208
+ * Runs SenseVoice inference off the main thread to prevent UI blocking.
2209
+ * All preprocessing (fbank, LFR, CMVN) and CTC decoding run in the worker.
2210
+ *
2211
+ * @see SenseVoiceInference for main-thread version
2212
+ */
2213
+ declare class SenseVoiceWorker {
2214
+ private worker;
2215
+ private config;
2216
+ private isLoading;
2217
+ private _isLoaded;
2218
+ private inferenceQueue;
2219
+ private poisoned;
2220
+ private pendingResolvers;
2221
+ private languageId;
2222
+ private textNormId;
2223
+ constructor(config: SenseVoiceWorkerConfig);
2224
+ get isLoaded(): boolean;
2225
+ /**
2226
+ * Backend type (always 'wasm' for Worker, WebGPU not supported in Workers)
2227
+ */
2228
+ get backend(): 'wasm' | null;
2229
+ /**
2230
+ * Create the worker from inline script
2231
+ */
2232
+ private createWorker;
2233
+ /**
2234
+ * Handle messages from worker
2235
+ */
2236
+ private handleWorkerMessage;
2237
+ /**
2238
+ * Send message to worker and wait for response
2239
+ */
2240
+ private sendMessage;
2241
+ /**
2242
+ * Load the ONNX model in the worker
2243
+ *
2244
+ * @param onProgress - Optional progress callback. Fires once at 100% when load completes
2245
+ * (the worker downloads and loads the model internally, so granular progress is not available).
2246
+ */
2247
+ load(onProgress?: (loaded: number, total: number) => void): Promise<SenseVoiceModelInfo>;
2248
+ /**
2249
+ * Transcribe audio samples to text
2250
+ *
2251
+ * @param audioSamples Float32Array of audio samples at 16kHz, [-1, 1] range
2252
+ * @returns Transcription result with text, emotion, language, and event
2253
+ */
2254
+ transcribe(audioSamples: Float32Array): Promise<SenseVoiceResult>;
2255
+ /**
2256
+ * Queue inference to serialize worker calls
2257
+ */
2258
+ private queueInference;
2259
+ /**
2260
+ * Dispose of the worker and free resources
2261
+ */
2262
+ dispose(): Promise<void>;
2263
+ /**
2264
+ * Check if Web Workers are supported
2265
+ */
2266
+ static isSupported(): boolean;
2267
+ }
2268
+
2269
+ /**
2270
+ * Shared blendshape constants and utilities for lip sync inference
2271
+ *
2272
+ * Contains ARKIT_BLENDSHAPES (canonical 52-blendshape ordering), symmetrization,
2273
+ * and interpolation utilities used by A2EInference and all renderer adapters.
2274
+ *
2275
+ * This module is the single source of truth for blendshape ordering to
2276
+ * avoid circular dependencies between inference classes.
2277
+ *
2278
+ * @category Inference
2279
+ */
2280
+ /**
2281
+ * ARKit blendshape names in alphabetical order (52 total)
2282
+ * This is the canonical ordering used by all A2E models in the SDK.
2283
+ */
2284
+ declare const ARKIT_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
2285
+ /** @deprecated Use ARKIT_BLENDSHAPES instead */
2286
+ declare const LAM_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
2287
+ /**
2288
+ * Linearly interpolate between two blendshape weight arrays.
2289
+ *
2290
+ * Pure math utility with zero renderer dependency — used by all renderer
2291
+ * adapters (@omote/three, @omote/babylon, @omote/r3f) for smooth frame
2292
+ * transitions.
2293
+ *
2294
+ * @param current - Current blendshape weights
2295
+ * @param target - Target blendshape weights
1964
2296
  * @param factor - Interpolation factor (0 = no change, 1 = snap to target). Default: 0.3
1965
2297
  * @returns Interpolated weights as number[]
1966
2298
  */
1967
2299
  declare function lerpBlendshapes(current: Float32Array | number[], target: Float32Array | number[], factor?: number): number[];
1968
2300
 
1969
2301
  /**
1970
- * Wav2Vec2 inference engine for Audio-to-Expression (A2E)
2302
+ * A2E inference engine for Audio-to-Expression (LAM model)
1971
2303
  *
1972
2304
  * Runs entirely in the browser using WebGPU or WASM.
1973
2305
  * Takes raw 16kHz audio and outputs 52 ARKit blendshapes for lip sync.
2306
+ * Uses the LAM (Large Animation Model) — see {@link A2EBackend} for the interface.
1974
2307
  *
2308
+ * @see {@link createA2E} for the recommended zero-config factory
2309
+ * @see {@link A2EBackend} for the common interface
1975
2310
  * @category Inference
1976
2311
  *
1977
2312
  * @example Basic usage
1978
2313
  * ```typescript
1979
- * import { Wav2Vec2Inference } from '@omote/core';
2314
+ * import { A2EInference } from '@omote/core';
1980
2315
  *
1981
- * const wav2vec = new Wav2Vec2Inference({ modelUrl: '/models/model.onnx' });
1982
- * await wav2vec.load();
2316
+ * const a2e = new A2EInference({ modelUrl: '/models/lam.onnx' });
2317
+ * await a2e.load();
1983
2318
  *
1984
2319
  * // Process 1 second of audio (16kHz = 16000 samples)
1985
- * const result = await wav2vec.infer(audioSamples);
2320
+ * const result = await a2e.infer(audioSamples);
1986
2321
  * console.log('Blendshapes:', result.blendshapes); // [30, 52] for 30fps
1987
2322
  * ```
1988
2323
  */
1989
2324
 
1990
- type InferenceBackend = BackendPreference;
1991
- interface Wav2Vec2InferenceConfig {
2325
+ interface A2EInferenceConfig {
1992
2326
  /** Path or URL to the ONNX model */
1993
2327
  modelUrl: string;
1994
2328
  /**
@@ -1999,7 +2333,7 @@ interface Wav2Vec2InferenceConfig {
1999
2333
  */
2000
2334
  externalDataUrl?: string | false;
2001
2335
  /** Preferred backend (auto will try WebGPU first, fallback to WASM) */
2002
- backend?: InferenceBackend;
2336
+ backend?: BackendPreference;
2003
2337
  /** Number of identity classes (default: 12 for streaming model) */
2004
2338
  numIdentityClasses?: number;
2005
2339
  /**
@@ -2009,28 +2343,9 @@ interface Wav2Vec2InferenceConfig {
2009
2343
  */
2010
2344
  chunkSize?: number;
2011
2345
  }
2012
- interface ModelInfo {
2013
- backend: 'webgpu' | 'wasm';
2014
- loadTimeMs: number;
2015
- inputNames: string[];
2016
- outputNames: string[];
2017
- }
2018
2346
 
2019
- /**
2020
- * CTC vocabulary (32 tokens from wav2vec2-base-960h)
2021
- * @deprecated ASR is handled by SenseVoice. This will be removed in a future release.
2022
- */
2023
- declare const CTC_VOCAB: string[];
2024
- interface Wav2Vec2Result {
2025
- /** Blendshape weights [frames, 52] - 30fps */
2026
- blendshapes: Float32Array[];
2027
- /** Number of blendshape frames (30fps) */
2028
- numFrames: number;
2029
- /** Inference time in ms */
2030
- inferenceTimeMs: number;
2031
- }
2032
- declare class Wav2Vec2Inference implements A2EBackend {
2033
- readonly modelId: "wav2vec2";
2347
+ declare class A2EInference implements A2EBackend {
2348
+ readonly modelId: "a2e";
2034
2349
  private session;
2035
2350
  private ort;
2036
2351
  private config;
@@ -2041,7 +2356,7 @@ declare class Wav2Vec2Inference implements A2EBackend {
2041
2356
  private inferenceQueue;
2042
2357
  private poisoned;
2043
2358
  private static readonly INFERENCE_TIMEOUT_MS;
2044
- constructor(config: Wav2Vec2InferenceConfig);
2359
+ constructor(config: A2EInferenceConfig);
2045
2360
  /**
2046
2361
  * Check if WebGPU is available and working
2047
2362
  * (iOS returns false even if navigator.gpu exists due to ONNX Runtime bugs)
@@ -2054,7 +2369,7 @@ declare class Wav2Vec2Inference implements A2EBackend {
2054
2369
  /**
2055
2370
  * Load the ONNX model
2056
2371
  */
2057
- load(): Promise<ModelInfo>;
2372
+ load(): Promise<A2EModelInfo>;
2058
2373
  /**
2059
2374
  * Run inference on raw audio
2060
2375
  * @param audioSamples - Float32Array of raw audio at 16kHz
@@ -2062,7 +2377,7 @@ declare class Wav2Vec2Inference implements A2EBackend {
2062
2377
  *
2063
2378
  * Audio will be zero-padded or truncated to chunkSize samples.
2064
2379
  */
2065
- infer(audioSamples: Float32Array, identityIndex?: number): Promise<Wav2Vec2Result>;
2380
+ infer(audioSamples: Float32Array, identityIndex?: number): Promise<A2EResult>;
2066
2381
  /**
2067
2382
  * Queue inference to serialize ONNX session calls
2068
2383
  */
@@ -2107,7 +2422,7 @@ declare class Wav2Vec2Inference implements A2EBackend {
2107
2422
  * ```
2108
2423
  */
2109
2424
  /** Model URL keys that can be configured */
2110
- type ModelUrlKey = 'lam' | 'lamIos' | 'wav2arkitCpu' | 'senseVoice' | 'sileroVad' | 'kokoroTTS' | 'kokoroVoices';
2425
+ type ModelUrlKey = 'lam' | 'senseVoice' | 'sileroVad' | 'kokoroTTS' | 'kokoroVoices';
2111
2426
  /**
2112
2427
  * Resolved model URLs — user overrides take priority, HuggingFace CDN is fallback.
2113
2428
  *
@@ -2126,8 +2441,7 @@ declare const DEFAULT_MODEL_URLS: Readonly<Record<ModelUrlKey, string>>;
2126
2441
  * @example Self-host all models
2127
2442
  * ```typescript
2128
2443
  * configureModelUrls({
2129
- * lam: 'https://cdn.example.com/models/model_fp16.onnx',
2130
- * wav2arkitCpu: 'https://cdn.example.com/models/wav2arkit_cpu.onnx',
2444
+ * lam: 'https://cdn.example.com/models/lam.onnx',
2131
2445
  * senseVoice: 'https://cdn.example.com/models/sensevoice.int8.onnx',
2132
2446
  * sileroVad: 'https://cdn.example.com/models/silero-vad.onnx',
2133
2447
  * });
@@ -2135,292 +2449,22 @@ declare const DEFAULT_MODEL_URLS: Readonly<Record<ModelUrlKey, string>>;
2135
2449
  *
2136
2450
  * @example Override only one model
2137
2451
  * ```typescript
2138
- * configureModelUrls({
2139
- * lam: '/models/model_fp16.onnx', // self-hosted, same origin
2140
- * });
2141
- * ```
2142
- */
2143
- declare function configureModelUrls(urls: Partial<Record<ModelUrlKey, string>>): void;
2144
- /**
2145
- * Reset all model URL overrides back to HuggingFace CDN defaults.
2146
- * Mainly useful for testing.
2147
- */
2148
- declare function resetModelUrls(): void;
2149
- /**
2150
- * Get the immutable HuggingFace CDN URLs (ignoring any overrides).
2151
- * Useful for documentation or fallback logic.
2152
- */
2153
- declare const HF_CDN_URLS: Readonly<Record<ModelUrlKey, string>>;
2154
-
2155
- /**
2156
- * CPU-optimized lip sync inference using wav2arkit_cpu model
2157
- *
2158
- * A Safari/iOS-compatible alternative to Wav2Vec2Inference (192MB fp16) designed
2159
- * for platforms where WebGPU crashes due to ONNX Runtime JSEP bugs.
2160
- *
2161
- * The model uses ONNX external data format:
2162
- * - wav2arkit_cpu.onnx (1.86MB graph structure)
2163
- * - wav2arkit_cpu.onnx.data (402MB weights)
2164
- * Both files are fetched and cached automatically.
2165
- *
2166
- * Key differences from Wav2Vec2Inference:
2167
- * - WASM-only backend (CPU-optimized, single-threaded, no WebGPU)
2168
- * - No identity input (baked to identity 11)
2169
- * - No ASR output (lip sync only)
2170
- * - Dynamic input length (not fixed to 16000 samples)
2171
- * - Different native blendshape ordering (remapped to LAM_BLENDSHAPES)
2172
- *
2173
- * @category Inference
2174
- *
2175
- * @example
2176
- * ```typescript
2177
- * import { Wav2ArkitCpuInference } from '@omote/core';
2178
- *
2179
- * const lam = new Wav2ArkitCpuInference({
2180
- * modelUrl: '/models/wav2arkit_cpu.onnx',
2181
- * });
2182
- * await lam.load();
2183
- *
2184
- * const { blendshapes } = await lam.infer(audioSamples);
2185
- * // blendshapes: Float32Array[] in LAM_BLENDSHAPES order, 30fps
2186
- * ```
2187
- */
2188
-
2189
- interface Wav2ArkitCpuConfig {
2190
- /** Path or URL to the wav2arkit_cpu ONNX model (.onnx graph file) */
2191
- modelUrl: string;
2192
- /**
2193
- * Path or URL to external model data file (.onnx.data weights).
2194
- * Default: `${modelUrl}.data` (e.g., /models/wav2arkit_cpu.onnx.data)
2195
- *
2196
- * Set to `false` to skip external data loading (single-file models only).
2197
- */
2198
- externalDataUrl?: string | false;
2199
- /** Preferred backend (default: 'wasm' — this model is CPU-optimized) */
2200
- backend?: BackendPreference;
2201
- }
2202
- declare class Wav2ArkitCpuInference implements A2EBackend {
2203
- readonly modelId: "wav2arkit_cpu";
2204
- readonly chunkSize: number;
2205
- private session;
2206
- private ort;
2207
- private config;
2208
- private _backend;
2209
- private isLoading;
2210
- private inferenceQueue;
2211
- private poisoned;
2212
- private static readonly INFERENCE_TIMEOUT_MS;
2213
- constructor(config: Wav2ArkitCpuConfig);
2214
- get backend(): RuntimeBackend | null;
2215
- get isLoaded(): boolean;
2216
- /**
2217
- * Load the ONNX model
2218
- */
2219
- load(): Promise<A2EModelInfo>;
2220
- /**
2221
- * Run inference on raw audio
2222
- *
2223
- * Accepts variable-length audio (not fixed to 16000 samples).
2224
- * Output frames = ceil(30 * numSamples / 16000).
2225
- *
2226
- * @param audioSamples - Float32Array of raw audio at 16kHz
2227
- * @param _identityIndex - Ignored (identity 11 is baked into the model)
2228
- */
2229
- infer(audioSamples: Float32Array, _identityIndex?: number): Promise<A2EResult>;
2230
- /**
2231
- * Queue inference to serialize ONNX session calls
2232
- */
2233
- private queueInference;
2234
- /**
2235
- * Dispose of the model and free resources
2236
- */
2237
- dispose(): Promise<void>;
2238
- }
2239
-
2240
- /**
2241
- * Web Worker-based wav2arkit_cpu lip sync inference
2242
- *
2243
- * Runs wav2arkit_cpu inference in a dedicated Web Worker to prevent main thread blocking.
2244
- * Uses inline worker script (Blob URL pattern) to avoid separate file deployment.
2245
- *
2246
- * Key design decisions:
2247
- * - WASM backend only (WebGPU doesn't work in Workers)
2248
- * - Audio copied (not transferred) to retain main thread access
2249
- * - ONNX Runtime loaded from CDN in worker (no bundler complications)
2250
- * - Blendshape symmetrization inlined in worker (no module imports)
2251
- * - iOS: passes model URLs as strings directly to ORT (avoids 400MB+ JS heap)
2252
- *
2253
- * @category Inference
2254
- *
2255
- * @example
2256
- * ```typescript
2257
- * import { Wav2ArkitCpuWorker } from '@omote/core';
2258
- *
2259
- * const lam = new Wav2ArkitCpuWorker({
2260
- * modelUrl: '/models/wav2arkit_cpu.onnx',
2261
- * });
2262
- * await lam.load();
2263
- *
2264
- * const { blendshapes } = await lam.infer(audioSamples);
2265
- * // blendshapes: Float32Array[] in LAM_BLENDSHAPES order, 30fps
2266
- * ```
2267
- */
2268
-
2269
- /**
2270
- * Configuration for Wav2ArkitCpu Worker
2271
- */
2272
- interface Wav2ArkitCpuWorkerConfig {
2273
- /** Path or URL to the wav2arkit_cpu ONNX model (.onnx graph file) */
2274
- modelUrl: string;
2275
- /**
2276
- * Path or URL to external model data file (.onnx.data weights).
2277
- * Default: `${modelUrl}.data` (e.g., /models/wav2arkit_cpu.onnx.data)
2278
- *
2279
- * Set to `false` to skip external data loading (single-file models only).
2280
- */
2281
- externalDataUrl?: string | false;
2282
- }
2283
- /**
2284
- * Wav2ArkitCpu Worker - Lip sync inference in a Web Worker
2285
- *
2286
- * Runs wav2arkit_cpu inference off the main thread to prevent UI blocking.
2287
- * Feature parity with Wav2ArkitCpuInference but runs in dedicated worker.
2288
- *
2289
- * @see Wav2ArkitCpuInference for main-thread version
2290
- */
2291
- declare class Wav2ArkitCpuWorker implements A2EBackend {
2292
- readonly modelId: "wav2arkit_cpu";
2293
- readonly chunkSize: number;
2294
- private worker;
2295
- private config;
2296
- private isLoading;
2297
- private _isLoaded;
2298
- private inferenceQueue;
2299
- private poisoned;
2300
- private pendingResolvers;
2301
- constructor(config: Wav2ArkitCpuWorkerConfig);
2302
- get isLoaded(): boolean;
2303
- /**
2304
- * Backend type (always 'wasm' for Worker, WebGPU not supported in Workers)
2305
- */
2306
- get backend(): 'wasm' | null;
2307
- /**
2308
- * Create the worker from inline script
2309
- */
2310
- private createWorker;
2311
- /**
2312
- * Handle messages from worker
2313
- */
2314
- private handleWorkerMessage;
2315
- /**
2316
- * Send message to worker and wait for response
2317
- */
2318
- private sendMessage;
2319
- /**
2320
- * Load the ONNX model in the worker
2321
- */
2322
- load(): Promise<A2EModelInfo>;
2323
- /**
2324
- * Run inference on raw audio
2325
- *
2326
- * Accepts variable-length audio (not fixed to 16000 samples).
2327
- * Output frames = ceil(30 * numSamples / 16000).
2328
- *
2329
- * @param audioSamples - Float32Array of raw audio at 16kHz
2330
- * @param _identityIndex - Ignored (identity 11 is baked into the model)
2331
- */
2332
- infer(audioSamples: Float32Array, _identityIndex?: number): Promise<A2EResult>;
2333
- /**
2334
- * Queue inference to serialize worker calls
2335
- */
2336
- private queueInference;
2337
- /**
2338
- * Dispose of the worker and free resources
2339
- */
2340
- dispose(): Promise<void>;
2341
- /**
2342
- * Check if Web Workers are supported
2343
- */
2344
- static isSupported(): boolean;
2345
- }
2346
-
2347
- /**
2348
- * Factory function for A2E with automatic GPU/CPU model selection
2349
- *
2350
- * Provides a unified API with platform-aware model selection:
2351
- *
2352
- * **Desktop (Chrome/Edge/Android):**
2353
- * Wav2Vec2 (WebGPU, 192MB fp16) → wav2arkit_cpu fallback
2354
- *
2355
- * **iOS/Safari:**
2356
- * LAM iOS (WASM, opset 18, ~192MB fp16, native LayerNorm) → wav2arkit_cpu fallback
2357
- *
2358
- * The iOS variant is the same LAM model re-exported at opset 18 with native
2359
- * LayerNormalization ops (~256 fewer graph nodes than desktop's opset 14
2360
- * decomposed LayerNorm). This dramatically reduces peak memory during ORT
2361
- * graph parsing/optimization, fitting within iOS's ~1-1.5GB tab limit.
2362
- *
2363
- * Both variants use fp16 external data format (small graph + ~192MB weights).
2364
- * On iOS, ORT streams weights directly into WASM memory via URL pass-through
2365
- * (~2MB JS heap). If the model still OOMs, A2EWithFallback falls back to
2366
- * wav2arkit_cpu (404MB fp32, lower quality).
2367
- *
2368
- * @category Inference
2369
- *
2370
- * @example Auto-detect (recommended, zero-config)
2371
- * ```typescript
2372
- * import { createA2E } from '@omote/core';
2373
- *
2374
- * const a2e = createA2E(); // uses HF CDN defaults (192MB fp16 GPU, 404MB CPU fallback)
2375
- * await a2e.load();
2376
- * const { blendshapes } = await a2e.infer(audioSamples);
2377
- * ```
2378
- *
2379
- * @example Force CPU model
2380
- * ```typescript
2381
- * const a2e = createA2E({ mode: 'cpu' });
2452
+ * configureModelUrls({
2453
+ * lam: '/models/model_fp16.onnx', // self-hosted, same origin
2454
+ * });
2382
2455
  * ```
2383
2456
  */
2384
-
2457
+ declare function configureModelUrls(urls: Partial<Record<ModelUrlKey, string>>): void;
2385
2458
  /**
2386
- * Configuration for the A2E factory
2459
+ * Reset all model URL overrides back to HuggingFace CDN defaults.
2460
+ * Mainly useful for testing.
2387
2461
  */
2388
- interface CreateA2EConfig extends InferenceFactoryConfig {
2389
- /** URL for the GPU model (Wav2Vec2, used on Chrome/Firefox/Edge). Default: HuggingFace CDN */
2390
- gpuModelUrl?: string;
2391
- /**
2392
- * URL for GPU model external data file (.onnx.data weights).
2393
- * Default: `${gpuModelUrl}.data`
2394
- *
2395
- * Set to `false` to skip external data loading (single-file models only).
2396
- */
2397
- gpuExternalDataUrl?: string | false;
2398
- /** URL for the CPU model (wav2arkit_cpu, used on Safari/iOS). Default: HuggingFace CDN */
2399
- cpuModelUrl?: string;
2400
- /**
2401
- * Model selection mode:
2402
- * - 'auto': Safari/iOS -> CPU, everything else -> GPU (default)
2403
- * - 'gpu': Force GPU model (Wav2Vec2Inference)
2404
- * - 'cpu': Force CPU model (Wav2ArkitCpuInference)
2405
- */
2406
- mode?: 'auto' | 'gpu' | 'cpu';
2407
- /** Backend preference for GPU model (default: 'auto') */
2408
- gpuBackend?: BackendPreference;
2409
- /** Number of identity classes for GPU model (default: 12) */
2410
- numIdentityClasses?: number;
2411
- /**
2412
- * Fall back to CPU model if GPU model fails to load (default: true)
2413
- * Only applies when mode is 'auto' or 'gpu'
2414
- */
2415
- fallbackOnError?: boolean;
2416
- }
2462
+ declare function resetModelUrls(): void;
2417
2463
  /**
2418
- * Create an A2E instance with automatic GPU/CPU model selection
2419
- *
2420
- * @param config - Factory configuration
2421
- * @returns An A2EBackend instance (either GPU or CPU model)
2464
+ * Get the immutable HuggingFace CDN URLs (ignoring any overrides).
2465
+ * Useful for documentation or fallback logic.
2422
2466
  */
2423
- declare function createA2E(config?: CreateA2EConfig): A2EBackend;
2467
+ declare const HF_CDN_URLS: Readonly<Record<ModelUrlKey, string>>;
2424
2468
 
2425
2469
  /**
2426
2470
  * A2EProcessor — Engine-agnostic audio-to-expression processor
@@ -2471,9 +2515,6 @@ interface A2EProcessorConfig {
2471
2515
  * The LAM model uses a one-hot identity vector (12 classes, indices 0-11) as
2472
2516
  * style conditioning alongside audio features. Different indices produce
2473
2517
  * different expression intensity across face regions (brows, eyes, cheeks).
2474
- *
2475
- * Only affects Wav2Vec2Inference (GPU model). Wav2ArkitCpuInference has
2476
- * identity 11 baked into the model weights.
2477
2518
  */
2478
2519
  identityIndex?: number;
2479
2520
  /** Callback fired with each blendshape frame (push mode) */
@@ -2482,6 +2523,7 @@ interface A2EProcessorConfig {
2482
2523
  onError?: (error: Error) => void;
2483
2524
  }
2484
2525
  declare class A2EProcessor {
2526
+ private static readonly MAX_PENDING_CHUNKS;
2485
2527
  private readonly backend;
2486
2528
  private readonly sampleRate;
2487
2529
  private readonly chunkSize;
@@ -2497,6 +2539,8 @@ declare class A2EProcessor {
2497
2539
  private _latestFrame;
2498
2540
  private dripInterval;
2499
2541
  private lastPulledFrame;
2542
+ private lastDequeuedTime;
2543
+ private decayBuffer;
2500
2544
  private inferenceRunning;
2501
2545
  private pendingChunks;
2502
2546
  private getFrameCallCount;
@@ -2641,154 +2685,6 @@ declare class BlendshapeSmoother {
2641
2685
  reset(): void;
2642
2686
  }
2643
2687
 
2644
- /**
2645
- * Factory function for Silero VAD with automatic Worker vs main thread selection
2646
- *
2647
- * Provides a unified API that automatically selects the optimal implementation:
2648
- * - Desktop browsers: Uses SileroVADWorker (off-main-thread inference)
2649
- * - Mobile devices: Uses SileroVADInference (main thread, avoids memory overhead)
2650
- * - Fallback: Gracefully falls back to main thread if Worker fails
2651
- *
2652
- * @category Inference
2653
- *
2654
- * @example Basic usage (auto-detect)
2655
- * ```typescript
2656
- * import { createSileroVAD } from '@omote/core';
2657
- *
2658
- * const vad = createSileroVAD({
2659
- * modelUrl: '/models/silero-vad.onnx',
2660
- * threshold: 0.5,
2661
- * });
2662
- *
2663
- * await vad.load();
2664
- * const result = await vad.process(audioChunk);
2665
- * if (result.isSpeech) {
2666
- * console.log('Speech detected!', result.probability);
2667
- * }
2668
- * ```
2669
- *
2670
- * @example Force worker usage
2671
- * ```typescript
2672
- * const vad = createSileroVAD({
2673
- * modelUrl: '/models/silero-vad.onnx',
2674
- * useWorker: true, // Force Worker even on mobile
2675
- * });
2676
- * ```
2677
- *
2678
- * @example Force main thread
2679
- * ```typescript
2680
- * const vad = createSileroVAD({
2681
- * modelUrl: '/models/silero-vad.onnx',
2682
- * useWorker: false, // Force main thread
2683
- * });
2684
- * ```
2685
- */
2686
-
2687
- /**
2688
- * Common interface for both SileroVADInference and SileroVADWorker
2689
- *
2690
- * This interface defines the shared API that both implementations provide,
2691
- * allowing consumers to use either interchangeably.
2692
- */
2693
- interface SileroVADBackend {
2694
- /** Current backend type (webgpu, wasm, or null if not loaded) */
2695
- readonly backend: RuntimeBackend | null;
2696
- /** Whether the model is loaded and ready for inference */
2697
- readonly isLoaded: boolean;
2698
- /** Audio sample rate (8000 or 16000 Hz) */
2699
- readonly sampleRate: number;
2700
- /** Speech detection threshold (0-1) */
2701
- readonly threshold: number;
2702
- /**
2703
- * Load the ONNX model
2704
- * @returns Model loading information
2705
- */
2706
- load(): Promise<VADModelInfo | VADWorkerModelInfo>;
2707
- /**
2708
- * Process a single audio chunk
2709
- * @param audioChunk - Float32Array of exactly chunkSize samples
2710
- * @returns VAD result with speech probability
2711
- */
2712
- process(audioChunk: Float32Array): Promise<VADResult>;
2713
- /**
2714
- * Reset state for new audio stream
2715
- */
2716
- reset(): void | Promise<void>;
2717
- /**
2718
- * Dispose of the model and free resources
2719
- */
2720
- dispose(): Promise<void>;
2721
- /**
2722
- * Get required chunk size in samples
2723
- */
2724
- getChunkSize(): number;
2725
- /**
2726
- * Get chunk duration in milliseconds
2727
- */
2728
- getChunkDurationMs(): number;
2729
- }
2730
- /**
2731
- * Configuration for the Silero VAD factory
2732
- *
2733
- * Extends SileroVADConfig with worker-specific options.
2734
- */
2735
- interface SileroVADFactoryConfig extends Omit<SileroVADConfig, 'modelUrl'>, InferenceFactoryConfig {
2736
- /** Path or URL to the ONNX model. Default: HuggingFace CDN */
2737
- modelUrl?: string;
2738
- /**
2739
- * Fallback to main thread on worker errors.
2740
- *
2741
- * When true (default), if the Worker fails to load or encounters an error,
2742
- * the factory will automatically create a main thread instance instead.
2743
- *
2744
- * When false, worker errors will propagate as exceptions.
2745
- *
2746
- * Default: true
2747
- */
2748
- fallbackOnError?: boolean;
2749
- }
2750
- /**
2751
- * Check if the current environment supports VAD Web Workers
2752
- *
2753
- * Requirements:
2754
- * - Worker constructor must exist
2755
- * - Blob URL support (for inline worker script)
2756
- *
2757
- * @returns true if VAD Worker is supported
2758
- */
2759
- declare function supportsVADWorker(): boolean;
2760
- /**
2761
- * Create a Silero VAD instance with automatic implementation selection
2762
- *
2763
- * This factory function automatically selects between:
2764
- * - **SileroVADWorker**: Off-main-thread inference (better for desktop)
2765
- * - **SileroVADInference**: Main thread inference (better for mobile)
2766
- *
2767
- * The selection is based on:
2768
- * 1. Explicit `useWorker` config (if provided)
2769
- * 2. Platform detection (mobile vs desktop)
2770
- * 3. Worker API availability
2771
- *
2772
- * Both implementations share the same interface (SileroVADBackend),
2773
- * so consumers can use either interchangeably.
2774
- *
2775
- * @param config - Factory configuration
2776
- * @returns A SileroVAD instance (either Worker or main thread)
2777
- *
2778
- * @example
2779
- * ```typescript
2780
- * // Auto-detect (recommended)
2781
- * const vad = createSileroVAD({ modelUrl: '/models/silero-vad.onnx' });
2782
- *
2783
- * // Force Worker
2784
- * const vadWorker = createSileroVAD({ modelUrl: '/models/silero-vad.onnx', useWorker: true });
2785
- *
2786
- * // Force main thread
2787
- * const vadMain = createSileroVAD({ modelUrl: '/models/silero-vad.onnx', useWorker: false });
2788
- * ```
2789
- */
2790
- declare function createSileroVAD(config?: SileroVADFactoryConfig): SileroVADBackend;
2791
-
2792
2688
  /**
2793
2689
  * SenseVoice adapter backed by UnifiedInferenceWorker
2794
2690
  *
@@ -2814,44 +2710,21 @@ declare class SenseVoiceUnifiedAdapter implements SenseVoiceBackend {
2814
2710
  }
2815
2711
 
2816
2712
  /**
2817
- * Wav2ArkitCpu adapter backed by UnifiedInferenceWorker
2818
- *
2819
- * Implements A2EBackend, delegating all inference to the shared worker.
2820
- */
2821
-
2822
- declare class Wav2ArkitCpuUnifiedAdapter implements A2EBackend {
2823
- readonly modelId: "wav2arkit_cpu";
2824
- readonly chunkSize: number;
2825
- private worker;
2826
- private config;
2827
- private _isLoaded;
2828
- private loadedGeneration;
2829
- /** Per-adapter inference queue — ensures sequential state updates. */
2830
- private inferenceQueue;
2831
- constructor(worker: UnifiedInferenceWorker, config: Wav2ArkitCpuWorkerConfig);
2832
- get isLoaded(): boolean;
2833
- get backend(): RuntimeBackend | null;
2834
- load(): Promise<A2EModelInfo>;
2835
- infer(audioSamples: Float32Array, _identityIndex?: number): Promise<A2EResult>;
2836
- dispose(): Promise<void>;
2837
- private assertLoaded;
2838
- }
2839
-
2840
- /**
2841
- * Wav2Vec2 (LAM) adapter backed by UnifiedInferenceWorker
2713
+ * A2E adapter backed by UnifiedInferenceWorker
2842
2714
  *
2843
2715
  * Implements A2EBackend, delegating all inference to the shared worker.
2844
- * Used on iOS to run LAM inference off the main thread via the unified worker.
2716
+ * Used on iOS to run A2E inference off the main thread via the unified worker.
2845
2717
  */
2846
2718
 
2847
- declare class Wav2Vec2UnifiedAdapter implements A2EBackend {
2848
- readonly modelId: "wav2vec2";
2719
+ declare class A2EUnifiedAdapter implements A2EBackend {
2720
+ readonly modelId: "a2e";
2849
2721
  readonly chunkSize: number;
2850
2722
  private worker;
2851
2723
  private modelUrl;
2852
2724
  private externalDataUrl;
2853
2725
  private numIdentityClasses;
2854
2726
  private _isLoaded;
2727
+ private _backend;
2855
2728
  private loadedGeneration;
2856
2729
  /** Per-adapter inference queue — ensures sequential state updates. */
2857
2730
  private inferenceQueue;
@@ -2944,6 +2817,11 @@ interface SynthesizeOptions {
2944
2817
  /** Speed multiplier (overrides config speed) */
2945
2818
  speed?: number;
2946
2819
  }
2820
+ /**
2821
+ * Validate TTS input parameters at API boundaries.
2822
+ * Returns trimmed text on success, throws on invalid input.
2823
+ */
2824
+ declare function validateTTSInput(text: unknown, voiceName: string, speed: number, availableVoices?: string[]): string;
2947
2825
  declare class KokoroTTSInference implements TTSBackend {
2948
2826
  private readonly config;
2949
2827
  private readonly modelUrl;
@@ -3044,144 +2922,37 @@ declare class KokoroTTSUnifiedAdapter implements TTSBackend {
3044
2922
  */
3045
2923
 
3046
2924
  declare class SileroVADUnifiedAdapter implements SileroVADBackend {
3047
- private worker;
3048
- private config;
3049
- private _isLoaded;
3050
- private loadedGeneration;
3051
- private state;
3052
- private context;
3053
- private readonly chunkSize;
3054
- private readonly contextSize;
3055
- /**
3056
- * Per-adapter inference queue — ensures sequential state updates.
3057
- *
3058
- * The unified worker processes messages serially (single thread), but this queue
3059
- * guarantees per-adapter state consistency. Example: VAD LSTM state from call N
3060
- * must be applied before call N+1 starts. Without the queue, two rapid process()
3061
- * calls could both read the same stale state.
3062
- */
3063
- private inferenceQueue;
3064
- private preSpeechBuffer;
3065
- private wasSpeaking;
3066
- constructor(worker: UnifiedInferenceWorker, config: SileroVADConfig);
3067
- get isLoaded(): boolean;
3068
- get backend(): RuntimeBackend | null;
3069
- get sampleRate(): number;
3070
- get threshold(): number;
3071
- getChunkSize(): number;
3072
- getChunkDurationMs(): number;
3073
- load(): Promise<VADWorkerModelInfo>;
3074
- process(audioChunk: Float32Array): Promise<VADResult>;
3075
- reset(): Promise<void>;
3076
- dispose(): Promise<void>;
3077
- private assertLoaded;
3078
- }
3079
-
3080
- /**
3081
- * Renderer-agnostic A2E (audio-to-expression) orchestrator
3082
- *
3083
- * Manages the mic capture + A2E inference loop independently of any
3084
- * 3D renderer. Adapter packages (@omote/three, @omote/babylon) wrap this
3085
- * thinly and pipe `latestWeights` into their renderer-specific blendshape
3086
- * controllers.
3087
- *
3088
- * Internally delegates all buffer accumulation, inference, and frame
3089
- * drip-feeding to {@link A2EProcessor}. This class only handles mic capture
3090
- * (getUserMedia, ScriptProcessorNode, resampling).
3091
- *
3092
- * @deprecated Use {@link MicLipSync} from `@omote/core` instead. MicLipSync provides
3093
- * the same mic → A2E composition with proper MicrophoneCapture integration, VAD support,
3094
- * ExpressionProfile scaling, and pause/resume. This class will be removed in a future version.
3095
- *
3096
- * @category Inference
3097
- */
3098
-
3099
- /**
3100
- * Progress event emitted during model download / compile
3101
- */
3102
- interface A2EProgressEvent {
3103
- phase: 'download' | 'compile';
3104
- progress: number;
3105
- }
3106
- /**
3107
- * Configuration for the A2EOrchestrator
3108
- */
3109
- interface A2EOrchestratorConfig {
3110
- /** URL for the GPU model (Wav2Vec2, Chrome/Firefox/Edge) */
3111
- gpuModelUrl: string;
3112
- /** URL for GPU model external data file */
3113
- gpuExternalDataUrl?: string | false;
3114
- /** URL for the CPU model (wav2arkit_cpu, Safari/iOS) */
3115
- cpuModelUrl?: string;
3116
- /** Sample rate for mic capture (default: 16000) */
3117
- sampleRate?: number;
3118
- /** Chunk size in samples for mic capture (default: 16000 = 1s at 16kHz) */
3119
- chunkSize?: number;
3120
- /** Callback fired with new blendshape weights after each inference */
3121
- onFrame?: (weights: Float32Array) => void;
3122
- /** Callback fired during model loading progress */
3123
- onProgress?: (event: A2EProgressEvent) => void;
3124
- /** Callback fired on error */
3125
- onError?: (error: Error) => void;
3126
- /** Callback fired when model is loaded and ready */
3127
- onReady?: () => void;
3128
- /** Additional createA2E config options */
3129
- a2eConfig?: Partial<CreateA2EConfig>;
3130
- }
3131
- /**
3132
- * Renderer-agnostic A2E orchestrator.
3133
- *
3134
- * Manages mic capture + delegates inference to {@link A2EProcessor}.
3135
- * Adapters read `latestWeights` each frame to apply to their meshes.
3136
- *
3137
- * @example Quick start (used by @omote/three and @omote/babylon adapters)
3138
- * ```typescript
3139
- * const orchestrator = new A2EOrchestrator({
3140
- * gpuModelUrl: '/models/wav2vec2.onnx',
3141
- * cpuModelUrl: '/models/wav2arkit_cpu.onnx',
3142
- * onFrame: (weights) => controller.update(weights),
3143
- * });
3144
- * await orchestrator.load();
3145
- * await orchestrator.start();
3146
- * ```
3147
- */
3148
- declare class A2EOrchestrator {
2925
+ private worker;
3149
2926
  private config;
3150
- private a2e;
3151
- private processor;
3152
- private stream;
3153
- private audioContext;
3154
- private scriptProcessor;
3155
- private nativeSampleRate;
3156
- private _isReady;
3157
- private _isStreaming;
3158
- private _backend;
3159
- private disposed;
3160
- constructor(config: A2EOrchestratorConfig);
3161
- /** Latest blendshape weights from inference (null if none yet) */
3162
- get latestWeights(): Float32Array | null;
3163
- /** Whether the model is loaded and ready for inference */
3164
- get isReady(): boolean;
3165
- /** Whether mic is active and inference loop is running */
3166
- get isStreaming(): boolean;
3167
- /** Current backend type (webgpu, wasm, or null) */
3168
- get backend(): string | null;
3169
- /**
3170
- * Load the A2E model and create the processor
3171
- */
3172
- load(): Promise<void>;
3173
- /**
3174
- * Start mic capture and inference loop
3175
- */
3176
- start(): Promise<void>;
3177
- /**
3178
- * Stop mic capture and inference loop
3179
- */
3180
- stop(): void;
2927
+ private _isLoaded;
2928
+ private loadedGeneration;
2929
+ private state;
2930
+ private context;
2931
+ private readonly chunkSize;
2932
+ private readonly contextSize;
3181
2933
  /**
3182
- * Dispose of all resources
2934
+ * Per-adapter inference queue — ensures sequential state updates.
2935
+ *
2936
+ * The unified worker processes messages serially (single thread), but this queue
2937
+ * guarantees per-adapter state consistency. Example: VAD LSTM state from call N
2938
+ * must be applied before call N+1 starts. Without the queue, two rapid process()
2939
+ * calls could both read the same stale state.
3183
2940
  */
2941
+ private inferenceQueue;
2942
+ private preSpeechBuffer;
2943
+ private wasSpeaking;
2944
+ constructor(worker: UnifiedInferenceWorker, config: SileroVADConfig);
2945
+ get isLoaded(): boolean;
2946
+ get backend(): RuntimeBackend | null;
2947
+ get sampleRate(): number;
2948
+ get threshold(): number;
2949
+ getChunkSize(): number;
2950
+ getChunkDurationMs(): number;
2951
+ load(): Promise<VADWorkerModelInfo>;
2952
+ process(audioChunk: Float32Array): Promise<VADResult>;
2953
+ reset(): Promise<void>;
3184
2954
  dispose(): Promise<void>;
2955
+ private assertLoaded;
3185
2956
  }
3186
2957
 
3187
2958
  /**
@@ -3555,6 +3326,10 @@ declare const KOKORO_VOICES: {
3555
3326
  readonly bm_fable: "bm_fable";
3556
3327
  readonly bm_george: "bm_george";
3557
3328
  readonly bm_lewis: "bm_lewis";
3329
+ readonly ef_dora: "ef_dora";
3330
+ readonly em_alex: "em_alex";
3331
+ readonly em_santa: "em_santa";
3332
+ readonly ff_siwis: "ff_siwis";
3558
3333
  };
3559
3334
  type KokoroVoiceName = keyof typeof KOKORO_VOICES;
3560
3335
  /**
@@ -3562,6 +3337,223 @@ type KokoroVoiceName = keyof typeof KOKORO_VOICES;
3562
3337
  */
3563
3338
  declare function listVoices(): string[];
3564
3339
 
3340
+ /**
3341
+ * ElevenLabs TTS Backend — Cloud text-to-speech via ElevenLabs REST API.
3342
+ *
3343
+ * Implements the TTSBackend interface so it can be used anywhere Kokoro TTS is used
3344
+ * (TTSPlayback, TTSSpeaker, VoicePipeline, PlaybackPipeline, etc.)
3345
+ *
3346
+ * Zero external dependencies — uses fetch() directly.
3347
+ *
3348
+ * @category Inference
3349
+ *
3350
+ * @example Basic usage
3351
+ * ```typescript
3352
+ * import { ElevenLabsTTSBackend } from '@omote/core';
3353
+ *
3354
+ * const tts = new ElevenLabsTTSBackend({
3355
+ * apiKey: 'your-api-key',
3356
+ * voiceId: 'voice-id',
3357
+ * });
3358
+ * await tts.load();
3359
+ *
3360
+ * for await (const chunk of tts.stream("Hello world!")) {
3361
+ * playbackPipeline.feedBuffer(chunk.audio);
3362
+ * }
3363
+ * ```
3364
+ *
3365
+ * @example With TTSSpeaker
3366
+ * ```typescript
3367
+ * const speaker = new TTSSpeaker();
3368
+ * await speaker.connect(tts, { lam: createA2E() });
3369
+ * await speaker.speak("Hello!");
3370
+ * ```
3371
+ */
3372
+
3373
+ interface ElevenLabsConfig {
3374
+ /** ElevenLabs API key */
3375
+ apiKey: string;
3376
+ /** Voice ID to use */
3377
+ voiceId: string;
3378
+ /** Model ID (default: 'eleven_multilingual_v2') */
3379
+ model?: string;
3380
+ /**
3381
+ * Output format (default: 'pcm_16000').
3382
+ * Use 'pcm_16000' for lip sync compatibility (16kHz matches A2E input).
3383
+ * Other options: 'pcm_22050', 'pcm_24000', 'pcm_44100'
3384
+ */
3385
+ outputFormat?: string;
3386
+ /** Voice stability 0-1 (default: 0.5) */
3387
+ stability?: number;
3388
+ /** Voice similarity boost 0-1 (default: 0.75) */
3389
+ similarityBoost?: number;
3390
+ /** API base URL override (default: 'https://api.elevenlabs.io') */
3391
+ baseUrl?: string;
3392
+ }
3393
+ declare class ElevenLabsTTSBackend implements TTSBackend {
3394
+ private readonly apiKey;
3395
+ private readonly voiceId;
3396
+ private readonly model;
3397
+ private readonly outputFormat;
3398
+ private readonly stability;
3399
+ private readonly similarityBoost;
3400
+ private readonly baseUrl;
3401
+ private readonly _sampleRate;
3402
+ private _isLoaded;
3403
+ constructor(config: ElevenLabsConfig);
3404
+ get sampleRate(): number;
3405
+ get isLoaded(): boolean;
3406
+ /**
3407
+ * No-op for cloud TTS (no model to load).
3408
+ * Marks backend as ready.
3409
+ */
3410
+ load(): Promise<void>;
3411
+ /**
3412
+ * Stream audio from ElevenLabs for the given text.
3413
+ *
3414
+ * Uses the streaming endpoint. Yields a single chunk when the response is
3415
+ * not streamed, or multiple chunks as response data arrives.
3416
+ */
3417
+ stream(text: string, options?: TTSStreamOptions): AsyncGenerator<TTSChunk>;
3418
+ dispose(): Promise<void>;
3419
+ private getHttpErrorMessage;
3420
+ }
3421
+
3422
+ /**
3423
+ * AWS Polly TTS Backend — Cloud text-to-speech via consumer-provided AWS SDK call.
3424
+ *
3425
+ * Implements the TTSBackend interface. Keeps @omote/core free of AWS SDK dependencies
3426
+ * by delegating the actual Polly API call to a consumer-provided function.
3427
+ *
3428
+ * @category Inference
3429
+ *
3430
+ * @example Basic usage with AWS SDK v3
3431
+ * ```typescript
3432
+ * import { PollyTTSBackend } from '@omote/core';
3433
+ * import { PollyClient, SynthesizeSpeechCommand } from '@aws-sdk/client-polly';
3434
+ *
3435
+ * const polly = new PollyClient({ region: 'us-east-1' });
3436
+ *
3437
+ * const tts = new PollyTTSBackend({
3438
+ * synthesizeFn: async (text, voice, sampleRate) => {
3439
+ * const cmd = new SynthesizeSpeechCommand({
3440
+ * Text: text,
3441
+ * VoiceId: voice,
3442
+ * Engine: 'neural',
3443
+ * OutputFormat: 'pcm',
3444
+ * SampleRate: String(sampleRate),
3445
+ * });
3446
+ * const result = await polly.send(cmd);
3447
+ * const stream = result.AudioStream;
3448
+ * // Convert stream to ArrayBuffer (Node or browser)
3449
+ * const chunks: Uint8Array[] = [];
3450
+ * for await (const chunk of stream as AsyncIterable<Uint8Array>) {
3451
+ * chunks.push(chunk);
3452
+ * }
3453
+ * const totalLength = chunks.reduce((sum, c) => sum + c.length, 0);
3454
+ * const merged = new Uint8Array(totalLength);
3455
+ * let offset = 0;
3456
+ * for (const chunk of chunks) {
3457
+ * merged.set(chunk, offset);
3458
+ * offset += chunk.length;
3459
+ * }
3460
+ * return {
3461
+ * audio: merged.buffer,
3462
+ * contentType: result.ContentType ?? 'audio/pcm',
3463
+ * };
3464
+ * },
3465
+ * });
3466
+ *
3467
+ * await tts.load();
3468
+ * for await (const chunk of tts.stream("Hello world!")) {
3469
+ * playbackPipeline.feedBuffer(chunk.audio);
3470
+ * }
3471
+ * ```
3472
+ */
3473
+
3474
+ /**
3475
+ * Result from the consumer-provided synthesize function.
3476
+ */
3477
+ interface PollySynthesizeResult {
3478
+ /** Raw PCM audio bytes (Int16 LE) */
3479
+ audio: ArrayBuffer;
3480
+ /** Content type from Polly response (e.g., 'audio/pcm') */
3481
+ contentType: string;
3482
+ }
3483
+ /**
3484
+ * Configuration for PollyTTSBackend.
3485
+ *
3486
+ * The `synthesizeFn` callback lets consumers use their own AWS SDK setup
3487
+ * (credentials, region, SDK version) without @omote/core depending on `@aws-sdk/client-polly`.
3488
+ */
3489
+ interface PollyConfig {
3490
+ /**
3491
+ * Consumer-provided function that calls AWS Polly.
3492
+ * Must return PCM audio (Int16 LE) at the requested sample rate.
3493
+ *
3494
+ * @param text - Text to synthesize
3495
+ * @param voice - Polly voice ID (e.g., 'Joanna')
3496
+ * @param sampleRate - Requested output sample rate (e.g., 16000)
3497
+ * @returns PCM audio buffer and content type
3498
+ */
3499
+ synthesizeFn: (text: string, voice: string, sampleRate: number) => Promise<PollySynthesizeResult>;
3500
+ /** Polly voice ID (default: 'Joanna') */
3501
+ voice?: string;
3502
+ /** Output sample rate in Hz (default: 16000) */
3503
+ sampleRate?: number;
3504
+ /** Polly engine type (default: 'neural') */
3505
+ engine?: 'neural' | 'standard' | 'generative' | 'long-form';
3506
+ }
3507
+ declare class PollyTTSBackend implements TTSBackend {
3508
+ private readonly synthesizeFn;
3509
+ private readonly voice;
3510
+ private readonly _sampleRate;
3511
+ private readonly engine;
3512
+ private _isLoaded;
3513
+ constructor(config: PollyConfig);
3514
+ get sampleRate(): number;
3515
+ get isLoaded(): boolean;
3516
+ /**
3517
+ * No-op for cloud TTS (no model to load).
3518
+ * Marks backend as ready.
3519
+ */
3520
+ load(): Promise<void>;
3521
+ /**
3522
+ * Synthesize audio via consumer's Polly function.
3523
+ *
3524
+ * Polly's SynthesizeSpeech is request/response (not streaming for PCM),
3525
+ * so this yields a single chunk per call. For long text, consider splitting
3526
+ * into sentences on the consumer side.
3527
+ */
3528
+ stream(text: string, options?: TTSStreamOptions): AsyncGenerator<TTSChunk>;
3529
+ dispose(): Promise<void>;
3530
+ }
3531
+
3532
+ /**
3533
+ * ORT CDN configuration
3534
+ *
3535
+ * Allows consumers to override the CDN base URL used for loading
3536
+ * ONNX Runtime WASM/WebGPU binaries. By default, ORT loads from
3537
+ * its bundled CDN path. Use {@link configureOrtCdn} to point at
3538
+ * a self-hosted or enterprise CDN.
3539
+ *
3540
+ * @category Inference
3541
+ */
3542
+ /**
3543
+ * Override the CDN base URL for ONNX Runtime WASM/WebGPU binaries.
3544
+ *
3545
+ * Must be an HTTPS URL or a relative path (starts with `/` or `./`).
3546
+ * Call this once at app startup, before loading any models.
3547
+ *
3548
+ * @param cdnPath - HTTPS URL or relative path to ORT binaries directory
3549
+ * @throws If cdnPath is neither an HTTPS URL nor a relative path
3550
+ */
3551
+ declare function configureOrtCdn(cdnPath: string): void;
3552
+ /**
3553
+ * Get the current ORT CDN base URL override, or null if using defaults.
3554
+ */
3555
+ declare function getOrtCdnBase(): string | null;
3556
+
3565
3557
  /**
3566
3558
  * Emotion - Helper for creating emotion vectors for avatar animation
3567
3559
  *
@@ -3601,6 +3593,8 @@ type EmotionName = typeof EMOTION_NAMES[number];
3601
3593
  type EmotionWeights = Partial<Record<EmotionName, number>>;
3602
3594
  /** Total emotion vector size */
3603
3595
  declare const EMOTION_VECTOR_SIZE = 26;
3596
+ /** Number of explicit emotion channels */
3597
+ declare const EXPLICIT_EMOTION_COUNT = 10;
3604
3598
  /**
3605
3599
  * Create an emotion vector from named weights
3606
3600
  *
@@ -4099,7 +4093,54 @@ declare const MetricNames: {
4099
4093
  readonly CACHE_HITS: "omote.cache.hits";
4100
4094
  /** Counter: Cache misses */
4101
4095
  readonly CACHE_MISSES: "omote.cache.misses";
4096
+ /** Counter: Cache stale (version/etag mismatch) */
4097
+ readonly CACHE_STALE: "omote.cache.stale";
4098
+ /** Counter: Cache quota warning (>90% used) */
4099
+ readonly CACHE_QUOTA_WARNING: "omote.cache.quota_warning";
4100
+ /** Counter: Cache eviction (LRU) */
4101
+ readonly CACHE_EVICTION: "omote.cache.eviction";
4102
+ /** Histogram: VoicePipeline turn latency (speech end → transcript ready, excludes playback) */
4103
+ readonly VOICE_TURN_LATENCY: "omote.voice.turn.latency";
4104
+ /** Histogram: ASR transcription latency in ms */
4105
+ readonly VOICE_TRANSCRIPTION_LATENCY: "omote.voice.transcription.latency";
4106
+ /** Histogram: Response handler latency in ms */
4107
+ readonly VOICE_RESPONSE_LATENCY: "omote.voice.response.latency";
4108
+ /** Counter: Total transcriptions */
4109
+ readonly VOICE_TRANSCRIPTIONS: "omote.voice.transcriptions";
4110
+ /** Counter: Total interruptions */
4111
+ readonly VOICE_INTERRUPTIONS: "omote.voice.interruptions";
4112
+ /** Histogram: PlaybackPipeline session duration in ms */
4113
+ readonly PLAYBACK_SESSION_DURATION: "omote.playback.session.duration";
4114
+ /** Histogram: Audio chunk processing latency in ms */
4115
+ readonly PLAYBACK_CHUNK_LATENCY: "omote.playback.chunk.latency";
4116
+ /** Histogram: TTSSpeaker.connect() latency in ms */
4117
+ readonly TTS_CONNECT_LATENCY: "omote.tts.connect.latency";
4118
+ /** Histogram: TTSSpeaker.speak() latency in ms */
4119
+ readonly TTS_SPEAK_LATENCY: "omote.tts.speak.latency";
4120
+ /** Counter: TTSSpeaker.stop() aborted speak calls */
4121
+ readonly TTS_SPEAK_ABORTED: "omote.tts.speak.aborted";
4122
+ /** Counter: MicLipSync sessions started */
4123
+ readonly MIC_SESSIONS: "omote.mic.sessions";
4124
+ /** Histogram: CharacterController.update() latency in µs */
4125
+ readonly AVATAR_FRAME_LATENCY: "omote.avatar.frame.latency_us";
4126
+ /** Histogram: FaceCompositor.compose() latency in µs */
4127
+ readonly COMPOSITOR_COMPOSE_LATENCY: "omote.compositor.compose.latency_us";
4128
+ /** Counter: Frames exceeding budget threshold */
4129
+ readonly AVATAR_FRAME_DROPS: "omote.avatar.frame.drops";
4130
+ };
4131
+ /**
4132
+ * Centralized error type taxonomy for structured error reporting.
4133
+ */
4134
+ declare const ErrorTypes: {
4135
+ readonly INFERENCE: "inference_error";
4136
+ readonly NETWORK: "network_error";
4137
+ readonly TIMEOUT: "timeout";
4138
+ readonly USER: "user_error";
4139
+ readonly RUNTIME: "runtime_error";
4140
+ readonly MEDIA: "media_error";
4141
+ readonly MODEL: "model_error";
4102
4142
  };
4143
+ type ErrorType = typeof ErrorTypes[keyof typeof ErrorTypes];
4103
4144
  /**
4104
4145
  * Histogram buckets for inference latency (ms)
4105
4146
  */
@@ -4177,6 +4218,7 @@ declare class OmoteTelemetry {
4177
4218
  private exporter;
4178
4219
  private activeTraceId;
4179
4220
  private metricsIntervalId;
4221
+ private spanStack;
4180
4222
  private counters;
4181
4223
  private histograms;
4182
4224
  constructor(config: TelemetryConfig);
@@ -4274,6 +4316,14 @@ declare class OmoteTelemetry {
4274
4316
  * Get current configuration
4275
4317
  */
4276
4318
  getConfig(): TelemetryConfig;
4319
+ /**
4320
+ * Get the active span context for log-to-span correlation.
4321
+ * Returns the most recent (top of stack) active span, or null if none.
4322
+ */
4323
+ getActiveContext(): {
4324
+ traceId: string;
4325
+ spanId: string;
4326
+ } | null;
4277
4327
  }
4278
4328
 
4279
4329
  /**
@@ -4886,6 +4936,7 @@ declare class ProceduralLifeLayer {
4886
4936
  private noiseTime;
4887
4937
  private previousEnergy;
4888
4938
  private emphasisLevel;
4939
+ private readonly _outputBlendshapes;
4889
4940
  constructor(config?: LifeLayerConfig);
4890
4941
  /**
4891
4942
  * Update the life layer and produce output for this frame.
@@ -4928,6 +4979,113 @@ declare class ProceduralLifeLayer {
4928
4979
  private updateBrowNoise;
4929
4980
  }
4930
4981
 
4982
+ /**
4983
+ * Body Animation — Renderer-agnostic interfaces and utilities.
4984
+ *
4985
+ * Defines the contract for body animation controllers that each renderer
4986
+ * adapter (@omote/three, @omote/babylon, @omote/r3f) implements natively.
4987
+ *
4988
+ * Also provides the shared bone filtering logic used during animation
4989
+ * retargeting — stripping head/neck/eye tracks so body animations don't
4990
+ * conflict with the face pipeline (FaceCompositor, gaze, ProceduralLifeLayer).
4991
+ *
4992
+ * @module animation
4993
+ */
4994
+ /**
4995
+ * Renderer-agnostic animation controller interface.
4996
+ *
4997
+ * Each renderer adapter implements this against its native animation system:
4998
+ * - @omote/three → THREE.AnimationMixer + AnimationAction
4999
+ * - @omote/babylon → Babylon.js AnimationGroup
5000
+ * - @omote/r3f → React hook wrapping the Three.js implementation
5001
+ *
5002
+ * Python/Node ports implement this against their own runtimes.
5003
+ */
5004
+ interface AnimationController {
5005
+ /** Play an animation by id. */
5006
+ play(id: string, options?: {
5007
+ fadeInDuration?: number;
5008
+ }): void;
5009
+ /** Stop all playing animations. */
5010
+ stop(fadeOutDuration?: number): void;
5011
+ /** Crossfade from current animation to target. */
5012
+ crossfadeTo(id: string, duration?: number): void;
5013
+ /** Check if a specific animation is currently playing. */
5014
+ isPlaying(id: string): boolean;
5015
+ /** Check if an animation with this id is loaded. */
5016
+ hasAnimation(id: string): boolean;
5017
+ /** List of loaded animation ids. */
5018
+ readonly availableAnimations: string[];
5019
+ }
5020
+ /**
5021
+ * Describes an external animation asset to load and configure.
5022
+ * Renderer-agnostic — loaders are adapter-specific.
5023
+ */
5024
+ interface AnimationSource {
5025
+ /** Unique identifier for this animation. */
5026
+ id: string;
5027
+ /** URL to the animation file (FBX, GLB, etc.). */
5028
+ url: string;
5029
+ /** Clip name within the file (if it contains multiple clips). */
5030
+ clipName?: string;
5031
+ /** Playback options. */
5032
+ options?: AnimationSourceOptions;
5033
+ }
5034
+ interface AnimationSourceOptions {
5035
+ loop?: boolean;
5036
+ timeScale?: number;
5037
+ fadeInDuration?: number;
5038
+ fadeOutDuration?: number;
5039
+ clampWhenFinished?: boolean;
5040
+ }
5041
+ /**
5042
+ * Configuration for filtering bone tracks from body animations.
5043
+ *
5044
+ * The face pipeline (FaceCompositor, gaze tracking, ProceduralLifeLayer) owns
5045
+ * certain bones (head, neck, eyes). Body animations must strip these tracks
5046
+ * to prevent conflicts.
5047
+ */
5048
+ interface BoneFilterConfig {
5049
+ /** Bone names owned by the face pipeline (e.g., ['Head', 'Neck', 'LeftEye', 'RightEye']). */
5050
+ proceduralBones: string[];
5051
+ /** Whether to strip .position tracks (keep only quaternion/rotation). */
5052
+ filterPositionTracks: boolean;
5053
+ /** Whether to strip morphTargetInfluences tracks. */
5054
+ filterMorphTargets: boolean;
5055
+ }
5056
+ /** Mixamo bone name prefix (stripped during retargeting). */
5057
+ declare const MIXAMO_PREFIX = "mixamorig";
5058
+ /**
5059
+ * Bones that need position tracks preserved during retargeting.
5060
+ * Stripping finger/hand position tracks causes fingers to splay to bind pose.
5061
+ */
5062
+ declare const PRESERVE_POSITION_BONES: Set<string>;
5063
+ /** Default bone filter for RPM/Mixamo avatars. */
5064
+ declare const DEFAULT_BONE_FILTER: BoneFilterConfig;
5065
+ /**
5066
+ * A generic animation track descriptor. Renderers map their native track
5067
+ * objects to this shape for filtering, then map back.
5068
+ */
5069
+ interface TrackDescriptor {
5070
+ /** Full track name, e.g. "mixamorigHips.quaternion" or "Head.position". */
5071
+ name: string;
5072
+ }
5073
+ /**
5074
+ * Filter animation tracks according to a BoneFilterConfig.
5075
+ *
5076
+ * This is the renderer-agnostic core of `retargetClip`. Renderer adapters
5077
+ * call this with their native track names and use the result to decide
5078
+ * which tracks to keep.
5079
+ *
5080
+ * @returns true if the track should be KEPT (not filtered out).
5081
+ */
5082
+ declare function shouldKeepTrack(trackName: string, config: BoneFilterConfig): boolean;
5083
+ /**
5084
+ * Strip Mixamo prefix from a track name.
5085
+ * "mixamorigHips.quaternion" → "Hips.quaternion"
5086
+ */
5087
+ declare function stripMixamoPrefix(trackName: string): string;
5088
+
4931
5089
  /**
4932
5090
  * FACS (Facial Action Coding System) to ARKit Blendshape Mapping
4933
5091
  *
@@ -5147,6 +5305,41 @@ declare class FaceCompositor {
5147
5305
  private applyProfileArrays;
5148
5306
  }
5149
5307
 
5308
+ /**
5309
+ * TextEmotionAnalyzer — Lightweight keyword heuristic for mapping AI response
5310
+ * text to an emotion label.
5311
+ *
5312
+ * Returns null if no strong signal is detected (keeps current emotion).
5313
+ *
5314
+ * @category Face
5315
+ */
5316
+ /**
5317
+ * Analyze AI response text for emotional content.
5318
+ *
5319
+ * @param text - The AI response text to analyze
5320
+ * @returns An emotion label string, or null if no strong signal detected
5321
+ */
5322
+ declare function analyzeTextEmotion(text: string): string | null;
5323
+
5324
+ /**
5325
+ * EmotionTagParser — Strips `[tag]` emotion annotations from LLM response text.
5326
+ *
5327
+ * LLMs can self-annotate responses with emotion tags like `[excited]` or `[sad]`.
5328
+ * This parser extracts the first valid tag and returns clean display text.
5329
+ *
5330
+ * @category Face
5331
+ */
5332
+ /**
5333
+ * Parse emotion tags from LLM response text.
5334
+ *
5335
+ * @param text - Raw LLM response text, possibly containing `[emotion]` tags
5336
+ * @returns Object with clean display text and extracted emotion label (or null)
5337
+ */
5338
+ declare function parseEmotionTags(text: string): {
5339
+ cleanText: string;
5340
+ emotion: string | null;
5341
+ };
5342
+
5150
5343
  /**
5151
5344
  * CharacterController — Renderer-agnostic avatar composition loop
5152
5345
  *
@@ -5230,6 +5423,9 @@ declare class CharacterController {
5230
5423
  private readonly gazeYawInfluence;
5231
5424
  private readonly gazePitchInfluence;
5232
5425
  private readonly gazeSmoothing;
5426
+ private readonly frameTimes;
5427
+ private frameTimeIdx;
5428
+ private frameTimeFill;
5233
5429
  private readonly zeroBase;
5234
5430
  private readonly outputBuffer;
5235
5431
  private readonly compositorInput;
@@ -5249,6 +5445,17 @@ declare class CharacterController {
5249
5445
  setProfile(profile: CharacterProfile): void;
5250
5446
  /** Access underlying FaceCompositor for advanced use. */
5251
5447
  get compositor(): FaceCompositor;
5448
+ /**
5449
+ * Get a snapshot of frame budget performance (rolling 2-second window).
5450
+ * Useful for runtime diagnostics / dev overlays.
5451
+ */
5452
+ getPerformanceSnapshot(): {
5453
+ avgFrameUs: number;
5454
+ maxFrameUs: number;
5455
+ p95FrameUs: number;
5456
+ droppedFrames: number;
5457
+ totalFrames: number;
5458
+ };
5252
5459
  /** Reset all state (smoothing, life layer, emotions). */
5253
5460
  reset(): void;
5254
5461
  dispose(): void;
@@ -5285,7 +5492,7 @@ interface MicLipSyncConfig {
5285
5492
  micChunkSize?: number;
5286
5493
  /** Per-character expression weight scaling */
5287
5494
  profile?: ExpressionProfile;
5288
- /** Identity/style index for Wav2Vec2 (default: 0) */
5495
+ /** Identity/style index for A2E model (default: 0) */
5289
5496
  identityIndex?: number;
5290
5497
  }
5291
5498
  interface MicLipSyncFrame {
@@ -5324,9 +5531,10 @@ declare class MicLipSync extends EventEmitter<MicLipSyncEvents> {
5324
5531
  private _state;
5325
5532
  private _isSpeaking;
5326
5533
  private _currentFrame;
5327
- private _currentRawFrame;
5328
5534
  private profile;
5329
5535
  private _firstFrameEmitted;
5536
+ private readonly _profileBuffer;
5537
+ private vadQueue;
5330
5538
  private speechStartTime;
5331
5539
  private vadChunkSize;
5332
5540
  private vadBuffer;
@@ -5356,47 +5564,6 @@ declare class MicLipSync extends EventEmitter<MicLipSyncEvents> {
5356
5564
  private setState;
5357
5565
  }
5358
5566
 
5359
- /**
5360
- * Shared types for orchestration layer
5361
- *
5362
- * @category Orchestration
5363
- */
5364
-
5365
- type VoicePipelineState = 'idle' | 'loading' | 'ready' | 'listening' | 'thinking' | 'speaking' | 'error';
5366
- interface LoadingProgress {
5367
- currentModel: string;
5368
- progress: number;
5369
- totalModels: number;
5370
- modelsLoaded: number;
5371
- }
5372
- interface TranscriptResult {
5373
- text: string;
5374
- emotion?: string;
5375
- language?: string;
5376
- event?: string;
5377
- isFinal: boolean;
5378
- inferenceTimeMs?: number;
5379
- }
5380
- /**
5381
- * Consumer's response handler. VoicePipeline calls this with transcribed text.
5382
- * Consumer must stream audio back for playback + lip sync.
5383
- */
5384
- interface ResponseHandler {
5385
- (params: {
5386
- text: string;
5387
- emotion?: string;
5388
- event?: string;
5389
- /** Stream audio chunks to pipeline for playback + lip sync */
5390
- send: (chunk: Uint8Array) => Promise<void>;
5391
- /** Call when all audio has been sent */
5392
- done: () => Promise<void>;
5393
- /** Aborted on interruption or stop() */
5394
- signal: AbortSignal;
5395
- /** Session ID for backend correlation */
5396
- sessionId: string;
5397
- }): Promise<void>;
5398
- }
5399
-
5400
5567
  /**
5401
5568
  * VoicePipeline - Full conversational agent loop
5402
5569
  *
@@ -5429,10 +5596,9 @@ interface VoicePipelineBaseConfig {
5429
5596
  language?: string;
5430
5597
  };
5431
5598
  lam: {
5432
- gpuModelUrl: string;
5433
- gpuExternalDataUrl?: string | false;
5434
- cpuModelUrl: string;
5435
- mode?: 'auto' | 'gpu' | 'cpu';
5599
+ modelUrl: string;
5600
+ externalDataUrl?: string | false;
5601
+ backend?: 'auto' | 'webgpu' | 'wasm';
5436
5602
  };
5437
5603
  vad: {
5438
5604
  modelUrl: string;
@@ -5442,10 +5608,8 @@ interface VoicePipelineBaseConfig {
5442
5608
  };
5443
5609
  /** Per-character expression weight scaling */
5444
5610
  profile?: ExpressionProfile;
5445
- /** Identity/style index for Wav2Vec2 (default: 0) */
5611
+ /** Identity/style index for A2E model (default: 0) */
5446
5612
  identityIndex?: number;
5447
- /** LAM load timeout in ms — CPU fallback on timeout (default: 30000) */
5448
- lamLoadTimeoutMs?: number;
5449
5613
  /** Base silence timeout in ms (default: 500) */
5450
5614
  silenceTimeoutMs?: number;
5451
5615
  /** Extended silence timeout for long utterances (default: 700) */
@@ -5514,13 +5678,7 @@ interface VoicePipelineLocalConfig extends VoicePipelineBaseConfig {
5514
5678
  /** Optional text transform (e.g., LLM call). Receives transcript, returns response text. */
5515
5679
  onTranscript?: (text: string) => string | Promise<string>;
5516
5680
  }
5517
- /** Legacy config (no mode field) — treated as cloud mode. @deprecated Use mode: 'cloud' explicitly. */
5518
- interface VoicePipelineLegacyConfig extends VoicePipelineBaseConfig {
5519
- mode?: undefined;
5520
- /** Consumer's response handler */
5521
- onResponse: ResponseHandler;
5522
- }
5523
- type VoicePipelineConfig = VoicePipelineCloudConfig | VoicePipelineLocalConfig | VoicePipelineLegacyConfig;
5681
+ type VoicePipelineConfig = VoicePipelineCloudConfig | VoicePipelineLocalConfig;
5524
5682
  interface VoicePipelineEvents {
5525
5683
  'state': VoicePipelineState;
5526
5684
  'loading:progress': LoadingProgress;
@@ -5558,6 +5716,7 @@ declare class VoicePipeline extends EventEmitter<VoicePipelineEvents> {
5558
5716
  private interruption;
5559
5717
  private omoteEvents;
5560
5718
  private mic;
5719
+ private static readonly MAX_AUDIO_BUFFER_SAMPLES;
5561
5720
  private audioBuffer;
5562
5721
  private audioBufferSamples;
5563
5722
  private speechStartTime;
@@ -5568,7 +5727,10 @@ declare class VoicePipeline extends EventEmitter<VoicePipelineEvents> {
5568
5727
  private lastProgressiveResult;
5569
5728
  private lastProgressiveSamples;
5570
5729
  private asrErrorCount;
5730
+ private progressiveErrorCount;
5571
5731
  private responseAbortController;
5732
+ private _unsubChunk;
5733
+ private _unsubLevel;
5572
5734
  private _currentFrame;
5573
5735
  /** Current pipeline state */
5574
5736
  get state(): VoicePipelineState;
@@ -5586,7 +5748,7 @@ declare class VoicePipeline extends EventEmitter<VoicePipelineEvents> {
5586
5748
  */
5587
5749
  private loadFromBackends;
5588
5750
  /**
5589
- * Load from factories (original path). Now loads SenseVoice, LAM, and VAD in parallel.
5751
+ * Load from factories (original path). Loads SenseVoice, LAM, and VAD in parallel.
5590
5752
  */
5591
5753
  private loadFromFactories;
5592
5754
  start(): Promise<void>;
@@ -5612,4 +5774,86 @@ declare class VoicePipeline extends EventEmitter<VoicePipelineEvents> {
5612
5774
  private clearSilenceTimer;
5613
5775
  }
5614
5776
 
5615
- export { type A2EBackend, type A2EModelInfo, A2EOrchestrator, type A2EOrchestratorConfig, A2EProcessor, type A2EProcessorConfig, type A2EProgressEvent, type A2EResult, ALL_AUS, ARKIT_BLENDSHAPES, type AUActivation, AU_TO_ARKIT, type ActiveSpan, type AnimationClip, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, BLENDSHAPE_TO_GROUP, type BackendPreference, type BlendWeight, type BlendshapeGroup, BlendshapeSmoother, type BlendshapeSmootherConfig, CTC_VOCAB, type CacheConfig, type CacheSpanAttributes, CharacterController, type CharacterControllerConfig, type CharacterProfile, type CharacterUpdateInput, type CharacterUpdateOutput, ConsoleExporter, type ConversationalState, type CreateA2EConfig, type CreateKokoroTTSConfig, type CreateSenseVoiceConfig, DEFAULT_ANIMATION_CONFIG, DEFAULT_MODEL_URLS, EMOTION_NAMES, EMOTION_TO_AU, EMOTION_VECTOR_SIZE, type EmotionAnimationMap, EmotionController, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, EmotionResolver, type EmotionWeights, EmphasisDetector, EventEmitter, type ExpressionProfile, FaceCompositor, type FaceCompositorConfig, type FaceCompositorInput, type FaceCompositorOutput, type FetchWithCacheOptions, type FullFaceFrame, FullFacePipeline, type FullFacePipelineEvents, type FullFacePipelineOptions, HF_CDN_URLS, INFERENCE_LATENCY_BUCKETS, type InferenceFactoryConfig, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, KOKORO_VOICES, type KokoroStreamChunk, type KokoroTTSConfig, KokoroTTSInference, type KokoroTTSModelInfo, type KokoroTTSResult, KokoroTTSUnifiedAdapter, KokoroTTSWorker, type KokoroVoiceName, LAM_BLENDSHAPES, type LifeLayerConfig, type LifeLayerInput, type LifeLayerOutput, type 
LoadingProgress, MODEL_LOAD_TIME_BUCKETS, type MetricData, MetricNames, MicLipSync, type MicLipSyncConfig, type MicLipSyncEvents, type MicLipSyncFrame, type MicLipSyncState, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, type ModelUrlKey, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, PlaybackPipeline, type PlaybackPipelineConfig, type PlaybackPipelineEvents, type PlaybackState, ProceduralLifeLayer, type Quat, type QuotaInfo, type ResolvedEmotion, type ResponseHandler, RingBuffer, type RuntimeBackend, type SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SenseVoiceBackend, type SenseVoiceConfig, SenseVoiceInference, type SenseVoiceLanguage, type SenseVoiceModelInfo, type SenseVoiceResult, SenseVoiceUnifiedAdapter, SenseVoiceWorker, type SenseVoiceWorkerConfig, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADUnifiedAdapter, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, type SynthesizeOptions, type TTSBackend, type TTSChunk, TTSPlayback, type TTSPlaybackConfig, type TTSPlaybackEvents, type TTSStreamOptions, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TranscriptResult, type Transition, UnifiedInferenceWorker, type VADBackend, type VADModelInfo, type VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, type Vec3, VoicePipeline, type VoicePipelineCloudConfig, type VoicePipelineConfig, type VoicePipelineEvents, type VoicePipelineLocalConfig, type VoicePipelineState, type Wav2ArkitCpuConfig, Wav2ArkitCpuInference, Wav2ArkitCpuUnifiedAdapter, Wav2ArkitCpuWorker, type Wav2ArkitCpuWorkerConfig, Wav2Vec2Inference, type Wav2Vec2InferenceConfig, type Wav2Vec2Result, Wav2Vec2UnifiedAdapter, type WorkerHealthState, applyProfile, blendEmotions, calculatePeak, 
calculateRMS, configureCacheLimit, configureModelUrls, configureTelemetry, createA2E, createEmotionVector, createKokoroTTS, createSenseVoice, createSileroVAD, fetchWithCache, float32ToPcm16, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getModelCache, getOptimalWasmThreads, getRecommendedBackend, getTelemetry, hasWebGPUApi, int16ToFloat32, isAndroid, isIOS, isIOSSafari, isMobile, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpBlendshapes, lerpEmotion, listVoices as listKokoroVoices, pcm16ToFloat32, preloadModels, resampleLinear, resetModelUrls, resolveBackend, resolveEmotion, shouldEnableWasmProxy, shouldUseCpuA2E, shouldUseNativeASR, shouldUseServerA2E, supportsVADWorker, ttsToPlaybackFormat };
5777
+ /**
5778
+ * VoiceOrchestrator — Shared voice wiring for OmoteAvatar adapters.
5779
+ *
5780
+ * Composes TTSSpeaker (local mode) or PlaybackPipeline (cloud mode) with
5781
+ * SpeechListener and InterruptionHandler. Supports both local TTS and
5782
+ * cloud TTS via discriminated union config.
5783
+ *
5784
+ * Extracted from the ~70 identical lines duplicated across three/babylon/r3f
5785
+ * adapters into a single reusable class.
5786
+ *
5787
+ * @category Orchestration
5788
+ */
5789
+
5790
+ interface VoiceOrchestratorBaseConfig {
5791
+ listener?: SpeechListenerConfig;
5792
+ interruptionEnabled?: boolean;
5793
+ profile?: ExpressionProfile;
5794
+ }
5795
+ interface VoiceOrchestratorLocalConfig extends VoiceOrchestratorBaseConfig {
5796
+ mode?: 'local';
5797
+ tts: TTSBackend;
5798
+ speaker?: TTSSpeakerConfig;
5799
+ onTranscript: (text: string, emotion?: string) => string | Promise<string> | AsyncGenerator<string>;
5800
+ }
5801
+ interface VoiceOrchestratorCloudConfig extends VoiceOrchestratorBaseConfig {
5802
+ mode: 'cloud';
5803
+ onResponse: ResponseHandler;
5804
+ lam?: {
5805
+ modelUrl?: string;
5806
+ externalDataUrl?: string | false;
5807
+ };
5808
+ }
5809
+ type VoiceOrchestratorConfig = VoiceOrchestratorLocalConfig | VoiceOrchestratorCloudConfig;
5810
+ interface VoiceOrchestratorEvents {
5811
+ 'state': ConversationalState;
5812
+ 'transcript': TranscriptResult;
5813
+ [key: string]: unknown;
5814
+ }
5815
+ declare class VoiceOrchestrator extends EventEmitter<VoiceOrchestratorEvents> {
5816
+ private speechListener;
5817
+ private interruption;
5818
+ private ttsSpeaker;
5819
+ private playbackPipeline;
5820
+ private ownedLam;
5821
+ private transcriptUnsub;
5822
+ private audioChunkUnsub;
5823
+ private connectEpoch;
5824
+ private responseAbortController;
5825
+ private _state;
5826
+ private _isSpeaking;
5827
+ private _frameSource;
5828
+ private _mode;
5829
+ private _sessionId;
5830
+ get state(): ConversationalState;
5831
+ get isSpeaking(): boolean;
5832
+ get frameSource(): FrameSource | null;
5833
+ /** Access the internal SpeechListener. */
5834
+ get listener(): SpeechListener | null;
5835
+ /** Access the internal TTSSpeaker (local mode only). */
5836
+ get speaker(): TTSSpeaker | null;
5837
+ connect(config: VoiceOrchestratorConfig): Promise<void>;
5838
+ disconnect(): Promise<void>;
5839
+ startListening(): Promise<void>;
5840
+ stopListening(): void;
5841
+ speak(text: string, options?: {
5842
+ signal?: AbortSignal;
5843
+ voice?: string;
5844
+ }): Promise<void>;
5845
+ streamText(options?: {
5846
+ signal?: AbortSignal;
5847
+ voice?: string;
5848
+ }): Promise<{
5849
+ push: (token: string) => void;
5850
+ end: () => Promise<void>;
5851
+ }>;
5852
+ stopSpeaking(): void;
5853
+ private wireLocalTranscript;
5854
+ private wireCloudTranscript;
5855
+ private handleInterruption;
5856
+ private setState;
5857
+ }
5858
+
5859
+ export { type A2EBackend, A2EInference, type A2EInferenceConfig, type A2EModelInfo, A2EProcessor, type A2EProcessorConfig, type A2EResult, A2EUnifiedAdapter, ALL_AUS, ARKIT_BLENDSHAPES, type AUActivation, AU_TO_ARKIT, type ActiveSpan, type AnimationClip, type AnimationController, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationSource, type AnimationSourceOptions, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, BLENDSHAPE_TO_GROUP, type BackendPreference, type BlendWeight, type BlendshapeGroup, BlendshapeSmoother, type BlendshapeSmootherConfig, type BoneFilterConfig, type CacheConfig, type CacheSpanAttributes, CharacterController, type CharacterControllerConfig, type CharacterProfile, type CharacterUpdateInput, type CharacterUpdateOutput, ConsoleExporter, type ConversationalState, type CreateA2EConfig, type CreateKokoroTTSConfig, type CreateSenseVoiceConfig, type CreateTTSPlayerConfig, DEFAULT_ANIMATION_CONFIG, DEFAULT_BONE_FILTER, DEFAULT_MODEL_URLS, EMOTION_NAMES, EMOTION_TO_AU, EMOTION_VECTOR_SIZE, EXPLICIT_EMOTION_COUNT, type ElevenLabsConfig, ElevenLabsTTSBackend, type EmotionAnimationMap, EmotionController, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, EmotionResolver, type EmotionWeights, EmphasisDetector, type ErrorType, ErrorTypes, EventEmitter, type ExpressionProfile, FaceCompositor, type FaceCompositorConfig, type FaceCompositorInput, type FaceCompositorOutput, type FetchWithCacheOptions, type FrameSource, type FullFaceFrame, HF_CDN_URLS, INFERENCE_LATENCY_BUCKETS, type InferenceFactoryConfig, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, KOKORO_VOICES, type KokoroStreamChunk, type KokoroTTSConfig, KokoroTTSInference, type KokoroTTSModelInfo, type 
KokoroTTSResult, KokoroTTSUnifiedAdapter, KokoroTTSWorker, type KokoroVoiceName, LAM_BLENDSHAPES, type LifeLayerConfig, type LifeLayerInput, type LifeLayerOutput, type LoadingProgress, MIXAMO_PREFIX, MODEL_LOAD_TIME_BUCKETS, type MetricData, MetricNames, MicLipSync, type MicLipSyncConfig, type MicLipSyncEvents, type MicLipSyncFrame, type MicLipSyncState, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, type ModelUrlKey, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, PRESERVE_POSITION_BONES, PlaybackPipeline, type PlaybackPipelineConfig, type PlaybackPipelineEvents, type PlaybackState, type PollyConfig, type PollySynthesizeResult, PollyTTSBackend, ProceduralLifeLayer, type Quat, type QuotaInfo, type ResolvedEmotion, type ResponseHandler, RingBuffer, type RuntimeBackend, type SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SenseVoiceBackend, type SenseVoiceConfig, SenseVoiceInference, type SenseVoiceLanguage, type SenseVoiceModelInfo, type SenseVoiceResult, SenseVoiceUnifiedAdapter, SenseVoiceWorker, type SenseVoiceWorkerConfig, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADUnifiedAdapter, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, SpeechListener, type SpeechListenerConfig, type SpeechListenerEvents, type SpeechListenerState, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, type SynthesizeOptions, type TTSBackend, type TTSChunk, TTSPlayback, type TTSPlaybackConfig, type TTSPlaybackEvents, TTSPlayer, TTSSpeaker, type TTSSpeakerConfig, type TTSStreamOptions, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TrackDescriptor, type TranscriptResult, type Transition, UnifiedInferenceWorker, type VADBackend, type VADModelInfo, type VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, type Vec3, VoiceOrchestrator, 
type VoiceOrchestratorCloudConfig, type VoiceOrchestratorConfig, type VoiceOrchestratorEvents, type VoiceOrchestratorLocalConfig, VoicePipeline, type VoicePipelineCloudConfig, type VoicePipelineConfig, type VoicePipelineEvents, type VoicePipelineLocalConfig, type VoicePipelineState, A2EInference as Wav2Vec2Inference, type WorkerHealthState, analyzeTextEmotion, applyProfile, blendEmotions, calculatePeak, calculateRMS, configureCacheLimit, configureModelUrls, configureOrtCdn, configureTelemetry, createA2E, createEmotionVector, createKokoroTTS, createSenseVoice, createSileroVAD, createTTSPlayer, fetchWithCache, float32ToPcm16, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getModelCache, getOptimalWasmThreads, getOrtCdnBase, getRecommendedBackend, getTelemetry, hasWebGPUApi, int16ToFloat32, isAndroid, isIOS, isIOSSafari, isMobile, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpBlendshapes, lerpEmotion, listVoices as listKokoroVoices, parseEmotionTags, pcm16ToFloat32, preloadModels, resampleLinear, resetModelUrls, resolveBackend, resolveEmotion, shouldEnableWasmProxy, shouldKeepTrack, shouldUseNativeASR, shouldUseServerA2E, stripMixamoPrefix, supportsVADWorker, ttsToPlaybackFormat, validateTTSInput };