@omote/core 0.9.6 → 0.10.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.mts CHANGED
@@ -470,7 +470,7 @@ declare function shouldUseServerA2E(): boolean;
470
470
  /**
471
471
  * Common interface for audio-to-expression (A2E) inference backends
472
472
  *
473
- * Implemented by A2EInference and A2EUnifiedAdapter, allowing PlaybackPipeline
473
+ * Implemented by A2EUnifiedAdapter, allowing PlaybackPipeline
474
474
  * and A2EProcessor to work with either implementation transparently.
475
475
  *
476
476
  * @category Inference
@@ -488,11 +488,11 @@ interface A2EModelInfo {
488
488
  /**
489
489
  * Result from A2E inference
490
490
  *
491
- * All implementations must return blendshapes in LAM_BLENDSHAPES order (alphabetical).
491
+ * All implementations must return blendshapes in ARKIT_BLENDSHAPES order (alphabetical).
492
492
  * Models with different native orderings must remap internally before returning.
493
493
  */
494
494
  interface A2EResult {
495
- /** Blendshape weights [frames, 52] in LAM_BLENDSHAPES order - 30fps */
495
+ /** Blendshape weights [frames, 52] in ARKIT_BLENDSHAPES order - 30fps */
496
496
  blendshapes: Float32Array[];
497
497
  /** Number of blendshape frames */
498
498
  numFrames: number;
@@ -507,10 +507,8 @@ interface A2EResult {
507
507
  * pipeline — A2E is the interface abstraction, LAM is the model.
508
508
  *
509
509
  * Implemented by:
510
- * - {@link A2EInference} (WebGPU/WASM, 192MB fp16)
511
- * - A2EUnifiedAdapter (shared unified worker)
510
+ * - {@link A2EUnifiedAdapter} (shared unified worker)
512
511
  *
513
- * @see {@link A2EInference} for direct usage
514
512
  * @see {@link createA2E} for the recommended factory API
515
513
  */
516
514
  interface A2EBackend {
@@ -531,7 +529,7 @@ interface A2EBackend {
531
529
  * Run inference on raw audio
532
530
  * @param audioSamples - Float32Array of raw audio at 16kHz
533
531
  * @param identityIndex - Optional identity index (ignored by CPU model)
534
- * @returns A2E result with blendshapes in LAM_BLENDSHAPES order
532
+ * @returns A2E result with blendshapes in ARKIT_BLENDSHAPES order
535
533
  */
536
534
  infer(audioSamples: Float32Array, identityIndex?: number): Promise<A2EResult>;
537
535
  /**
@@ -544,7 +542,7 @@ interface A2EBackend {
544
542
  * ExpressionProfile - Per-character weight scaling for A2E blendshape output
545
543
  *
546
544
  * Maps blendshape groups (eyes, brows, jaw, mouth, cheeks, nose, tongue)
547
- * to weight scalers. Used by PlaybackPipeline, MicLipSync, and VoicePipeline.
545
+ * to weight scalers. Used by PlaybackPipeline, MicLipSync, and VoiceOrchestrator.
548
546
  *
549
547
  * @category Audio
550
548
  */
@@ -575,7 +573,7 @@ interface ExpressionProfile {
575
573
  overrides?: Partial<Record<string, number>>;
576
574
  }
577
575
  /**
578
- * Map each LAM_BLENDSHAPES entry to its BlendshapeGroup.
576
+ * Map each ARKIT_BLENDSHAPES entry to its BlendshapeGroup.
579
577
  * Built once at module load from prefix matching.
580
578
  */
581
579
  declare const BLENDSHAPE_TO_GROUP: Map<string, BlendshapeGroup>;
@@ -678,6 +676,13 @@ declare class PlaybackPipeline extends EventEmitter<PlaybackPipelineEvents> {
678
676
  private neutralTransitionFrame;
679
677
  private neutralTransitionStart;
680
678
  private neutralAnimationId;
679
+ private static readonly RAMP_IN_HALFLIFE;
680
+ private static readonly RAMP_IN_DURATION_MS;
681
+ private rampInSmoother;
682
+ private rampInActive;
683
+ private rampInLastTime;
684
+ private rampInStartTime;
685
+ private readonly _rampInBuffer;
681
686
  private _currentFrame;
682
687
  private _currentRawFrame;
683
688
  private _emotion;
@@ -691,6 +696,8 @@ declare class PlaybackPipeline extends EventEmitter<PlaybackPipelineEvents> {
691
696
  constructor(config: PlaybackPipelineConfig);
692
697
  /** Initialize AudioContext (lazy, call after user gesture) */
693
698
  initialize(): Promise<void>;
699
+ /** Eagerly create AudioContext. Call from user gesture for iOS. */
700
+ warmup(): Promise<void>;
694
701
  /** Update ExpressionProfile at runtime */
695
702
  setProfile(profile: ExpressionProfile): void;
696
703
  /** Set the emotion label to include in emitted frames */
@@ -737,7 +744,7 @@ declare class PlaybackPipeline extends EventEmitter<PlaybackPipelineEvents> {
737
744
  * TTSBackend — Streaming text-to-speech backend interface.
738
745
  *
739
746
  * Any TTS engine (Kokoro, ElevenLabs, etc.) can implement this contract
740
- * to integrate with TTSPlayback and VoicePipeline.
747
+ * to integrate with TTSPlayback and VoiceOrchestrator.
741
748
  *
742
749
  * @category Inference
743
750
  */
@@ -781,6 +788,10 @@ interface TTSStreamOptions {
781
788
  voice?: string;
782
789
  /** Speed multiplier override per-call */
783
790
  speed?: number;
791
+ /** Language override per-call (e.g. 'en-us', 'ja'). Default: derived from voice name. */
792
+ language?: string;
793
+ /** When true, emit the entire text as a single chunk (no sentence splitting). */
794
+ singleShot?: boolean;
784
795
  }
785
796
  /**
786
797
  * A single chunk of TTS audio output
@@ -856,7 +867,11 @@ declare class TTSPlayback extends EventEmitter<TTSPlaybackEvents> {
856
867
  speak(text: string, options?: {
857
868
  signal?: AbortSignal;
858
869
  voice?: string;
870
+ speed?: number;
871
+ language?: string;
859
872
  }): Promise<void>;
873
+ /** Eagerly create AudioContext. Call from user gesture for iOS. */
874
+ warmup(): Promise<void>;
860
875
  /** Dispose of all resources. */
861
876
  dispose(): Promise<void>;
862
877
  private speakWithPrefetch;
@@ -893,34 +908,9 @@ declare class TTSPlayback extends EventEmitter<TTSPlaybackEvents> {
893
908
  declare function isWebGPUAvailable(): Promise<boolean>;
894
909
 
895
910
  /**
896
- * SenseVoice automatic speech recognition using ONNX Runtime Web
897
- *
898
- * Non-autoregressive CTC-based ASR that is 5x faster than Whisper-Small.
899
- * Runs entirely in browser via WebGPU or WASM. No transformers.js dependency.
900
- *
901
- * Uses the sherpa-onnx SenseVoice export (model.int8.onnx, 239MB int8 quantized).
902
- * Also provides emotion detection, language identification, and audio event detection
903
- * from the same forward pass.
911
+ * SenseVoice type definitions
904
912
  *
905
913
  * @category Inference
906
- *
907
- * @example Basic usage
908
- * ```typescript
909
- * import { SenseVoiceInference } from '@omote/core';
910
- *
911
- * const asr = new SenseVoiceInference({
912
- * modelUrl: '/models/sensevoice/model.int8.onnx',
913
- * tokensUrl: '/models/sensevoice/tokens.txt',
914
- * });
915
- * await asr.load();
916
- *
917
- * const { text, emotion, language } = await asr.transcribe(audioSamples);
918
- * console.log(text); // "Hello world"
919
- * console.log(emotion); // "NEUTRAL"
920
- * console.log(language); // "en"
921
- * ```
922
- *
923
- * @module inference/SenseVoiceInference
924
914
  */
925
915
 
926
916
  type SenseVoiceLanguage = 'auto' | 'zh' | 'en' | 'ja' | 'ko' | 'yue';
@@ -957,76 +947,49 @@ interface SenseVoiceModelInfo {
957
947
  outputNames: string[];
958
948
  vocabSize: number;
959
949
  }
960
- declare class SenseVoiceInference {
961
- private session;
962
- private ort;
963
- private config;
964
- private _backend;
965
- private isLoading;
966
- private inferenceQueue;
967
- private poisoned;
968
- private static readonly INFERENCE_TIMEOUT_MS;
969
- private lastLfrFrames;
970
- private webgpuShapeWarned;
971
- private tokenMap;
972
- private negMean;
973
- private invStddev;
974
- private languageId;
975
- private textNormId;
976
- constructor(config: SenseVoiceConfig);
977
- get backend(): RuntimeBackend | null;
978
- get isLoaded(): boolean;
950
+ /**
951
+ * Configuration for SenseVoice Worker (used by SenseVoiceUnifiedAdapter)
952
+ */
953
+ interface SenseVoiceWorkerConfig {
954
+ /** Path or URL to model.int8.onnx (239MB) */
955
+ modelUrl: string;
956
+ /** Path or URL to tokens.txt vocabulary file (default: sibling of modelUrl) */
957
+ tokensUrl?: string;
958
+ /** Language hint (default: 'auto' for auto-detection) */
959
+ language?: SenseVoiceLanguage;
960
+ /** Text normalization: 'with_itn' applies inverse text normalization (default: 'with_itn') */
961
+ textNorm?: 'with_itn' | 'without_itn';
962
+ }
963
+ /**
964
+ * Common interface for SenseVoice implementations
965
+ */
966
+ interface SenseVoiceBackend {
967
+ /** Whether the model is loaded and ready for inference */
968
+ readonly isLoaded: boolean;
969
+ /** Current backend type ('wasm' for worker, full backend for main thread, null if not loaded) */
970
+ readonly backend: 'wasm' | 'webgpu' | null;
971
+ /**
972
+ * Load the ONNX model
973
+ * @param onProgress - Optional progress callback (fires once at 100% for worker)
974
+ * @returns Model loading information
975
+ */
979
976
  load(onProgress?: (loaded: number, total: number) => void): Promise<SenseVoiceModelInfo>;
980
977
  /**
981
978
  * Transcribe audio samples to text
982
- *
983
- * @param audioSamples Float32Array of audio samples at 16kHz, [-1, 1] range
984
- * @returns Transcription result with text, emotion, language, and event
979
+ * @param audioSamples - Float32Array of audio samples at 16kHz
980
+ * @returns Transcription result
985
981
  */
986
982
  transcribe(audioSamples: Float32Array): Promise<SenseVoiceResult>;
987
- private queueInference;
983
+ /**
984
+ * Dispose of the model and free resources
985
+ */
988
986
  dispose(): Promise<void>;
989
987
  }
990
988
 
991
989
  /**
992
- * Silero VAD (Voice Activity Detection) inference
993
- *
994
- * Neural network-based VAD running in browser via ONNX Runtime Web.
995
- * Much more accurate than RMS-based energy detection.
996
- *
997
- * Uses lazy loading to conditionally load WebGPU or WASM-only bundle:
998
- * - iOS: Loads WASM-only bundle (WebGPU crashes due to Safari bugs)
999
- * - Android/Desktop: Loads WebGPU bundle (with WASM fallback)
990
+ * Silero VAD type definitions
1000
991
  *
1001
992
  * @category Inference
1002
- *
1003
- * @example Basic usage
1004
- * ```typescript
1005
- * import { SileroVADInference } from '@omote/core';
1006
- *
1007
- * const vad = new SileroVADInference({
1008
- * modelUrl: '/models/silero-vad.onnx'
1009
- * });
1010
- * await vad.load();
1011
- *
1012
- * // Process 32ms chunks (512 samples at 16kHz)
1013
- * const probability = await vad.process(audioChunk);
1014
- * if (probability > 0.5) {
1015
- * console.log('Speech detected!');
1016
- * }
1017
- * ```
1018
- *
1019
- * @example Streaming with state management
1020
- * ```typescript
1021
- * // State is automatically maintained between process() calls
1022
- * // Call reset() when starting a new audio stream
1023
- * vad.reset();
1024
- *
1025
- * for (const chunk of audioChunks) {
1026
- * const prob = await vad.process(chunk);
1027
- * // prob is speech probability [0, 1]
1028
- * }
1029
- * ```
1030
993
  */
1031
994
 
1032
995
  type VADBackend = BackendPreference;
@@ -1096,117 +1059,6 @@ interface SpeechSegment {
1096
1059
  /** Average probability during segment */
1097
1060
  avgProbability: number;
1098
1061
  }
1099
- /**
1100
- * Silero VAD - Neural network voice activity detection
1101
- *
1102
- * Based on snakers4/silero-vad ONNX model.
1103
- * Processes 32ms chunks (512 samples at 16kHz) with LSTM state.
1104
- *
1105
- * @see https://github.com/snakers4/silero-vad
1106
- */
1107
- declare class SileroVADInference {
1108
- private session;
1109
- private ort;
1110
- private config;
1111
- private _backend;
1112
- private isLoading;
1113
- private state;
1114
- private context;
1115
- private readonly chunkSize;
1116
- private readonly contextSize;
1117
- private inferenceQueue;
1118
- private preSpeechBuffer;
1119
- private wasSpeaking;
1120
- private srTensor;
1121
- constructor(config: SileroVADConfig);
1122
- get backend(): RuntimeBackend | null;
1123
- get isLoaded(): boolean;
1124
- get sampleRate(): number;
1125
- get threshold(): number;
1126
- /**
1127
- * Get required chunk size in samples
1128
- */
1129
- getChunkSize(): number;
1130
- /**
1131
- * Get chunk duration in milliseconds
1132
- */
1133
- getChunkDurationMs(): number;
1134
- /**
1135
- * Check if WebGPU is available and working
1136
- * (iOS returns false even if navigator.gpu exists due to ONNX Runtime bugs)
1137
- */
1138
- static isWebGPUAvailable: typeof isWebGPUAvailable;
1139
- /**
1140
- * Load the ONNX model
1141
- */
1142
- load(): Promise<VADModelInfo>;
1143
- /**
1144
- * Reset state for new audio stream
1145
- */
1146
- reset(): void;
1147
- /**
1148
- * Process a single audio chunk
1149
- *
1150
- * @param audioChunk - Float32Array of exactly chunkSize samples (512 for 16kHz, 256 for 8kHz)
1151
- * @returns VAD result with speech probability
1152
- */
1153
- process(audioChunk: Float32Array): Promise<VADResult>;
1154
- /**
1155
- * Process audio and detect speech segments
1156
- *
1157
- * @param audio - Complete audio buffer
1158
- * @param options - Detection options
1159
- * @returns Array of speech segments
1160
- */
1161
- detectSpeech(audio: Float32Array, options?: {
1162
- /** Minimum speech duration in ms (default: 250) */
1163
- minSpeechDurationMs?: number;
1164
- /** Minimum silence duration to end segment in ms (default: 300) */
1165
- minSilenceDurationMs?: number;
1166
- /** Padding to add before/after speech in ms (default: 30) */
1167
- speechPadMs?: number;
1168
- }): Promise<SpeechSegment[]>;
1169
- /**
1170
- * Queue inference to serialize ONNX session calls
1171
- */
1172
- private queueInference;
1173
- /**
1174
- * Dispose of the model and free resources
1175
- */
1176
- dispose(): Promise<void>;
1177
- }
1178
-
1179
- /**
1180
- * Silero VAD Web Worker implementation
1181
- *
1182
- * Runs Silero VAD inference in a dedicated Web Worker to prevent main thread blocking.
1183
- * Uses inline worker script (Blob URL pattern) to avoid separate file deployment.
1184
- *
1185
- * Key design decisions:
1186
- * - WASM backend only (WebGPU doesn't work in Workers)
1187
- * - LSTM state serialized as Float32Array (Tensors can't cross worker boundary)
1188
- * - Audio copied (not transferred) to retain main thread access for pre-speech buffer
1189
- * - ONNX Runtime loaded from CDN in worker (no bundler complications)
1190
- *
1191
- * @category Inference
1192
- *
1193
- * @example Basic usage
1194
- * ```typescript
1195
- * import { SileroVADWorker } from '@omote/core';
1196
- *
1197
- * const vad = new SileroVADWorker({
1198
- * modelUrl: '/models/silero-vad.onnx'
1199
- * });
1200
- * await vad.load();
1201
- *
1202
- * // Process 32ms chunks (512 samples at 16kHz)
1203
- * const result = await vad.process(audioChunk);
1204
- * if (result.isSpeech) {
1205
- * console.log('Speech detected!', result.probability);
1206
- * }
1207
- * ```
1208
- */
1209
-
1210
1062
  /**
1211
1063
  * Configuration for Silero VAD Worker
1212
1064
  */
@@ -1219,13 +1071,6 @@ interface VADWorkerConfig {
1219
1071
  threshold?: number;
1220
1072
  /**
1221
1073
  * Number of audio chunks to keep in pre-speech buffer.
1222
- * When VAD triggers, these chunks are prepended to the speech buffer
1223
- * to capture the beginning of speech that occurred before detection.
1224
- *
1225
- * At 512 samples/chunk and 16kHz:
1226
- * - 10 chunks = 320ms of pre-speech audio
1227
- * - 15 chunks = 480ms of pre-speech audio
1228
- *
1229
1074
  * Default: 10 chunks (320ms)
1230
1075
  */
1231
1076
  preSpeechBufferChunks?: number;
@@ -1241,85 +1086,45 @@ interface VADWorkerModelInfo {
1241
1086
  sampleRate: number;
1242
1087
  chunkSize: number;
1243
1088
  }
1244
-
1245
1089
  /**
1246
- * Silero VAD Worker - Voice Activity Detection in a Web Worker
1247
- *
1248
- * Runs Silero VAD inference off the main thread to prevent UI blocking.
1249
- * Feature parity with SileroVADInference but runs in dedicated worker.
1250
- *
1251
- * @see SileroVADInference for main-thread version
1090
+ * Common interface for Silero VAD implementations
1252
1091
  */
1253
- declare class SileroVADWorker {
1254
- private worker;
1255
- private config;
1256
- private isLoading;
1257
- private _isLoaded;
1258
- private poisoned;
1259
- private state;
1260
- private context;
1261
- private readonly chunkSize;
1262
- private readonly contextSize;
1263
- private inferenceQueue;
1264
- private preSpeechBuffer;
1265
- private wasSpeaking;
1266
- private pendingResolvers;
1267
- private messageId;
1268
- constructor(config: VADWorkerConfig);
1269
- get isLoaded(): boolean;
1270
- /**
1271
- * Backend type (always 'wasm' for Worker, WebGPU not supported in Workers)
1272
- */
1273
- get backend(): 'wasm' | null;
1274
- get sampleRate(): number;
1275
- get threshold(): number;
1276
- /**
1277
- * Get required chunk size in samples
1278
- */
1279
- getChunkSize(): number;
1280
- /**
1281
- * Get chunk duration in milliseconds
1282
- */
1283
- getChunkDurationMs(): number;
1284
- /**
1285
- * Create the worker from inline script
1286
- */
1287
- private createWorker;
1288
- /**
1289
- * Handle messages from worker
1290
- */
1291
- private handleWorkerMessage;
1292
- /**
1293
- * Send message to worker and wait for response
1294
- */
1295
- private sendMessage;
1296
- /**
1297
- * Load the ONNX model in the worker
1298
- */
1299
- load(): Promise<VADWorkerModelInfo>;
1092
+ interface SileroVADBackend {
1093
+ /** Current backend type (webgpu, wasm, or null if not loaded) */
1094
+ readonly backend: RuntimeBackend | null;
1095
+ /** Whether the model is loaded and ready for inference */
1096
+ readonly isLoaded: boolean;
1097
+ /** Audio sample rate (8000 or 16000 Hz) */
1098
+ readonly sampleRate: number;
1099
+ /** Speech detection threshold (0-1) */
1100
+ readonly threshold: number;
1300
1101
  /**
1301
- * Reset state for new audio stream
1102
+ * Load the ONNX model
1103
+ * @returns Model loading information
1302
1104
  */
1303
- reset(): Promise<void>;
1105
+ load(): Promise<VADModelInfo | VADWorkerModelInfo>;
1304
1106
  /**
1305
1107
  * Process a single audio chunk
1306
- *
1307
- * @param audioChunk - Float32Array of exactly chunkSize samples (512 for 16kHz, 256 for 8kHz)
1108
+ * @param audioChunk - Float32Array of exactly chunkSize samples
1308
1109
  * @returns VAD result with speech probability
1309
1110
  */
1310
1111
  process(audioChunk: Float32Array): Promise<VADResult>;
1311
1112
  /**
1312
- * Queue inference to serialize worker calls
1113
+ * Reset state for new audio stream
1313
1114
  */
1314
- private queueInference;
1115
+ reset(): void | Promise<void>;
1315
1116
  /**
1316
- * Dispose of the worker and free resources
1117
+ * Dispose of the model and free resources
1317
1118
  */
1318
1119
  dispose(): Promise<void>;
1319
1120
  /**
1320
- * Check if Web Workers are supported
1121
+ * Get required chunk size in samples
1321
1122
  */
1322
- static isSupported(): boolean;
1123
+ getChunkSize(): number;
1124
+ /**
1125
+ * Get chunk duration in milliseconds
1126
+ */
1127
+ getChunkDurationMs(): number;
1323
1128
  }
1324
1129
 
1325
1130
  /**
@@ -1447,43 +1252,33 @@ declare class UnifiedInferenceWorker {
1447
1252
 
1448
1253
  /** Base config shared across all inference factory functions */
1449
1254
  interface InferenceFactoryConfig {
1450
- /**
1451
- * Worker mode:
1452
- * - 'auto' (default): Use Worker if supported, else main thread
1453
- * - true: Force Worker (throws if unsupported)
1454
- * - false: Force main thread
1455
- */
1456
- useWorker?: boolean | 'auto';
1457
1255
  /**
1458
1256
  * Unified inference worker instance.
1459
- * When provided, routes inference through the shared worker,
1257
+ * Routes inference through the shared worker,
1460
1258
  * keeping all inference off the main thread.
1461
- * Takes precedence over useWorker setting.
1462
1259
  */
1463
1260
  unifiedWorker?: UnifiedInferenceWorker;
1464
1261
  }
1465
1262
 
1466
1263
  /**
1467
- * Factory function for A2E inference
1264
+ * Factory function for A2E inference via UnifiedInferenceWorker
1468
1265
  *
1469
1266
  * Creates an A2EBackend instance with zero-config defaults (HuggingFace CDN).
1470
- * Supports unified worker mode for iOS off-main-thread inference.
1267
+ * Routes inference through the shared unified worker.
1471
1268
  *
1472
1269
  * @category Inference
1473
1270
  *
1474
- * @example Auto-detect (recommended, zero-config)
1271
+ * @example
1475
1272
  * ```typescript
1476
- * import { createA2E } from '@omote/core';
1273
+ * import { createA2E, UnifiedInferenceWorker } from '@omote/core';
1274
+ *
1275
+ * const worker = new UnifiedInferenceWorker();
1276
+ * await worker.init();
1477
1277
  *
1478
- * const a2e = createA2E(); // uses HF CDN defaults (192MB fp16)
1278
+ * const a2e = createA2E({ unifiedWorker: worker });
1479
1279
  * await a2e.load();
1480
1280
  * const { blendshapes } = await a2e.infer(audioSamples);
1481
1281
  * ```
1482
- *
1483
- * @example Custom model URL
1484
- * ```typescript
1485
- * const a2e = createA2E({ modelUrl: '/models/lam.onnx' });
1486
- * ```
1487
1282
  */
1488
1283
 
1489
1284
  /**
@@ -1499,13 +1294,13 @@ interface CreateA2EConfig extends InferenceFactoryConfig {
1499
1294
  * Set to `false` to skip external data loading (single-file models only).
1500
1295
  */
1501
1296
  externalDataUrl?: string | false;
1502
- /** Backend preference (default: 'auto') */
1503
- backend?: BackendPreference;
1504
1297
  /** Number of identity classes (default: 12) */
1505
1298
  numIdentityClasses?: number;
1506
1299
  }
1507
1300
  /**
1508
- * Create an A2E instance
1301
+ * Create an A2E instance via the unified worker.
1302
+ *
1303
+ * If no `unifiedWorker` is provided, a dedicated worker is created on load().
1509
1304
  *
1510
1305
  * @param config - Factory configuration
1511
1306
  * @returns An A2EBackend instance
@@ -1521,7 +1316,7 @@ declare function createA2E(config?: CreateA2EConfig): A2EBackend;
1521
1316
  /**
1522
1317
  * Generic frame source -- any object that emits 'frame' events with blendshapes.
1523
1318
  *
1524
- * Implemented by PlaybackPipeline, MicLipSync, VoicePipeline, and any custom source.
1319
+ * Implemented by PlaybackPipeline, MicLipSync, and any custom source.
1525
1320
  * Used by OmoteAvatar (all renderer adapters) to receive animation frames.
1526
1321
  */
1527
1322
  interface FrameSource {
@@ -1550,7 +1345,7 @@ interface TranscriptResult {
1550
1345
  inferenceTimeMs?: number;
1551
1346
  }
1552
1347
  /**
1553
- * Consumer's response handler. VoicePipeline calls this with transcribed text.
1348
+ * Consumer's response handler. VoiceOrchestrator calls this with transcribed text.
1554
1349
  * Consumer must stream audio back for playback + lip sync.
1555
1350
  */
1556
1351
  interface ResponseHandler {
@@ -1581,6 +1376,8 @@ interface ResponseHandler {
1581
1376
  */
1582
1377
 
1583
1378
  interface TTSSpeakerConfig {
1379
+ /** Skip LAM download — audio playback only, no lip sync. Default: false. */
1380
+ audioOnly?: boolean;
1584
1381
  /** Per-character expression weight scaling */
1585
1382
  profile?: ExpressionProfile;
1586
1383
  /** Identity/style index for A2E model (default: 0) */
@@ -1593,8 +1390,8 @@ interface TTSSpeakerConfig {
1593
1390
  neutralTransitionMs?: number;
1594
1391
  /** Pre-built A2E backend (skip internal createA2E). */
1595
1392
  lam?: A2EBackend;
1596
- /** LAM model config (only when lam not provided) */
1597
- models?: CreateA2EConfig;
1393
+ /** LAM model config (only when lam not provided). unifiedWorker is supplied by TTSSpeaker. */
1394
+ models?: Omit<CreateA2EConfig, 'unifiedWorker'>;
1598
1395
  /** Shared unified worker (recommended for iOS) */
1599
1396
  unifiedWorker?: UnifiedInferenceWorker;
1600
1397
  }
@@ -1603,6 +1400,7 @@ declare class TTSSpeaker {
1603
1400
  private tts;
1604
1401
  private ownedLam;
1605
1402
  private ownedWorker;
1403
+ private usesSharedWorker;
1606
1404
  private currentAbort;
1607
1405
  private _isSpeaking;
1608
1406
  private _audioOnly;
@@ -1616,11 +1414,8 @@ declare class TTSSpeaker {
1616
1414
  /**
1617
1415
  * Connect a TTS backend.
1618
1416
  *
1619
- * When config includes `lam`, `unifiedWorker`, or `models`, the full lip sync
1620
- * pipeline is created (LAM + TTSPlayback + PlaybackPipeline).
1621
- *
1622
- * When config is omitted or has none of those, audio-only mode is used:
1623
- * TTS → AudioScheduler (speakers only, no blendshapes, no LAM download).
1417
+ * By default, the full lip sync pipeline is created (auto-downloads LAM).
1418
+ * Pass `audioOnly: true` for audio-only mode (no blendshapes, no LAM download).
1624
1419
  *
1625
1420
  * @param tts - TTS backend to use for speech synthesis
1626
1421
  * @param config - Optional configuration for A2E, expression profile, etc.
@@ -1636,6 +1431,8 @@ declare class TTSSpeaker {
1636
1431
  speak(text: string, options?: {
1637
1432
  signal?: AbortSignal;
1638
1433
  voice?: string;
1434
+ speed?: number;
1435
+ language?: string;
1639
1436
  }): Promise<void>;
1640
1437
  /** Audio-only speak: TTS → resample → AudioScheduler (no blendshapes). */
1641
1438
  private speakAudioOnly;
@@ -1655,13 +1452,20 @@ declare class TTSSpeaker {
1655
1452
  streamText(options: {
1656
1453
  signal?: AbortSignal;
1657
1454
  voice?: string;
1455
+ speed?: number;
1456
+ language?: string;
1658
1457
  }): Promise<{
1659
1458
  push: (token: string) => void;
1660
1459
  end: () => Promise<void>;
1661
1460
  }>;
1662
1461
  /** streamText in audio-only mode: TTS → AudioScheduler (no blendshapes). */
1663
1462
  private streamTextAudioOnly;
1664
- /** Abort current speak if any. */
1463
+ /**
1464
+ * Warm up AudioContext for iOS/Safari autoplay policy.
1465
+ * Call from a user gesture handler (click/tap) before speak().
1466
+ */
1467
+ warmup(): Promise<void>;
1468
+ /** Abort current speak if any. Triggers neutral transition on PlaybackPipeline. */
1665
1469
  stop(): void;
1666
1470
  /** Clean teardown of all owned resources. */
1667
1471
  dispose(): Promise<void>;
@@ -1697,11 +1501,13 @@ interface CreateTTSPlayerConfig {
1697
1501
  modelUrl?: string;
1698
1502
  /** Voice data base URL override */
1699
1503
  voiceBaseUrl?: string;
1504
+ /** Shared unified worker (created automatically if not provided) */
1505
+ unifiedWorker?: UnifiedInferenceWorker;
1700
1506
  }
1701
1507
  /**
1702
1508
  * Zero-config TTS player. Speak text through speakers without an avatar.
1703
1509
  *
1704
- * Uses Kokoro TTS (82M q8, ~92MB) with automatic worker selection.
1510
+ * Uses Kokoro TTS (82M q8, ~92MB) with automatic worker creation.
1705
1511
  * No LAM model is downloaded — audio plays directly through AudioScheduler.
1706
1512
  */
1707
1513
  declare function createTTSPlayer(config?: CreateTTSPlayerConfig): TTSPlayer;
@@ -1710,254 +1516,27 @@ declare function createTTSPlayer(config?: CreateTTSPlayerConfig): TTSPlayer;
1710
1516
  */
1711
1517
  declare class TTSPlayer extends TTSSpeaker {
1712
1518
  private backend;
1713
- constructor(tts: TTSBackend);
1519
+ private ttsWorker;
1520
+ private ttsPlayerUsesSharedWorker;
1521
+ private ttsConfig;
1522
+ constructor(config?: CreateTTSPlayerConfig);
1714
1523
  /** Load TTS model and connect in audio-only mode. */
1715
1524
  load(): Promise<void>;
1716
1525
  /** Whether the TTS model is loaded and ready. */
1717
1526
  get isLoaded(): boolean;
1527
+ dispose(): Promise<void>;
1718
1528
  }
1719
1529
 
1720
1530
  /**
1721
- * Factory function for SenseVoice ASR with automatic Worker vs main thread selection
1722
- *
1723
- * Provides a unified API that automatically selects the optimal implementation:
1724
- * - Worker supported: Uses SenseVoiceWorker (off-main-thread inference)
1725
- * - Worker unsupported: Uses SenseVoiceInference (main thread)
1726
- *
1727
- * @category Inference
1728
- *
1729
- * @example Auto-detect (recommended)
1730
- * ```typescript
1731
- * import { createSenseVoice } from '@omote/core';
1531
+ * SpeechListener — Standalone listening primitive.
1732
1532
  *
1733
- * const asr = createSenseVoice({
1734
- * modelUrl: '/models/sensevoice/model.int8.onnx',
1735
- * });
1736
- * await asr.load();
1737
- * const { text, emotion } = await asr.transcribe(audioSamples);
1738
- * ```
1533
+ * Composes: MicrophoneCapture → SileroVAD → SenseVoice ASR → transcript events.
1534
+ * Used independently or alongside TTSSpeaker and VoiceOrchestrator.
1739
1535
  *
1740
- * @example Force worker
1741
- * ```typescript
1742
- * const asr = createSenseVoice({
1743
- * modelUrl: '/models/sensevoice/model.int8.onnx',
1744
- * useWorker: true,
1745
- * });
1746
- * ```
1536
+ * Does NOT handle TTS, LAM, or response routing — those belong to TTSSpeaker
1537
+ * and VoiceOrchestrator respectively.
1747
1538
  *
1748
- * @example Force main thread
1749
- * ```typescript
1750
- * const asr = createSenseVoice({
1751
- * modelUrl: '/models/sensevoice/model.int8.onnx',
1752
- * useWorker: false,
1753
- * });
1754
- * ```
1755
- */
1756
-
1757
- /**
1758
- * Common interface for both SenseVoiceInference and SenseVoiceWorker
1759
- */
1760
- interface SenseVoiceBackend {
1761
- /** Whether the model is loaded and ready for inference */
1762
- readonly isLoaded: boolean;
1763
- /** Current backend type ('wasm' for worker, full backend for main thread, null if not loaded) */
1764
- readonly backend: 'wasm' | 'webgpu' | null;
1765
- /**
1766
- * Load the ONNX model
1767
- * @param onProgress - Optional progress callback (fires once at 100% for worker)
1768
- * @returns Model loading information
1769
- */
1770
- load(onProgress?: (loaded: number, total: number) => void): Promise<SenseVoiceModelInfo>;
1771
- /**
1772
- * Transcribe audio samples to text
1773
- * @param audioSamples - Float32Array of audio samples at 16kHz
1774
- * @returns Transcription result
1775
- */
1776
- transcribe(audioSamples: Float32Array): Promise<SenseVoiceResult>;
1777
- /**
1778
- * Dispose of the model and free resources
1779
- */
1780
- dispose(): Promise<void>;
1781
- }
1782
- /**
1783
- * Configuration for the SenseVoice factory
1784
- */
1785
- interface CreateSenseVoiceConfig extends InferenceFactoryConfig {
1786
- /** Path or URL to model.int8.onnx (239MB). Default: HuggingFace CDN */
1787
- modelUrl?: string;
1788
- /** Path or URL to tokens.txt vocabulary file (default: sibling of modelUrl) */
1789
- tokensUrl?: string;
1790
- /** Language hint (default: 'auto') */
1791
- language?: SenseVoiceLanguage;
1792
- /** Text normalization (default: 'with_itn') */
1793
- textNorm?: 'with_itn' | 'without_itn';
1794
- }
1795
- /**
1796
- * Create a SenseVoice ASR instance with automatic implementation selection
1797
- *
1798
- * @param config - Factory configuration
1799
- * @returns A SenseVoiceBackend instance (either Worker or main thread)
1800
- */
1801
- declare function createSenseVoice(config?: CreateSenseVoiceConfig): SenseVoiceBackend;
1802
-
1803
- /**
1804
- * Factory function for Silero VAD with automatic Worker vs main thread selection
1805
- *
1806
- * Provides a unified API that automatically selects the optimal implementation:
1807
- * - Desktop browsers: Uses SileroVADWorker (off-main-thread inference)
1808
- * - Mobile devices: Uses SileroVADInference (main thread, avoids memory overhead)
1809
- * - Fallback: Gracefully falls back to main thread if Worker fails
1810
- *
1811
- * @category Inference
1812
- *
1813
- * @example Basic usage (auto-detect)
1814
- * ```typescript
1815
- * import { createSileroVAD } from '@omote/core';
1816
- *
1817
- * const vad = createSileroVAD({
1818
- * modelUrl: '/models/silero-vad.onnx',
1819
- * threshold: 0.5,
1820
- * });
1821
- *
1822
- * await vad.load();
1823
- * const result = await vad.process(audioChunk);
1824
- * if (result.isSpeech) {
1825
- * console.log('Speech detected!', result.probability);
1826
- * }
1827
- * ```
1828
- *
1829
- * @example Force worker usage
1830
- * ```typescript
1831
- * const vad = createSileroVAD({
1832
- * modelUrl: '/models/silero-vad.onnx',
1833
- * useWorker: true, // Force Worker even on mobile
1834
- * });
1835
- * ```
1836
- *
1837
- * @example Force main thread
1838
- * ```typescript
1839
- * const vad = createSileroVAD({
1840
- * modelUrl: '/models/silero-vad.onnx',
1841
- * useWorker: false, // Force main thread
1842
- * });
1843
- * ```
1844
- */
1845
-
1846
- /**
1847
- * Common interface for both SileroVADInference and SileroVADWorker
1848
- *
1849
- * This interface defines the shared API that both implementations provide,
1850
- * allowing consumers to use either interchangeably.
1851
- */
1852
- interface SileroVADBackend {
1853
- /** Current backend type (webgpu, wasm, or null if not loaded) */
1854
- readonly backend: RuntimeBackend | null;
1855
- /** Whether the model is loaded and ready for inference */
1856
- readonly isLoaded: boolean;
1857
- /** Audio sample rate (8000 or 16000 Hz) */
1858
- readonly sampleRate: number;
1859
- /** Speech detection threshold (0-1) */
1860
- readonly threshold: number;
1861
- /**
1862
- * Load the ONNX model
1863
- * @returns Model loading information
1864
- */
1865
- load(): Promise<VADModelInfo | VADWorkerModelInfo>;
1866
- /**
1867
- * Process a single audio chunk
1868
- * @param audioChunk - Float32Array of exactly chunkSize samples
1869
- * @returns VAD result with speech probability
1870
- */
1871
- process(audioChunk: Float32Array): Promise<VADResult>;
1872
- /**
1873
- * Reset state for new audio stream
1874
- */
1875
- reset(): void | Promise<void>;
1876
- /**
1877
- * Dispose of the model and free resources
1878
- */
1879
- dispose(): Promise<void>;
1880
- /**
1881
- * Get required chunk size in samples
1882
- */
1883
- getChunkSize(): number;
1884
- /**
1885
- * Get chunk duration in milliseconds
1886
- */
1887
- getChunkDurationMs(): number;
1888
- }
1889
- /**
1890
- * Configuration for the Silero VAD factory
1891
- *
1892
- * Extends SileroVADConfig with worker-specific options.
1893
- */
1894
- interface SileroVADFactoryConfig extends Omit<SileroVADConfig, 'modelUrl'>, InferenceFactoryConfig {
1895
- /** Path or URL to the ONNX model. Default: HuggingFace CDN */
1896
- modelUrl?: string;
1897
- /**
1898
- * Fallback to main thread on worker errors.
1899
- *
1900
- * When true (default), if the Worker fails to load or encounters an error,
1901
- * the factory will automatically create a main thread instance instead.
1902
- *
1903
- * When false, worker errors will propagate as exceptions.
1904
- *
1905
- * Default: true
1906
- */
1907
- fallbackOnError?: boolean;
1908
- }
1909
- /**
1910
- * Check if the current environment supports VAD Web Workers
1911
- *
1912
- * Requirements:
1913
- * - Worker constructor must exist
1914
- * - Blob URL support (for inline worker script)
1915
- *
1916
- * @returns true if VAD Worker is supported
1917
- */
1918
- declare function supportsVADWorker(): boolean;
1919
- /**
1920
- * Create a Silero VAD instance with automatic implementation selection
1921
- *
1922
- * This factory function automatically selects between:
1923
- * - **SileroVADWorker**: Off-main-thread inference (better for desktop)
1924
- * - **SileroVADInference**: Main thread inference (better for mobile)
1925
- *
1926
- * The selection is based on:
1927
- * 1. Explicit `useWorker` config (if provided)
1928
- * 2. Platform detection (mobile vs desktop)
1929
- * 3. Worker API availability
1930
- *
1931
- * Both implementations share the same interface (SileroVADBackend),
1932
- * so consumers can use either interchangeably.
1933
- *
1934
- * @param config - Factory configuration
1935
- * @returns A SileroVAD instance (either Worker or main thread)
1936
- *
1937
- * @example
1938
- * ```typescript
1939
- * // Auto-detect (recommended)
1940
- * const vad = createSileroVAD({ modelUrl: '/models/silero-vad.onnx' });
1941
- *
1942
- * // Force Worker
1943
- * const vadWorker = createSileroVAD({ modelUrl: '/models/silero-vad.onnx', useWorker: true });
1944
- *
1945
- * // Force main thread
1946
- * const vadMain = createSileroVAD({ modelUrl: '/models/silero-vad.onnx', useWorker: false });
1947
- * ```
1948
- */
1949
- declare function createSileroVAD(config?: SileroVADFactoryConfig): SileroVADBackend;
1950
-
1951
- /**
1952
- * SpeechListener — Standalone listening primitive.
1953
- *
1954
- * Composes: MicrophoneCapture → SileroVAD → SenseVoice ASR → transcript events.
1955
- * Extracted from VoicePipeline's listening half so it can be used independently.
1956
- *
1957
- * Does NOT handle TTS, LAM, or response routing — those belong to TTSSpeaker
1958
- * and VoicePipeline respectively.
1959
- *
1960
- * @category Audio
1539
+ * @category Audio
1961
1540
  */
1962
1541
 
1963
1542
  interface SpeechListenerConfig {
@@ -1974,6 +1553,7 @@ interface SpeechListenerConfig {
1974
1553
  modelUrl: string;
1975
1554
  tokensUrl?: string;
1976
1555
  language?: string;
1556
+ textNorm?: 'with_itn' | 'without_itn';
1977
1557
  };
1978
1558
  vad: {
1979
1559
  modelUrl: string;
@@ -2028,6 +1608,7 @@ declare class SpeechListener extends EventEmitter<SpeechListenerEvents> {
2028
1608
  private asr;
2029
1609
  private vad;
2030
1610
  private ownedWorker;
1611
+ private usesSharedWorker;
2031
1612
  private mic;
2032
1613
  private omoteEvents;
2033
1614
  private _unsubChunk;
@@ -2157,114 +1738,48 @@ declare class InterruptionHandler extends EventEmitter<InterruptionEvents> {
2157
1738
  }
2158
1739
 
2159
1740
  /**
2160
- * SenseVoice ASR Web Worker implementation
2161
- *
2162
- * Runs SenseVoice speech recognition in a dedicated Web Worker to prevent
2163
- * main thread blocking. Uses inline worker script (Blob URL pattern) to
2164
- * avoid separate file deployment.
2165
- *
2166
- * Key design decisions:
2167
- * - WASM backend only (WebGPU doesn't work in Workers)
2168
- * - All preprocessing (fbank, LFR, CMVN) and CTC decoding inlined in worker
2169
- * - Audio copied (not transferred) to retain main thread access
2170
- * - ONNX Runtime loaded from CDN in worker (no bundler complications)
2171
- * - iOS: model URL passed as string to ORT (avoids 239MB JS heap allocation)
1741
+ * Factory function for SenseVoice ASR via UnifiedInferenceWorker
2172
1742
  *
2173
1743
  * @category Inference
2174
1744
  *
2175
- * @example Basic usage
1745
+ * @example
2176
1746
  * ```typescript
2177
- * import { SenseVoiceWorker } from '@omote/core';
1747
+ * import { createSenseVoice, UnifiedInferenceWorker } from '@omote/core';
2178
1748
  *
2179
- * const asr = new SenseVoiceWorker({
1749
+ * const worker = new UnifiedInferenceWorker();
1750
+ * await worker.init();
1751
+ *
1752
+ * const asr = createSenseVoice({
2180
1753
  * modelUrl: '/models/sensevoice/model.int8.onnx',
2181
- * tokensUrl: '/models/sensevoice/tokens.txt',
1754
+ * unifiedWorker: worker,
2182
1755
  * });
2183
1756
  * await asr.load();
2184
- *
2185
- * const { text, emotion, language } = await asr.transcribe(audioSamples);
2186
- * console.log(text); // "Hello world"
2187
- * console.log(emotion); // "NEUTRAL"
2188
- * console.log(language); // "en"
1757
+ * const { text, emotion } = await asr.transcribe(audioSamples);
2189
1758
  * ```
2190
1759
  */
2191
1760
 
2192
1761
  /**
2193
- * Configuration for SenseVoice Worker
1762
+ * Configuration for the SenseVoice factory
2194
1763
  */
2195
- interface SenseVoiceWorkerConfig {
2196
- /** Path or URL to model.int8.onnx (239MB) */
2197
- modelUrl: string;
1764
+ interface CreateSenseVoiceConfig extends InferenceFactoryConfig {
1765
+ /** Path or URL to model.int8.onnx (239MB). Default: HuggingFace CDN */
1766
+ modelUrl?: string;
2198
1767
  /** Path or URL to tokens.txt vocabulary file (default: sibling of modelUrl) */
2199
1768
  tokensUrl?: string;
2200
- /** Language hint (default: 'auto' for auto-detection) */
1769
+ /** Language hint (default: 'auto') */
2201
1770
  language?: SenseVoiceLanguage;
2202
- /** Text normalization: 'with_itn' applies inverse text normalization (default: 'with_itn') */
1771
+ /** Text normalization (default: 'with_itn') */
2203
1772
  textNorm?: 'with_itn' | 'without_itn';
2204
1773
  }
2205
1774
  /**
2206
- * SenseVoice ASR Worker - Speech Recognition in a Web Worker
1775
+ * Create a SenseVoice ASR instance via the unified worker.
2207
1776
  *
2208
- * Runs SenseVoice inference off the main thread to prevent UI blocking.
2209
- * All preprocessing (fbank, LFR, CMVN) and CTC decoding run in the worker.
1777
+ * If no `unifiedWorker` is provided, a dedicated worker is created on load().
2210
1778
  *
2211
- * @see SenseVoiceInference for main-thread version
1779
+ * @param config - Factory configuration
1780
+ * @returns A SenseVoiceBackend instance
2212
1781
  */
2213
- declare class SenseVoiceWorker {
2214
- private worker;
2215
- private config;
2216
- private isLoading;
2217
- private _isLoaded;
2218
- private inferenceQueue;
2219
- private poisoned;
2220
- private pendingResolvers;
2221
- private languageId;
2222
- private textNormId;
2223
- constructor(config: SenseVoiceWorkerConfig);
2224
- get isLoaded(): boolean;
2225
- /**
2226
- * Backend type (always 'wasm' for Worker, WebGPU not supported in Workers)
2227
- */
2228
- get backend(): 'wasm' | null;
2229
- /**
2230
- * Create the worker from inline script
2231
- */
2232
- private createWorker;
2233
- /**
2234
- * Handle messages from worker
2235
- */
2236
- private handleWorkerMessage;
2237
- /**
2238
- * Send message to worker and wait for response
2239
- */
2240
- private sendMessage;
2241
- /**
2242
- * Load the ONNX model in the worker
2243
- *
2244
- * @param onProgress - Optional progress callback. Fires once at 100% when load completes
2245
- * (the worker downloads and loads the model internally, so granular progress is not available).
2246
- */
2247
- load(onProgress?: (loaded: number, total: number) => void): Promise<SenseVoiceModelInfo>;
2248
- /**
2249
- * Transcribe audio samples to text
2250
- *
2251
- * @param audioSamples Float32Array of audio samples at 16kHz, [-1, 1] range
2252
- * @returns Transcription result with text, emotion, language, and event
2253
- */
2254
- transcribe(audioSamples: Float32Array): Promise<SenseVoiceResult>;
2255
- /**
2256
- * Queue inference to serialize worker calls
2257
- */
2258
- private queueInference;
2259
- /**
2260
- * Dispose of the worker and free resources
2261
- */
2262
- dispose(): Promise<void>;
2263
- /**
2264
- * Check if Web Workers are supported
2265
- */
2266
- static isSupported(): boolean;
2267
- }
1782
+ declare function createSenseVoice(config?: CreateSenseVoiceConfig): SenseVoiceBackend;
2268
1783
 
2269
1784
  /**
2270
1785
  * Shared blendshape constants and utilities for lip sync inference
@@ -2298,100 +1813,6 @@ declare const LAM_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browI
2298
1813
  */
2299
1814
  declare function lerpBlendshapes(current: Float32Array | number[], target: Float32Array | number[], factor?: number): number[];
2300
1815
 
2301
- /**
2302
- * A2E inference engine for Audio-to-Expression (LAM model)
2303
- *
2304
- * Runs entirely in the browser using WebGPU or WASM.
2305
- * Takes raw 16kHz audio and outputs 52 ARKit blendshapes for lip sync.
2306
- * Uses the LAM (Large Animation Model) — see {@link A2EBackend} for the interface.
2307
- *
2308
- * @see {@link createA2E} for the recommended zero-config factory
2309
- * @see {@link A2EBackend} for the common interface
2310
- * @category Inference
2311
- *
2312
- * @example Basic usage
2313
- * ```typescript
2314
- * import { A2EInference } from '@omote/core';
2315
- *
2316
- * const a2e = new A2EInference({ modelUrl: '/models/lam.onnx' });
2317
- * await a2e.load();
2318
- *
2319
- * // Process 1 second of audio (16kHz = 16000 samples)
2320
- * const result = await a2e.infer(audioSamples);
2321
- * console.log('Blendshapes:', result.blendshapes); // [30, 52] for 30fps
2322
- * ```
2323
- */
2324
-
2325
- interface A2EInferenceConfig {
2326
- /** Path or URL to the ONNX model */
2327
- modelUrl: string;
2328
- /**
2329
- * Path or URL to external model data file (.onnx.data weights).
2330
- * Default: `${modelUrl}.data` (e.g., /models/model.onnx.data)
2331
- *
2332
- * Set to `false` to skip external data loading (single-file models only).
2333
- */
2334
- externalDataUrl?: string | false;
2335
- /** Preferred backend (auto will try WebGPU first, fallback to WASM) */
2336
- backend?: BackendPreference;
2337
- /** Number of identity classes (default: 12 for streaming model) */
2338
- numIdentityClasses?: number;
2339
- /**
2340
- * Number of audio samples per inference chunk (default: 16000).
2341
- * Model supports variable chunk sizes. Smaller chunks = lower latency,
2342
- * more inference overhead. 8000 (500ms) is recommended for real-time lip sync.
2343
- */
2344
- chunkSize?: number;
2345
- }
2346
-
2347
- declare class A2EInference implements A2EBackend {
2348
- readonly modelId: "a2e";
2349
- private session;
2350
- private ort;
2351
- private config;
2352
- private _backend;
2353
- private isLoading;
2354
- private numIdentityClasses;
2355
- readonly chunkSize: number;
2356
- private inferenceQueue;
2357
- private poisoned;
2358
- private static readonly INFERENCE_TIMEOUT_MS;
2359
- constructor(config: A2EInferenceConfig);
2360
- /**
2361
- * Check if WebGPU is available and working
2362
- * (iOS returns false even if navigator.gpu exists due to ONNX Runtime bugs)
2363
- */
2364
- static isWebGPUAvailable: typeof isWebGPUAvailable;
2365
- get backend(): 'webgpu' | 'wasm' | null;
2366
- get isLoaded(): boolean;
2367
- /** True if inference timed out and the session is permanently unusable */
2368
- get isSessionPoisoned(): boolean;
2369
- /**
2370
- * Load the ONNX model
2371
- */
2372
- load(): Promise<A2EModelInfo>;
2373
- /**
2374
- * Run inference on raw audio
2375
- * @param audioSamples - Float32Array of raw audio at 16kHz
2376
- * @param identityIndex - Optional identity index (0-11, default 0 = neutral)
2377
- *
2378
- * Audio will be zero-padded or truncated to chunkSize samples.
2379
- */
2380
- infer(audioSamples: Float32Array, identityIndex?: number): Promise<A2EResult>;
2381
- /**
2382
- * Queue inference to serialize ONNX session calls
2383
- */
2384
- private queueInference;
2385
- /**
2386
- * Get blendshape value by name for a specific frame
2387
- */
2388
- getBlendshape(blendshapes: Float32Array, name: typeof LAM_BLENDSHAPES[number]): number;
2389
- /**
2390
- * Dispose of the model and free resources
2391
- */
2392
- dispose(): Promise<void>;
2393
- }
2394
-
2395
1816
  /**
2396
1817
  * Default and user-configurable model URLs for all ONNX models
2397
1818
  *
@@ -2427,7 +1848,7 @@ type ModelUrlKey = 'lam' | 'senseVoice' | 'sileroVad' | 'kokoroTTS' | 'kokoroVoi
2427
1848
  * Resolved model URLs — user overrides take priority, HuggingFace CDN is fallback.
2428
1849
  *
2429
1850
  * All SDK factories (`createA2E`, `createSenseVoice`, `createSileroVAD`) and
2430
- * orchestrators (`VoicePipeline`) read from this object. Call
1851
+ * orchestrators (`VoiceOrchestrator`) read from this object. Call
2431
1852
  * {@link configureModelUrls} before constructing any pipelines to point
2432
1853
  * models at your own CDN.
2433
1854
  */
@@ -2697,6 +2118,44 @@ declare class BlendshapeSmoother {
2697
2118
  reset(): void;
2698
2119
  }
2699
2120
 
2121
+ /**
2122
+ * Factory function for Silero VAD via UnifiedInferenceWorker
2123
+ *
2124
+ * @category Inference
2125
+ *
2126
+ * @example
2127
+ * ```typescript
2128
+ * import { createSileroVAD, UnifiedInferenceWorker } from '@omote/core';
2129
+ *
2130
+ * const worker = new UnifiedInferenceWorker();
2131
+ * await worker.init();
2132
+ *
2133
+ * const vad = createSileroVAD({
2134
+ * modelUrl: '/models/silero-vad.onnx',
2135
+ * unifiedWorker: worker,
2136
+ * });
2137
+ * await vad.load();
2138
+ * const result = await vad.process(audioChunk);
2139
+ * ```
2140
+ */
2141
+
2142
+ /**
2143
+ * Configuration for the Silero VAD factory
2144
+ */
2145
+ interface SileroVADFactoryConfig extends Omit<SileroVADConfig, 'modelUrl'>, InferenceFactoryConfig {
2146
+ /** Path or URL to the ONNX model. Default: HuggingFace CDN */
2147
+ modelUrl?: string;
2148
+ }
2149
+ /**
2150
+ * Create a Silero VAD instance via the unified worker.
2151
+ *
2152
+ * If no `unifiedWorker` is provided, a dedicated worker is created on load().
2153
+ *
2154
+ * @param config - Factory configuration
2155
+ * @returns A SileroVADBackend instance
2156
+ */
2157
+ declare function createSileroVAD(config?: SileroVADFactoryConfig): SileroVADBackend;
2158
+
2700
2159
  /**
2701
2160
  * SenseVoice adapter backed by UnifiedInferenceWorker
2702
2161
  *
@@ -2755,34 +2214,9 @@ declare class A2EUnifiedAdapter implements A2EBackend {
2755
2214
  }
2756
2215
 
2757
2216
  /**
2758
- * Kokoro TTS inference using ONNX Runtime Web
2759
- *
2760
- * Pure ONNX pipeline for browser-based text-to-speech. No transformers.js dependency.
2761
- * Uses eSpeak-NG WASM for phonemization and Kokoro-82M (q8, 92MB) for synthesis.
2762
- *
2763
- * Pipeline: Text → Normalize → Phonemize (eSpeak WASM) → Tokenize → Voice Style → ONNX → Audio
2217
+ * Kokoro TTS type definitions
2764
2218
  *
2765
2219
  * @category Inference
2766
- *
2767
- * @example Basic usage
2768
- * ```typescript
2769
- * import { KokoroTTSInference } from '@omote/core';
2770
- *
2771
- * const tts = new KokoroTTSInference({ defaultVoice: 'af_heart' });
2772
- * await tts.load();
2773
- *
2774
- * const { audio, duration } = await tts.synthesize("Hello world");
2775
- * // audio: Float32Array @ 24kHz
2776
- * ```
2777
- *
2778
- * @example Streaming (sentence-by-sentence)
2779
- * ```typescript
2780
- * for await (const chunk of tts.stream("First sentence. Second sentence.")) {
2781
- * playbackPipeline.feedBuffer(chunk.audio);
2782
- * }
2783
- * ```
2784
- *
2785
- * @module inference/KokoroTTSInference
2786
2220
  */
2787
2221
 
2788
2222
  interface KokoroTTSConfig {
@@ -2796,6 +2230,8 @@ interface KokoroTTSConfig {
2796
2230
  backend?: BackendPreference;
2797
2231
  /** Speech speed multiplier (default: 1.0) */
2798
2232
  speed?: number;
2233
+ /** Eagerly load phonemizer + default voice during load() instead of first speak(). Default: true. */
2234
+ eagerLoad?: boolean;
2799
2235
  }
2800
2236
  interface KokoroTTSResult {
2801
2237
  /** Audio samples at 24kHz */
@@ -2834,67 +2270,6 @@ interface SynthesizeOptions {
2834
2270
  * Returns trimmed text on success, throws on invalid input.
2835
2271
  */
2836
2272
  declare function validateTTSInput(text: unknown, voiceName: string, speed: number, availableVoices?: string[]): string;
2837
- declare class KokoroTTSInference implements TTSBackend {
2838
- private readonly config;
2839
- private readonly modelUrl;
2840
- private readonly voiceBaseUrl;
2841
- private ort;
2842
- private session;
2843
- private _backend;
2844
- private isLoading;
2845
- private poisoned;
2846
- private inferenceQueue;
2847
- private phonemizerReady;
2848
- private defaultVoiceLoaded;
2849
- /** Cached voice data (voice name → Float32Array) */
2850
- private loadedVoices;
2851
- constructor(config?: KokoroTTSConfig);
2852
- get isLoaded(): boolean;
2853
- get sampleRate(): number;
2854
- /**
2855
- * Load the ONNX model, phonemizer WASM, and default voice.
2856
- * Safe to call multiple times (no-ops after first successful load).
2857
- */
2858
- load(): Promise<KokoroTTSModelInfo>;
2859
- /**
2860
- * Lazily initialize phonemizer and default voice on first use.
2861
- * Moves 100-200ms of main-thread blocking out of load() into first synthesis.
2862
- */
2863
- private ensureReady;
2864
- /**
2865
- * Synthesize speech from text (one-shot, full audio output).
2866
- *
2867
- * @param text - Input text to synthesize
2868
- * @param options - Voice and speed overrides
2869
- * @returns Audio Float32Array at 24kHz with duration
2870
- */
2871
- synthesize(text: string, options?: SynthesizeOptions): Promise<KokoroTTSResult>;
2872
- /**
2873
- * Stream synthesis sentence-by-sentence (async generator).
2874
- * Splits text on sentence boundaries and yields audio for each.
2875
- *
2876
- * Compatible with both `SynthesizeOptions` (legacy) and `TTSStreamOptions` (TTSBackend).
2877
- *
2878
- * @param text - Input text (can be multiple sentences)
2879
- * @param options - Voice, speed, and abort signal overrides
2880
- */
2881
- stream(text: string, options?: SynthesizeOptions & TTSStreamOptions): AsyncGenerator<KokoroStreamChunk & TTSChunk>;
2882
- /**
2883
- * Preload a voice (fetches and caches the .bin file).
2884
- */
2885
- preloadVoice(voiceName: string): Promise<void>;
2886
- /**
2887
- * List available voice names.
2888
- */
2889
- listVoices(): string[];
2890
- /**
2891
- * Release the ONNX session and clear cached voices.
2892
- */
2893
- dispose(): Promise<void>;
2894
- private ensureVoice;
2895
- private queueInference;
2896
- private runInference;
2897
- }
2898
2273
 
2899
2274
  /**
2900
2275
  * Kokoro TTS adapter backed by UnifiedInferenceWorker
@@ -2910,6 +2285,7 @@ declare class KokoroTTSUnifiedAdapter implements TTSBackend {
2910
2285
  private readonly modelUrl;
2911
2286
  private readonly voiceBaseUrl;
2912
2287
  private _isLoaded;
2288
+ private _backend;
2913
2289
  private loadedGeneration;
2914
2290
  /** Per-adapter inference queue — ensures sequential state updates. */
2915
2291
  private inferenceQueue;
@@ -3131,148 +2507,61 @@ declare class SafariSpeechRecognition {
3131
2507
  /**
3132
2508
  * Remove an error callback
3133
2509
  */
3134
- offError(callback: SpeechErrorCallback): void;
3135
- /**
3136
- * Start listening for speech
3137
- *
3138
- * On iOS Safari, this will trigger the microphone permission prompt
3139
- * if not already granted.
3140
- */
3141
- start(): Promise<void>;
3142
- /**
3143
- * Stop listening and return the final transcript
3144
- */
3145
- stop(): Promise<SpeechRecognitionResult>;
3146
- /**
3147
- * Abort recognition without waiting for final result
3148
- */
3149
- abort(): void;
3150
- /**
3151
- * NOT SUPPORTED: Transcribe audio buffer
3152
- *
3153
- * Safari Speech API does not support transcribing pre-recorded audio.
3154
- * It only works with live microphone input.
3155
- *
3156
- * For batch transcription on iOS, use server-side Whisper or a cloud ASR service.
3157
- *
3158
- * @throws Error always - this method is not supported
3159
- */
3160
- transcribe(_audio: Float32Array): Promise<SpeechRecognitionResult>;
3161
- /**
3162
- * Dispose of recognition resources
3163
- */
3164
- dispose(): void;
3165
- /**
3166
- * Set up event handlers for the recognition instance
3167
- */
3168
- private setupEventHandlers;
3169
- /**
3170
- * Emit result to all registered callbacks
3171
- */
3172
- private emitResult;
3173
- /**
3174
- * Emit error to all registered callbacks
3175
- */
3176
- private emitError;
3177
- }
3178
-
3179
- /**
3180
- * Kokoro TTS Web Worker implementation
3181
- *
3182
- * Moves the heavy ONNX `session.run()` to a dedicated Web Worker to prevent
3183
- * main thread blocking (~1-2s per sentence on WASM). Phonemizer, tokenizer,
3184
- * and voice logic stay on the main thread (fast, <10ms combined).
3185
- *
3186
- * Architecture:
3187
- * ```
3188
- * Main Thread (KokoroTTSWorker): Worker (WORKER_SCRIPT):
3189
- * stream(text) →
3190
- * splitSentences(text)
3191
- * for each sentence:
3192
- * phonemize(sentence) → phonemes
3193
- * tokenize(phonemes) → tokens
3194
- * ensureVoice() → style
3195
- * postMessage(tokens, style, speed) ──→ session.run(feeds)
3196
- * await result ←── postMessage(audio)
3197
- * yield {audio, text, phonemes, duration}
3198
- * ```
3199
- *
3200
- * @category Inference
3201
- *
3202
- * @example Basic usage
3203
- * ```typescript
3204
- * import { KokoroTTSWorker } from '@omote/core';
3205
- *
3206
- * const tts = new KokoroTTSWorker({ defaultVoice: 'af_heart' });
3207
- * await tts.load();
3208
- *
3209
- * for await (const chunk of tts.stream("Hello world!")) {
3210
- * playbackPipeline.feedBuffer(chunk.audio);
3211
- * }
3212
- * ```
3213
- *
3214
- * @module inference/KokoroTTSWorker
3215
- */
3216
-
3217
- /**
3218
- * Kokoro TTS Worker — off-main-thread ONNX inference for non-blocking TTS.
3219
- *
3220
- * Phonemizer/tokenizer/voice logic run on the main thread (fast, <10ms).
3221
- * Only the heavy ONNX `session.run()` is delegated to the worker.
3222
- *
3223
- * Implements the same TTSBackend interface as KokoroTTSInference.
3224
- *
3225
- * @see KokoroTTSInference for main-thread version
3226
- */
3227
- declare class KokoroTTSWorker implements TTSBackend {
3228
- private readonly config;
3229
- private readonly modelUrl;
3230
- private readonly voiceBaseUrl;
3231
- private worker;
3232
- private _isLoaded;
3233
- private isLoading;
3234
- private poisoned;
3235
- /** Serializes all worker calls (stream sentence chunks + synthesize) */
3236
- private inferenceQueue;
3237
- /** Cached voice data (voice name → Float32Array) */
3238
- private loadedVoices;
3239
- /** Pending message handlers */
3240
- private pendingResolvers;
3241
- constructor(config?: KokoroTTSConfig);
3242
- get isLoaded(): boolean;
3243
- get sampleRate(): number;
3244
- load(): Promise<KokoroTTSModelInfo>;
3245
- synthesize(text: string, options?: SynthesizeOptions): Promise<KokoroTTSResult>;
3246
- stream(text: string, options?: SynthesizeOptions & TTSStreamOptions): AsyncGenerator<KokoroStreamChunk & TTSChunk>;
3247
- preloadVoice(voiceName: string): Promise<void>;
3248
- listVoices(): string[];
3249
- dispose(): Promise<void>;
3250
- static isSupported(): boolean;
3251
- private ensureVoice;
3252
- private createWorker;
3253
- private handleWorkerMessage;
3254
- private sendMessage;
2510
+ offError(callback: SpeechErrorCallback): void;
3255
2511
  /**
3256
- * Queue worker inference through the serialization queue.
3257
- * Sends pre-computed tokens + style to worker, returns audio.
2512
+ * Start listening for speech
2513
+ *
2514
+ * On iOS Safari, this will trigger the microphone permission prompt
2515
+ * if not already granted.
3258
2516
  */
3259
- private runWorkerInference;
2517
+ start(): Promise<void>;
2518
+ /**
2519
+ * Stop listening and return the final transcript
2520
+ */
2521
+ stop(): Promise<SpeechRecognitionResult>;
2522
+ /**
2523
+ * Abort recognition without waiting for final result
2524
+ */
2525
+ abort(): void;
2526
+ /**
2527
+ * NOT SUPPORTED: Transcribe audio buffer
2528
+ *
2529
+ * Safari Speech API does not support transcribing pre-recorded audio.
2530
+ * It only works with live microphone input.
2531
+ *
2532
+ * For batch transcription on iOS, use server-side Whisper or a cloud ASR service.
2533
+ *
2534
+ * @throws Error always - this method is not supported
2535
+ */
2536
+ transcribe(_audio: Float32Array): Promise<SpeechRecognitionResult>;
2537
+ /**
2538
+ * Dispose of recognition resources
2539
+ */
2540
+ dispose(): void;
2541
+ /**
2542
+ * Set up event handlers for the recognition instance
2543
+ */
2544
+ private setupEventHandlers;
2545
+ /**
2546
+ * Emit result to all registered callbacks
2547
+ */
2548
+ private emitResult;
3260
2549
  /**
3261
- * One-shot synthesis (phonemize + tokenize + worker inference).
2550
+ * Emit error to all registered callbacks
3262
2551
  */
3263
- private queueInference;
2552
+ private emitError;
3264
2553
  }
3265
2554
 
3266
2555
  /**
3267
- * Factory function for Kokoro TTS with automatic Worker vs main thread selection
2556
+ * Factory function for Kokoro TTS via UnifiedInferenceWorker
3268
2557
  *
3269
- * Provides a unified API that automatically selects the optimal implementation:
3270
- * - Desktop: Uses KokoroTTSWorker (off-main-thread inference, no render hitching)
3271
- * - iOS: Uses KokoroTTSInference (main thread, shared ORT instance to avoid OOM)
2558
+ * When called without a `unifiedWorker`, a dedicated worker is created
2559
+ * automatically on the first `load()` call. Pass a shared worker when using
2560
+ * VoiceOrchestrator or multiple models to avoid extra WASM instances.
3272
2561
  *
3273
2562
  * @category Inference
3274
2563
  *
3275
- * @example Auto-detect (recommended)
2564
+ * @example Standalone (auto-creates worker)
3276
2565
  * ```typescript
3277
2566
  * import { createKokoroTTS } from '@omote/core';
3278
2567
  *
@@ -3284,14 +2573,9 @@ declare class KokoroTTSWorker implements TTSBackend {
3284
2573
  * }
3285
2574
  * ```
3286
2575
  *
3287
- * @example Force worker
2576
+ * @example With shared worker
3288
2577
  * ```typescript
3289
- * const tts = createKokoroTTS({ defaultVoice: 'af_heart', useWorker: true });
3290
- * ```
3291
- *
3292
- * @example Force main thread
3293
- * ```typescript
3294
- * const tts = createKokoroTTS({ defaultVoice: 'af_heart', useWorker: false });
2578
+ * const tts = createKokoroTTS({ defaultVoice: 'af_heart', unifiedWorker: worker });
3295
2579
  * ```
3296
2580
  */
3297
2581
 
@@ -3301,10 +2585,12 @@ declare class KokoroTTSWorker implements TTSBackend {
3301
2585
  interface CreateKokoroTTSConfig extends KokoroTTSConfig, InferenceFactoryConfig {
3302
2586
  }
3303
2587
  /**
3304
- * Create a Kokoro TTS instance with automatic implementation selection.
2588
+ * Create a Kokoro TTS instance via the unified worker.
2589
+ *
2590
+ * If no `unifiedWorker` is provided, a dedicated worker is created on load().
3305
2591
  *
3306
2592
  * @param config - Factory configuration
3307
- * @returns A TTSBackend instance (either Worker or main thread)
2593
+ * @returns A TTSBackend instance
3308
2594
  */
3309
2595
  declare function createKokoroTTS(config?: CreateKokoroTTSConfig): TTSBackend;
3310
2596
 
@@ -3353,7 +2639,7 @@ declare function listVoices(): string[];
3353
2639
  * ElevenLabs TTS Backend — Cloud text-to-speech via ElevenLabs REST API.
3354
2640
  *
3355
2641
  * Implements the TTSBackend interface so it can be used anywhere Kokoro TTS is used
3356
- * (TTSPlayback, TTSSpeaker, VoicePipeline, PlaybackPipeline, etc.)
2642
+ * (TTSPlayback, TTSSpeaker, VoiceOrchestrator, PlaybackPipeline, etc.)
3357
2643
  *
3358
2644
  * Zero external dependencies — uses fetch() directly.
3359
2645
  *
@@ -3431,141 +2717,6 @@ declare class ElevenLabsTTSBackend implements TTSBackend {
3431
2717
  private getHttpErrorMessage;
3432
2718
  }
3433
2719
 
3434
- /**
3435
- * AWS Polly TTS Backend — Cloud text-to-speech via consumer-provided AWS SDK call.
3436
- *
3437
- * Implements the TTSBackend interface. Keeps @omote/core free of AWS SDK dependencies
3438
- * by delegating the actual Polly API call to a consumer-provided function.
3439
- *
3440
- * @category Inference
3441
- *
3442
- * @example Basic usage with AWS SDK v3
3443
- * ```typescript
3444
- * import { PollyTTSBackend } from '@omote/core';
3445
- * import { PollyClient, SynthesizeSpeechCommand } from '@aws-sdk/client-polly';
3446
- *
3447
- * const polly = new PollyClient({ region: 'us-east-1' });
3448
- *
3449
- * const tts = new PollyTTSBackend({
3450
- * synthesizeFn: async (text, voice, sampleRate) => {
3451
- * const cmd = new SynthesizeSpeechCommand({
3452
- * Text: text,
3453
- * VoiceId: voice,
3454
- * Engine: 'neural',
3455
- * OutputFormat: 'pcm',
3456
- * SampleRate: String(sampleRate),
3457
- * });
3458
- * const result = await polly.send(cmd);
3459
- * const stream = result.AudioStream;
3460
- * // Convert stream to ArrayBuffer (Node or browser)
3461
- * const chunks: Uint8Array[] = [];
3462
- * for await (const chunk of stream as AsyncIterable<Uint8Array>) {
3463
- * chunks.push(chunk);
3464
- * }
3465
- * const totalLength = chunks.reduce((sum, c) => sum + c.length, 0);
3466
- * const merged = new Uint8Array(totalLength);
3467
- * let offset = 0;
3468
- * for (const chunk of chunks) {
3469
- * merged.set(chunk, offset);
3470
- * offset += chunk.length;
3471
- * }
3472
- * return {
3473
- * audio: merged.buffer,
3474
- * contentType: result.ContentType ?? 'audio/pcm',
3475
- * };
3476
- * },
3477
- * });
3478
- *
3479
- * await tts.load();
3480
- * for await (const chunk of tts.stream("Hello world!")) {
3481
- * playbackPipeline.feedBuffer(chunk.audio);
3482
- * }
3483
- * ```
3484
- */
3485
-
3486
- /**
3487
- * Result from the consumer-provided synthesize function.
3488
- */
3489
- interface PollySynthesizeResult {
3490
- /** Raw PCM audio bytes (Int16 LE) */
3491
- audio: ArrayBuffer;
3492
- /** Content type from Polly response (e.g., 'audio/pcm') */
3493
- contentType: string;
3494
- }
3495
- /**
3496
- * Configuration for PollyTTSBackend.
3497
- *
3498
- * The `synthesizeFn` callback lets consumers use their own AWS SDK setup
3499
- * (credentials, region, SDK version) without @omote/core depending on `@aws-sdk/client-polly`.
3500
- */
3501
- interface PollyConfig {
3502
- /**
3503
- * Consumer-provided function that calls AWS Polly.
3504
- * Must return PCM audio (Int16 LE) at the requested sample rate.
3505
- *
3506
- * @param text - Text to synthesize
3507
- * @param voice - Polly voice ID (e.g., 'Joanna')
3508
- * @param sampleRate - Requested output sample rate (e.g., 16000)
3509
- * @returns PCM audio buffer and content type
3510
- */
3511
- synthesizeFn: (text: string, voice: string, sampleRate: number) => Promise<PollySynthesizeResult>;
3512
- /** Polly voice ID (default: 'Joanna') */
3513
- voice?: string;
3514
- /** Output sample rate in Hz (default: 16000) */
3515
- sampleRate?: number;
3516
- /** Polly engine type (default: 'neural') */
3517
- engine?: 'neural' | 'standard' | 'generative' | 'long-form';
3518
- }
3519
- declare class PollyTTSBackend implements TTSBackend {
3520
- private readonly synthesizeFn;
3521
- private readonly voice;
3522
- private readonly _sampleRate;
3523
- private readonly engine;
3524
- private _isLoaded;
3525
- constructor(config: PollyConfig);
3526
- get sampleRate(): number;
3527
- get isLoaded(): boolean;
3528
- /**
3529
- * No-op for cloud TTS (no model to load).
3530
- * Marks backend as ready.
3531
- */
3532
- load(): Promise<void>;
3533
- /**
3534
- * Synthesize audio via consumer's Polly function.
3535
- *
3536
- * Polly's SynthesizeSpeech is request/response (not streaming for PCM),
3537
- * so this yields a single chunk per call. For long text, consider splitting
3538
- * into sentences on the consumer side.
3539
- */
3540
- stream(text: string, options?: TTSStreamOptions): AsyncGenerator<TTSChunk>;
3541
- dispose(): Promise<void>;
3542
- }
3543
-
3544
- /**
3545
- * ORT CDN configuration
3546
- *
3547
- * Allows consumers to override the CDN base URL used for loading
3548
- * ONNX Runtime WASM/WebGPU binaries. By default, ORT loads from
3549
- * its bundled CDN path. Use {@link configureOrtCdn} to point at
3550
- * a self-hosted or enterprise CDN.
3551
- *
3552
- * @category Inference
3553
- */
3554
- /**
3555
- * Override the CDN base URL for ONNX Runtime WASM/WebGPU binaries.
3556
- *
3557
- * Must be an HTTPS URL or a relative path (starts with `/` or `./`).
3558
- * Call this once at app startup, before loading any models.
3559
- *
3560
- * @param cdnPath - HTTPS URL or relative path to ORT binaries directory
3561
- * @throws If cdnPath is not HTTPS or a relative path
3562
- */
3563
- declare function configureOrtCdn(cdnPath: string): void;
3564
- /**
3565
- * Get the current ORT CDN base URL override, or null if using defaults.
3566
- */
3567
- declare function getOrtCdnBase(): string | null;
3568
-
3569
2720
  /**
3570
2721
  * Emotion - Helper for creating emotion vectors for avatar animation
3571
2722
  *
@@ -4111,7 +3262,7 @@ declare const MetricNames: {
4111
3262
  readonly CACHE_QUOTA_WARNING: "omote.cache.quota_warning";
4112
3263
  /** Counter: Cache eviction (LRU) */
4113
3264
  readonly CACHE_EVICTION: "omote.cache.eviction";
4114
- /** Histogram: VoicePipeline turn latency (speech end → transcript ready, excludes playback) */
3265
+ /** Histogram: Voice turn latency (speech end → transcript ready, excludes playback) */
4115
3266
  readonly VOICE_TURN_LATENCY: "omote.voice.turn.latency";
4116
3267
  /** Histogram: ASR transcription latency in ms */
4117
3268
  readonly VOICE_TRANSCRIPTION_LATENCY: "omote.voice.transcription.latency";
@@ -4959,7 +4110,7 @@ declare class ProceduralLifeLayer {
4959
4110
  */
4960
4111
  update(delta: number, input?: LifeLayerInput): LifeLayerOutput;
4961
4112
  /**
4962
- * Write life layer output directly to a Float32Array[52] in LAM_BLENDSHAPES order.
4113
+ * Write life layer output directly to a Float32Array[52] in ARKIT_BLENDSHAPES order.
4963
4114
  *
4964
4115
  * Includes micro-jitter (0.4% amplitude simplex noise on all channels) to
4965
4116
  * break uncanny stillness on undriven channels.
@@ -5294,7 +4445,7 @@ declare class FaceCompositor {
5294
4445
  /**
5295
4446
  * Compose a single output frame from the 5-stage signal chain.
5296
4447
  *
5297
- * @param base - A2E raw output (Float32Array[52], LAM_BLENDSHAPES order)
4448
+ * @param base - A2E raw output (Float32Array[52], ARKIT_BLENDSHAPES order)
5298
4449
  * @param input - Per-frame input (deltaTime, emotion, life layer params)
5299
4450
  * @param target - Optional pre-allocated output buffer (avoids per-frame allocation).
5300
4451
  * When omitted, an internal buffer is used (valid until next compose() call).
@@ -5576,216 +4727,6 @@ declare class MicLipSync extends EventEmitter<MicLipSyncEvents> {
5576
4727
  private setState;
5577
4728
  }
5578
4729
 
5579
- /**
5580
- * VoicePipeline - Full conversational agent loop
5581
- *
5582
- * Composes: MicrophoneCapture → SileroVAD → SenseVoice ASR → PlaybackPipeline (A2E)
5583
- *
5584
- * State machine: idle → loading → ready → listening → thinking → speaking → listening → ...
5585
- *
5586
- * The consumer provides an `onResponse` callback that receives transcribed text
5587
- * and streams audio back for playback + lip sync. VoicePipeline is backend-agnostic.
5588
- *
5589
- * @category Orchestration
5590
- */
5591
-
5592
- /** Shared config options for all VoicePipeline modes */
5593
- interface VoicePipelineBaseConfig {
5594
- /** Pre-built backends — skip internal factory creation. Takes precedence over `models`. */
5595
- backends?: {
5596
- asr: SenseVoiceBackend;
5597
- lam: A2EBackend;
5598
- vad: SileroVADBackend;
5599
- tts?: TTSBackend;
5600
- };
5601
- /** External unified worker (reuse across pipelines). Takes precedence over internal creation. */
5602
- unifiedWorker?: UnifiedInferenceWorker;
5603
- /** URLs and options for model loading. Required if `backends` not provided. */
5604
- models?: {
5605
- senseVoice: {
5606
- modelUrl: string;
5607
- tokensUrl?: string;
5608
- language?: string;
5609
- };
5610
- lam: {
5611
- modelUrl: string;
5612
- externalDataUrl?: string | false;
5613
- backend?: 'auto' | 'webgpu' | 'wasm';
5614
- };
5615
- vad: {
5616
- modelUrl: string;
5617
- threshold?: number;
5618
- preSpeechBufferChunks?: number;
5619
- };
5620
- };
5621
- /** Per-character expression weight scaling */
5622
- profile?: ExpressionProfile;
5623
- /** Identity/style index for A2E model (default: 0) */
5624
- identityIndex?: number;
5625
- /** Base silence timeout in ms (default: 500) */
5626
- silenceTimeoutMs?: number;
5627
- /** Extended silence timeout for long utterances (default: 700) */
5628
- silenceTimeoutExtendedMs?: number;
5629
- /** Enable adaptive timeout based on speech duration (default: true) */
5630
- adaptiveTimeout?: boolean;
5631
- /** Minimum audio duration in seconds (default: 0.3) */
5632
- minAudioDurationSec?: number;
5633
- /** Minimum audio energy (default: 0.02) */
5634
- minAudioEnergy?: number;
5635
- /** Enable audio normalization for quiet audio (default: true) */
5636
- normalizeAudio?: boolean;
5637
- /** Progressive transcription interval — desktop (default: 500ms) */
5638
- progressiveIntervalMs?: number;
5639
- /** Progressive transcription interval — iOS (default: 800ms) */
5640
- progressiveIntervalIosMs?: number;
5641
- /** Coverage threshold to use progressive result (default: 0.8) */
5642
- progressiveCoverageThreshold?: number;
5643
- /** Minimum samples before progressive transcription starts (default: 8000) */
5644
- progressiveMinSamples?: number;
5645
- /** Timeout for individual SenseVoice.transcribe() calls (default: 10000ms) */
5646
- transcriptionTimeoutMs?: number;
5647
- /** Enable barge-in detection (default: true) */
5648
- interruptionEnabled?: boolean;
5649
- /** Minimum speech duration for interruption (default: 200ms) */
5650
- interruptionMinSpeechMs?: number;
5651
- /** Audio playback delay (default: auto-detected) */
5652
- audioDelayMs?: number;
5653
- /** Coalescer target duration (default: 200ms) */
5654
- chunkTargetMs?: number;
5655
- /** Enable neutral transition on playback complete (default: true) */
5656
- neutralTransitionEnabled?: boolean;
5657
- /** Duration of neutral fade-out (default: 250ms) */
5658
- neutralTransitionMs?: number;
5659
- }
5660
- /** Cloud TTS mode: consumer handles response + audio streaming */
5661
- interface VoicePipelineCloudConfig extends VoicePipelineBaseConfig {
5662
- mode: 'cloud';
5663
- /** Consumer's response handler (streams audio back) */
5664
- onResponse: ResponseHandler;
5665
- }
5666
- /** Local TTS mode: SDK handles synthesis internally via TTSBackend */
5667
- interface VoicePipelineLocalConfig extends VoicePipelineBaseConfig {
5668
- mode: 'local';
5669
- /**
5670
- * TTS backend (e.g., KokoroTTSInference). Provide either `tts` or `ttsConfig`.
5671
- *
5672
- * When `tts` is provided, VoicePipeline uses it as-is. On iOS, this means
5673
- * inference runs on the main thread (may cause UI freezes).
5674
- *
5675
- * Prefer `ttsConfig` for automatic unified worker integration on iOS.
5676
- */
5677
- tts?: TTSBackend;
5678
- /**
5679
- * Kokoro TTS configuration. When provided, VoicePipeline creates the TTS
5680
- * internally and passes the unified worker on iOS for off-main-thread inference.
5681
- *
5682
- * Takes precedence over `tts` if both are provided.
5683
- */
5684
- ttsConfig?: {
5685
- defaultVoice?: string;
5686
- speed?: number;
5687
- modelUrl?: string;
5688
- voiceBaseUrl?: string;
5689
- };
5690
- /** Optional text transform (e.g., LLM call). Receives transcript, returns response text. */
5691
- onTranscript?: (text: string) => string | Promise<string>;
5692
- }
5693
- type VoicePipelineConfig = VoicePipelineCloudConfig | VoicePipelineLocalConfig;
5694
- interface VoicePipelineEvents {
5695
- 'state': VoicePipelineState;
5696
- 'loading:progress': LoadingProgress;
5697
- 'transcript': TranscriptResult;
5698
- 'frame': FullFaceFrame;
5699
- 'frame:raw': Float32Array;
5700
- 'speech:start': void;
5701
- 'speech:end': {
5702
- durationMs: number;
5703
- };
5704
- 'playback:start': {
5705
- time: number;
5706
- };
5707
- 'playback:complete': void;
5708
- 'interruption': void;
5709
- 'audio:level': {
5710
- rms: number;
5711
- peak: number;
5712
- };
5713
- 'error': Error;
5714
- [key: string]: unknown;
5715
- }
5716
- declare class VoicePipeline extends EventEmitter<VoicePipelineEvents> {
5717
- private readonly config;
5718
- private readonly isLocalMode;
5719
- private _state;
5720
- private stopped;
5721
- private epoch;
5722
- private _sessionId;
5723
- private asr;
5724
- private lam;
5725
- private vad;
5726
- private unifiedWorker;
5727
- private playback;
5728
- private interruption;
5729
- private omoteEvents;
5730
- private mic;
5731
- private static readonly MAX_AUDIO_BUFFER_SAMPLES;
5732
- private audioBuffer;
5733
- private audioBufferSamples;
5734
- private speechStartTime;
5735
- private silenceTimer;
5736
- private isSpeaking;
5737
- private progressiveTimer;
5738
- private progressivePromise;
5739
- private lastProgressiveResult;
5740
- private lastProgressiveSamples;
5741
- private asrErrorCount;
5742
- private progressiveErrorCount;
5743
- private responseAbortController;
5744
- private _unsubChunk;
5745
- private _unsubLevel;
5746
- private _currentFrame;
5747
- /** Current pipeline state */
5748
- get state(): VoicePipelineState;
5749
- /** Latest blendshape frame */
5750
- get currentFrame(): Float32Array | null;
5751
- /** Whether user is currently speaking */
5752
- get isSpeechActive(): boolean;
5753
- /** Session ID (generated on start(), null before) */
5754
- get sessionId(): string | null;
5755
- constructor(config: VoicePipelineConfig);
5756
- loadModels(): Promise<void>;
5757
- /**
5758
- * Load from pre-built backends (dependency injection path).
5759
- * Loads any backends that aren't loaded yet.
5760
- */
5761
- private loadFromBackends;
5762
- /**
5763
- * Load from factories (original path). Loads SenseVoice, LAM, and VAD in parallel.
5764
- */
5765
- private loadFromFactories;
5766
- start(): Promise<void>;
5767
- stop(): void;
5768
- setProfile(profile: ExpressionProfile): void;
5769
- dispose(): Promise<void>;
5770
- private processAudioChunk;
5771
- private getSilenceTimeout;
5772
- private onSilenceDetected;
5773
- private processEndOfSpeech;
5774
- private callResponseHandler;
5775
- /** Cloud mode: delegate to consumer's onResponse handler */
5776
- private handleCloudResponse;
5777
- /** Local mode: synthesize text with TTSBackend, stream to PlaybackPipeline */
5778
- private handleLocalResponse;
5779
- private handleInterruption;
5780
- private startProgressiveTranscription;
5781
- private stopProgressiveTranscription;
5782
- private transcribeWithTimeout;
5783
- private normalizeAudio;
5784
- private setState;
5785
- private emitProgress;
5786
- private clearSilenceTimer;
5787
- }
5788
-
5789
4730
  /**
5790
4731
  * VoiceOrchestrator — Shared voice wiring for OmoteAvatar adapters.
5791
4732
  *
@@ -5803,6 +4744,11 @@ interface VoiceOrchestratorBaseConfig {
5803
4744
  listener?: SpeechListenerConfig;
5804
4745
  interruptionEnabled?: boolean;
5805
4746
  profile?: ExpressionProfile;
4747
+ onStateChange?: (state: ConversationalState) => void;
4748
+ onLoadingProgress?: (progress: LoadingProgress) => void;
4749
+ onError?: (error: Error) => void;
4750
+ onTranscriptEvent?: (result: TranscriptResult) => void;
4751
+ onInterruption?: () => void;
5806
4752
  }
5807
4753
  interface VoiceOrchestratorLocalConfig extends VoiceOrchestratorBaseConfig {
5808
4754
  mode?: 'local';
@@ -5816,12 +4762,23 @@ interface VoiceOrchestratorCloudConfig extends VoiceOrchestratorBaseConfig {
5816
4762
  lam?: {
5817
4763
  modelUrl?: string;
5818
4764
  externalDataUrl?: string | false;
4765
+ unifiedWorker?: UnifiedInferenceWorker;
5819
4766
  };
4767
+ identityIndex?: number;
4768
+ neutralTransitionEnabled?: boolean;
5820
4769
  }
5821
4770
  type VoiceOrchestratorConfig = VoiceOrchestratorLocalConfig | VoiceOrchestratorCloudConfig;
5822
4771
  interface VoiceOrchestratorEvents {
5823
4772
  'state': ConversationalState;
5824
4773
  'transcript': TranscriptResult;
4774
+ 'interruption': void;
4775
+ 'loading:progress': LoadingProgress;
4776
+ 'error': Error;
4777
+ 'audio:level': {
4778
+ rms: number;
4779
+ peak: number;
4780
+ };
4781
+ 'playback:complete': void;
5825
4782
  [key: string]: unknown;
5826
4783
  }
5827
4784
  declare class VoiceOrchestrator extends EventEmitter<VoiceOrchestratorEvents> {
@@ -5830,6 +4787,8 @@ declare class VoiceOrchestrator extends EventEmitter<VoiceOrchestratorEvents> {
5830
4787
  private ttsSpeaker;
5831
4788
  private playbackPipeline;
5832
4789
  private ownedLam;
4790
+ private ownedWorker;
4791
+ private usesSharedWorker;
5833
4792
  private transcriptUnsub;
5834
4793
  private audioChunkUnsub;
5835
4794
  private connectEpoch;
@@ -5853,10 +4812,14 @@ declare class VoiceOrchestrator extends EventEmitter<VoiceOrchestratorEvents> {
5853
4812
  speak(text: string, options?: {
5854
4813
  signal?: AbortSignal;
5855
4814
  voice?: string;
4815
+ speed?: number;
4816
+ language?: string;
5856
4817
  }): Promise<void>;
5857
4818
  streamText(options?: {
5858
4819
  signal?: AbortSignal;
5859
4820
  voice?: string;
4821
+ speed?: number;
4822
+ language?: string;
5860
4823
  }): Promise<{
5861
4824
  push: (token: string) => void;
5862
4825
  end: () => Promise<void>;
@@ -5868,4 +4831,4 @@ declare class VoiceOrchestrator extends EventEmitter<VoiceOrchestratorEvents> {
5868
4831
  private setState;
5869
4832
  }
5870
4833
 
5871
- export { type A2EBackend, A2EInference, type A2EInferenceConfig, type A2EModelInfo, A2EProcessor, type A2EProcessorConfig, type A2EResult, A2EUnifiedAdapter, ALL_AUS, ARKIT_BLENDSHAPES, type AUActivation, AU_TO_ARKIT, type ActiveSpan, type AnimationClip, type AnimationController, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationSource, type AnimationSourceOptions, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, BLENDSHAPE_TO_GROUP, type BackendPreference, type BlendWeight, type BlendshapeGroup, BlendshapeSmoother, type BlendshapeSmootherConfig, type BoneFilterConfig, type CacheConfig, type CacheSpanAttributes, CharacterController, type CharacterControllerConfig, type CharacterProfile, type CharacterUpdateInput, type CharacterUpdateOutput, ConsoleExporter, type ConversationalState, type CreateA2EConfig, type CreateKokoroTTSConfig, type CreateSenseVoiceConfig, type CreateTTSPlayerConfig, DEFAULT_ANIMATION_CONFIG, DEFAULT_BONE_FILTER, DEFAULT_MODEL_URLS, EMOTION_NAMES, EMOTION_TO_AU, EMOTION_VECTOR_SIZE, EXPLICIT_EMOTION_COUNT, type ElevenLabsConfig, ElevenLabsTTSBackend, type EmotionAnimationMap, EmotionController, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, EmotionResolver, type EmotionWeights, EmphasisDetector, type ErrorType, ErrorTypes, EventEmitter, type ExpressionProfile, FaceCompositor, type FaceCompositorConfig, type FaceCompositorInput, type FaceCompositorOutput, type FetchWithCacheOptions, type FrameSource, type FullFaceFrame, HF_CDN_URLS, INFERENCE_LATENCY_BUCKETS, type InferenceFactoryConfig, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, KOKORO_VOICES, type KokoroStreamChunk, type KokoroTTSConfig, KokoroTTSInference, type KokoroTTSModelInfo, type 
KokoroTTSResult, KokoroTTSUnifiedAdapter, KokoroTTSWorker, type KokoroVoiceName, LAM_BLENDSHAPES, type LifeLayerConfig, type LifeLayerInput, type LifeLayerOutput, type LoadingProgress, MIXAMO_PREFIX, MODEL_LOAD_TIME_BUCKETS, type MetricData, MetricNames, MicLipSync, type MicLipSyncConfig, type MicLipSyncEvents, type MicLipSyncFrame, type MicLipSyncState, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, type ModelUrlKey, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, PRESERVE_POSITION_BONES, PlaybackPipeline, type PlaybackPipelineConfig, type PlaybackPipelineEvents, type PlaybackState, type PollyConfig, type PollySynthesizeResult, PollyTTSBackend, ProceduralLifeLayer, type Quat, type QuotaInfo, type ResolvedEmotion, type ResponseHandler, RingBuffer, type RuntimeBackend, type SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SenseVoiceBackend, type SenseVoiceConfig, SenseVoiceInference, type SenseVoiceLanguage, type SenseVoiceModelInfo, type SenseVoiceResult, SenseVoiceUnifiedAdapter, SenseVoiceWorker, type SenseVoiceWorkerConfig, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADUnifiedAdapter, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, SpeechListener, type SpeechListenerConfig, type SpeechListenerEvents, type SpeechListenerState, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, type SynthesizeOptions, type TTSBackend, type TTSChunk, TTSPlayback, type TTSPlaybackConfig, type TTSPlaybackEvents, TTSPlayer, TTSSpeaker, type TTSSpeakerConfig, type TTSStreamOptions, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TrackDescriptor, type TranscriptResult, type Transition, UnifiedInferenceWorker, type VADBackend, type VADModelInfo, type VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, type Vec3, VoiceOrchestrator, 
type VoiceOrchestratorCloudConfig, type VoiceOrchestratorConfig, type VoiceOrchestratorEvents, type VoiceOrchestratorLocalConfig, VoicePipeline, type VoicePipelineCloudConfig, type VoicePipelineConfig, type VoicePipelineEvents, type VoicePipelineLocalConfig, type VoicePipelineState, A2EInference as Wav2Vec2Inference, type WorkerHealthState, analyzeTextEmotion, applyProfile, blendEmotions, calculatePeak, calculateRMS, configureCacheLimit, configureModelUrls, configureOrtCdn, configureTelemetry, createA2E, createEmotionVector, createKokoroTTS, createSenseVoice, createSileroVAD, createTTSPlayer, fetchWithCache, float32ToPcm16, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getModelCache, getOptimalWasmThreads, getOrtCdnBase, getRecommendedBackend, getTelemetry, hasWebGPUApi, int16ToFloat32, isAndroid, isIOS, isIOSSafari, isMobile, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpBlendshapes, lerpEmotion, listVoices as listKokoroVoices, parseEmotionTags, pcm16ToFloat32, preloadModels, resampleLinear, resetModelUrls, resolveBackend, resolveEmotion, shouldEnableWasmProxy, shouldKeepTrack, shouldUseNativeASR, shouldUseServerA2E, stripMixamoPrefix, supportsVADWorker, ttsToPlaybackFormat, validateTTSInput };
4834
+ export { type A2EBackend, type A2EModelInfo, A2EProcessor, type A2EProcessorConfig, type A2EResult, A2EUnifiedAdapter, ALL_AUS, ARKIT_BLENDSHAPES, type AUActivation, AU_TO_ARKIT, type ActiveSpan, type AnimationClip, type AnimationController, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationSource, type AnimationSourceOptions, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, BLENDSHAPE_TO_GROUP, type BackendPreference, type BlendWeight, type BlendshapeGroup, BlendshapeSmoother, type BlendshapeSmootherConfig, type BoneFilterConfig, type CacheConfig, type CacheSpanAttributes, CharacterController, type CharacterControllerConfig, type CharacterProfile, type CharacterUpdateInput, type CharacterUpdateOutput, ConsoleExporter, type ConversationalState, type CreateA2EConfig, type CreateKokoroTTSConfig, type CreateSenseVoiceConfig, type CreateTTSPlayerConfig, DEFAULT_ANIMATION_CONFIG, DEFAULT_BONE_FILTER, DEFAULT_MODEL_URLS, EMOTION_NAMES, EMOTION_TO_AU, EMOTION_VECTOR_SIZE, EXPLICIT_EMOTION_COUNT, type ElevenLabsConfig, ElevenLabsTTSBackend, type EmotionAnimationMap, EmotionController, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, EmotionResolver, type EmotionWeights, EmphasisDetector, type ErrorType, ErrorTypes, EventEmitter, type ExpressionProfile, FaceCompositor, type FaceCompositorConfig, type FaceCompositorInput, type FaceCompositorOutput, type FetchWithCacheOptions, type FrameSource, type FullFaceFrame, HF_CDN_URLS, INFERENCE_LATENCY_BUCKETS, type InferenceFactoryConfig, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, KOKORO_VOICES, type KokoroStreamChunk, type KokoroTTSConfig, type KokoroTTSModelInfo, type KokoroTTSResult, KokoroTTSUnifiedAdapter, type KokoroVoiceName, 
LAM_BLENDSHAPES, type LifeLayerConfig, type LifeLayerInput, type LifeLayerOutput, type LoadingProgress, MIXAMO_PREFIX, MODEL_LOAD_TIME_BUCKETS, type MetricData, MetricNames, MicLipSync, type MicLipSyncConfig, type MicLipSyncEvents, type MicLipSyncFrame, type MicLipSyncState, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, type ModelUrlKey, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, PRESERVE_POSITION_BONES, PlaybackPipeline, type PlaybackPipelineConfig, type PlaybackPipelineEvents, type PlaybackState, ProceduralLifeLayer, type Quat, type QuotaInfo, type ResolvedEmotion, type ResponseHandler, RingBuffer, type RuntimeBackend, type SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SenseVoiceBackend, type SenseVoiceConfig, type SenseVoiceLanguage, type SenseVoiceModelInfo, type SenseVoiceResult, SenseVoiceUnifiedAdapter, type SenseVoiceWorkerConfig, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADUnifiedAdapter, type SpanAttributes, type SpanData, type SpeechErrorCallback, SpeechListener, type SpeechListenerConfig, type SpeechListenerEvents, type SpeechListenerState, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, type SynthesizeOptions, type TTSBackend, type TTSChunk, TTSPlayback, type TTSPlaybackConfig, type TTSPlaybackEvents, TTSPlayer, TTSSpeaker, type TTSSpeakerConfig, type TTSStreamOptions, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TrackDescriptor, type TranscriptResult, type Transition, UnifiedInferenceWorker, type VADBackend, type VADModelInfo, type VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, type Vec3, VoiceOrchestrator, type VoiceOrchestratorCloudConfig, type VoiceOrchestratorConfig, type VoiceOrchestratorEvents, type VoiceOrchestratorLocalConfig, type VoicePipelineState, type WorkerHealthState, analyzeTextEmotion, applyProfile, 
blendEmotions, calculatePeak, calculateRMS, configureCacheLimit, configureModelUrls, configureTelemetry, createA2E, createEmotionVector, createKokoroTTS, createSenseVoice, createSileroVAD, createTTSPlayer, fetchWithCache, float32ToPcm16, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getModelCache, getOptimalWasmThreads, getRecommendedBackend, getTelemetry, hasWebGPUApi, int16ToFloat32, isAndroid, isIOS, isIOSSafari, isMobile, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpBlendshapes, lerpEmotion, listVoices as listKokoroVoices, parseEmotionTags, pcm16ToFloat32, preloadModels, resampleLinear, resetModelUrls, resolveBackend, resolveEmotion, shouldEnableWasmProxy, shouldKeepTrack, shouldUseNativeASR, shouldUseServerA2E, stripMixamoPrefix, ttsToPlaybackFormat, validateTTSInput };