@omote/core 0.9.7 → 0.10.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. package/README.md +77 -35
  2. package/dist/{chunk-X5OTUOE6.mjs → chunk-3FILA2CD.mjs} +63 -205
  3. package/dist/chunk-3FILA2CD.mjs.map +1 -0
  4. package/dist/{chunk-CYBTTLG7.mjs → chunk-5WIOGMJA.mjs} +77 -219
  5. package/dist/chunk-5WIOGMJA.mjs.map +1 -0
  6. package/dist/{chunk-3NDJA3I4.mjs → chunk-NWZMIQK4.mjs} +135 -206
  7. package/dist/chunk-NWZMIQK4.mjs.map +1 -0
  8. package/dist/{chunk-Y3DTP5P3.mjs → chunk-VSYYT4HO.mjs} +1 -1
  9. package/dist/{chunk-X5OTUOE6.mjs.map → chunk-VSYYT4HO.mjs.map} +1 -1
  10. package/dist/chunk-WW4XAUJ3.mjs +208 -0
  11. package/dist/chunk-WW4XAUJ3.mjs.map +1 -0
  12. package/dist/index.d.mts +336 -1375
  13. package/dist/index.d.ts +336 -1375
  14. package/dist/index.js +6738 -11284
  15. package/dist/index.js.map +1 -1
  16. package/dist/index.mjs +6099 -10719
  17. package/dist/index.mjs.map +1 -1
  18. package/dist/logging/index.js +5 -0
  19. package/dist/logging/index.js.map +1 -1
  20. package/dist/logging/index.mjs +1 -1
  21. package/dist/otlp-2BML6FIK.mjs +7 -0
  22. package/dist/otlp-2BML6FIK.mjs.map +1 -0
  23. package/package.json +1 -2
  24. package/dist/Logger-BeUI6jG7.d.mts +0 -145
  25. package/dist/Logger-BeUI6jG7.d.ts +0 -145
  26. package/dist/Logger-DSoGAYJu.d.mts +0 -141
  27. package/dist/Logger-DSoGAYJu.d.ts +0 -141
  28. package/dist/chunk-3NDJA3I4.mjs.map +0 -1
  29. package/dist/chunk-CYBTTLG7.mjs.map +0 -1
  30. package/dist/chunk-ESU52TDS.mjs +0 -287
  31. package/dist/chunk-ESU52TDS.mjs.map +0 -1
  32. package/dist/chunk-MXKJOF4I.mjs +0 -38
  33. package/dist/chunk-MXKJOF4I.mjs.map +0 -1
  34. package/dist/chunk-XK22BRG4.mjs +0 -38
  35. package/dist/chunk-XK22BRG4.mjs.map +0 -1
  36. package/dist/chunk-Y3DTP5P3.mjs.map +0 -1
package/dist/index.d.ts CHANGED
@@ -470,7 +470,7 @@ declare function shouldUseServerA2E(): boolean;
470
470
  /**
471
471
  * Common interface for audio-to-expression (A2E) inference backends
472
472
  *
473
- * Implemented by A2EInference and A2EUnifiedAdapter, allowing PlaybackPipeline
473
+ * Implemented by A2EUnifiedAdapter, allowing PlaybackPipeline
474
474
  * and A2EProcessor to work with either implementation transparently.
475
475
  *
476
476
  * @category Inference
@@ -488,11 +488,11 @@ interface A2EModelInfo {
488
488
  /**
489
489
  * Result from A2E inference
490
490
  *
491
- * All implementations must return blendshapes in LAM_BLENDSHAPES order (alphabetical).
491
+ * All implementations must return blendshapes in ARKIT_BLENDSHAPES order (alphabetical).
492
492
  * Models with different native orderings must remap internally before returning.
493
493
  */
494
494
  interface A2EResult {
495
- /** Blendshape weights [frames, 52] in LAM_BLENDSHAPES order - 30fps */
495
+ /** Blendshape weights [frames, 52] in ARKIT_BLENDSHAPES order - 30fps */
496
496
  blendshapes: Float32Array[];
497
497
  /** Number of blendshape frames */
498
498
  numFrames: number;
@@ -507,10 +507,8 @@ interface A2EResult {
507
507
  * pipeline — A2E is the interface abstraction, LAM is the model.
508
508
  *
509
509
  * Implemented by:
510
- * - {@link A2EInference} (WebGPU/WASM, 192MB fp16)
511
- * - A2EUnifiedAdapter (shared unified worker)
510
+ * - {@link A2EUnifiedAdapter} (shared unified worker)
512
511
  *
513
- * @see {@link A2EInference} for direct usage
514
512
  * @see {@link createA2E} for the recommended factory API
515
513
  */
516
514
  interface A2EBackend {
@@ -531,7 +529,7 @@ interface A2EBackend {
531
529
  * Run inference on raw audio
532
530
  * @param audioSamples - Float32Array of raw audio at 16kHz
533
531
  * @param identityIndex - Optional identity index (ignored by CPU model)
534
- * @returns A2E result with blendshapes in LAM_BLENDSHAPES order
532
+ * @returns A2E result with blendshapes in ARKIT_BLENDSHAPES order
535
533
  */
536
534
  infer(audioSamples: Float32Array, identityIndex?: number): Promise<A2EResult>;
537
535
  /**
@@ -544,7 +542,7 @@ interface A2EBackend {
544
542
  * ExpressionProfile - Per-character weight scaling for A2E blendshape output
545
543
  *
546
544
  * Maps blendshape groups (eyes, brows, jaw, mouth, cheeks, nose, tongue)
547
- * to weight scalers. Used by PlaybackPipeline, MicLipSync, and VoicePipeline.
545
+ * to weight scalers. Used by PlaybackPipeline, MicLipSync, and VoiceOrchestrator.
548
546
  *
549
547
  * @category Audio
550
548
  */
@@ -575,7 +573,7 @@ interface ExpressionProfile {
575
573
  overrides?: Partial<Record<string, number>>;
576
574
  }
577
575
  /**
578
- * Map each LAM_BLENDSHAPES entry to its BlendshapeGroup.
576
+ * Map each ARKIT_BLENDSHAPES entry to its BlendshapeGroup.
579
577
  * Built once at module load from prefix matching.
580
578
  */
581
579
  declare const BLENDSHAPE_TO_GROUP: Map<string, BlendshapeGroup>;
@@ -698,6 +696,8 @@ declare class PlaybackPipeline extends EventEmitter<PlaybackPipelineEvents> {
698
696
  constructor(config: PlaybackPipelineConfig);
699
697
  /** Initialize AudioContext (lazy, call after user gesture) */
700
698
  initialize(): Promise<void>;
699
+ /** Eagerly create AudioContext. Call from user gesture for iOS. */
700
+ warmup(): Promise<void>;
701
701
  /** Update ExpressionProfile at runtime */
702
702
  setProfile(profile: ExpressionProfile): void;
703
703
  /** Set the emotion label to include in emitted frames */
@@ -744,7 +744,7 @@ declare class PlaybackPipeline extends EventEmitter<PlaybackPipelineEvents> {
744
744
  * TTSBackend — Streaming text-to-speech backend interface.
745
745
  *
746
746
  * Any TTS engine (Kokoro, ElevenLabs, etc.) can implement this contract
747
- * to integrate with TTSPlayback and VoicePipeline.
747
+ * to integrate with TTSPlayback and VoiceOrchestrator.
748
748
  *
749
749
  * @category Inference
750
750
  */
@@ -788,6 +788,10 @@ interface TTSStreamOptions {
788
788
  voice?: string;
789
789
  /** Speed multiplier override per-call */
790
790
  speed?: number;
791
+ /** Language override per-call (e.g. 'en-us', 'ja'). Default: derived from voice name. */
792
+ language?: string;
793
+ /** When true, emit the entire text as a single chunk (no sentence splitting). */
794
+ singleShot?: boolean;
791
795
  }
792
796
  /**
793
797
  * A single chunk of TTS audio output
@@ -863,7 +867,11 @@ declare class TTSPlayback extends EventEmitter<TTSPlaybackEvents> {
863
867
  speak(text: string, options?: {
864
868
  signal?: AbortSignal;
865
869
  voice?: string;
870
+ speed?: number;
871
+ language?: string;
866
872
  }): Promise<void>;
873
+ /** Eagerly create AudioContext. Call from user gesture for iOS. */
874
+ warmup(): Promise<void>;
867
875
  /** Dispose of all resources. */
868
876
  dispose(): Promise<void>;
869
877
  private speakWithPrefetch;
@@ -900,34 +908,9 @@ declare class TTSPlayback extends EventEmitter<TTSPlaybackEvents> {
900
908
  declare function isWebGPUAvailable(): Promise<boolean>;
901
909
 
902
910
  /**
903
- * SenseVoice automatic speech recognition using ONNX Runtime Web
904
- *
905
- * Non-autoregressive CTC-based ASR that is 5x faster than Whisper-Small.
906
- * Runs entirely in browser via WebGPU or WASM. No transformers.js dependency.
907
- *
908
- * Uses the sherpa-onnx SenseVoice export (model.int8.onnx, 239MB int8 quantized).
909
- * Also provides emotion detection, language identification, and audio event detection
910
- * from the same forward pass.
911
+ * SenseVoice type definitions
911
912
  *
912
913
  * @category Inference
913
- *
914
- * @example Basic usage
915
- * ```typescript
916
- * import { SenseVoiceInference } from '@omote/core';
917
- *
918
- * const asr = new SenseVoiceInference({
919
- * modelUrl: '/models/sensevoice/model.int8.onnx',
920
- * tokensUrl: '/models/sensevoice/tokens.txt',
921
- * });
922
- * await asr.load();
923
- *
924
- * const { text, emotion, language } = await asr.transcribe(audioSamples);
925
- * console.log(text); // "Hello world"
926
- * console.log(emotion); // "NEUTRAL"
927
- * console.log(language); // "en"
928
- * ```
929
- *
930
- * @module inference/SenseVoiceInference
931
914
  */
932
915
 
933
916
  type SenseVoiceLanguage = 'auto' | 'zh' | 'en' | 'ja' | 'ko' | 'yue';
@@ -964,76 +947,49 @@ interface SenseVoiceModelInfo {
964
947
  outputNames: string[];
965
948
  vocabSize: number;
966
949
  }
967
- declare class SenseVoiceInference {
968
- private session;
969
- private ort;
970
- private config;
971
- private _backend;
972
- private isLoading;
973
- private inferenceQueue;
974
- private poisoned;
975
- private static readonly INFERENCE_TIMEOUT_MS;
976
- private lastLfrFrames;
977
- private webgpuShapeWarned;
978
- private tokenMap;
979
- private negMean;
980
- private invStddev;
981
- private languageId;
982
- private textNormId;
983
- constructor(config: SenseVoiceConfig);
984
- get backend(): RuntimeBackend | null;
985
- get isLoaded(): boolean;
950
+ /**
951
+ * Configuration for SenseVoice Worker (used by SenseVoiceUnifiedAdapter)
952
+ */
953
+ interface SenseVoiceWorkerConfig {
954
+ /** Path or URL to model.int8.onnx (239MB) */
955
+ modelUrl: string;
956
+ /** Path or URL to tokens.txt vocabulary file (default: sibling of modelUrl) */
957
+ tokensUrl?: string;
958
+ /** Language hint (default: 'auto' for auto-detection) */
959
+ language?: SenseVoiceLanguage;
960
+ /** Text normalization: 'with_itn' applies inverse text normalization (default: 'with_itn') */
961
+ textNorm?: 'with_itn' | 'without_itn';
962
+ }
963
+ /**
964
+ * Common interface for SenseVoice implementations
965
+ */
966
+ interface SenseVoiceBackend {
967
+ /** Whether the model is loaded and ready for inference */
968
+ readonly isLoaded: boolean;
969
+ /** Current backend type ('wasm' for worker, full backend for main thread, null if not loaded) */
970
+ readonly backend: 'wasm' | 'webgpu' | null;
971
+ /**
972
+ * Load the ONNX model
973
+ * @param onProgress - Optional progress callback (fires once at 100% for worker)
974
+ * @returns Model loading information
975
+ */
986
976
  load(onProgress?: (loaded: number, total: number) => void): Promise<SenseVoiceModelInfo>;
987
977
  /**
988
978
  * Transcribe audio samples to text
989
- *
990
- * @param audioSamples Float32Array of audio samples at 16kHz, [-1, 1] range
991
- * @returns Transcription result with text, emotion, language, and event
979
+ * @param audioSamples - Float32Array of audio samples at 16kHz
980
+ * @returns Transcription result
992
981
  */
993
982
  transcribe(audioSamples: Float32Array): Promise<SenseVoiceResult>;
994
- private queueInference;
983
+ /**
984
+ * Dispose of the model and free resources
985
+ */
995
986
  dispose(): Promise<void>;
996
987
  }
997
988
 
998
989
  /**
999
- * Silero VAD (Voice Activity Detection) inference
1000
- *
1001
- * Neural network-based VAD running in browser via ONNX Runtime Web.
1002
- * Much more accurate than RMS-based energy detection.
1003
- *
1004
- * Uses lazy loading to conditionally load WebGPU or WASM-only bundle:
1005
- * - iOS: Loads WASM-only bundle (WebGPU crashes due to Safari bugs)
1006
- * - Android/Desktop: Loads WebGPU bundle (with WASM fallback)
990
+ * Silero VAD type definitions
1007
991
  *
1008
992
  * @category Inference
1009
- *
1010
- * @example Basic usage
1011
- * ```typescript
1012
- * import { SileroVADInference } from '@omote/core';
1013
- *
1014
- * const vad = new SileroVADInference({
1015
- * modelUrl: '/models/silero-vad.onnx'
1016
- * });
1017
- * await vad.load();
1018
- *
1019
- * // Process 32ms chunks (512 samples at 16kHz)
1020
- * const probability = await vad.process(audioChunk);
1021
- * if (probability > 0.5) {
1022
- * console.log('Speech detected!');
1023
- * }
1024
- * ```
1025
- *
1026
- * @example Streaming with state management
1027
- * ```typescript
1028
- * // State is automatically maintained between process() calls
1029
- * // Call reset() when starting a new audio stream
1030
- * vad.reset();
1031
- *
1032
- * for (const chunk of audioChunks) {
1033
- * const prob = await vad.process(chunk);
1034
- * // prob is speech probability [0, 1]
1035
- * }
1036
- * ```
1037
993
  */
1038
994
 
1039
995
  type VADBackend = BackendPreference;
@@ -1103,117 +1059,6 @@ interface SpeechSegment {
1103
1059
  /** Average probability during segment */
1104
1060
  avgProbability: number;
1105
1061
  }
1106
- /**
1107
- * Silero VAD - Neural network voice activity detection
1108
- *
1109
- * Based on snakers4/silero-vad ONNX model.
1110
- * Processes 32ms chunks (512 samples at 16kHz) with LSTM state.
1111
- *
1112
- * @see https://github.com/snakers4/silero-vad
1113
- */
1114
- declare class SileroVADInference {
1115
- private session;
1116
- private ort;
1117
- private config;
1118
- private _backend;
1119
- private isLoading;
1120
- private state;
1121
- private context;
1122
- private readonly chunkSize;
1123
- private readonly contextSize;
1124
- private inferenceQueue;
1125
- private preSpeechBuffer;
1126
- private wasSpeaking;
1127
- private srTensor;
1128
- constructor(config: SileroVADConfig);
1129
- get backend(): RuntimeBackend | null;
1130
- get isLoaded(): boolean;
1131
- get sampleRate(): number;
1132
- get threshold(): number;
1133
- /**
1134
- * Get required chunk size in samples
1135
- */
1136
- getChunkSize(): number;
1137
- /**
1138
- * Get chunk duration in milliseconds
1139
- */
1140
- getChunkDurationMs(): number;
1141
- /**
1142
- * Check if WebGPU is available and working
1143
- * (iOS returns false even if navigator.gpu exists due to ONNX Runtime bugs)
1144
- */
1145
- static isWebGPUAvailable: typeof isWebGPUAvailable;
1146
- /**
1147
- * Load the ONNX model
1148
- */
1149
- load(): Promise<VADModelInfo>;
1150
- /**
1151
- * Reset state for new audio stream
1152
- */
1153
- reset(): void;
1154
- /**
1155
- * Process a single audio chunk
1156
- *
1157
- * @param audioChunk - Float32Array of exactly chunkSize samples (512 for 16kHz, 256 for 8kHz)
1158
- * @returns VAD result with speech probability
1159
- */
1160
- process(audioChunk: Float32Array): Promise<VADResult>;
1161
- /**
1162
- * Process audio and detect speech segments
1163
- *
1164
- * @param audio - Complete audio buffer
1165
- * @param options - Detection options
1166
- * @returns Array of speech segments
1167
- */
1168
- detectSpeech(audio: Float32Array, options?: {
1169
- /** Minimum speech duration in ms (default: 250) */
1170
- minSpeechDurationMs?: number;
1171
- /** Minimum silence duration to end segment in ms (default: 300) */
1172
- minSilenceDurationMs?: number;
1173
- /** Padding to add before/after speech in ms (default: 30) */
1174
- speechPadMs?: number;
1175
- }): Promise<SpeechSegment[]>;
1176
- /**
1177
- * Queue inference to serialize ONNX session calls
1178
- */
1179
- private queueInference;
1180
- /**
1181
- * Dispose of the model and free resources
1182
- */
1183
- dispose(): Promise<void>;
1184
- }
1185
-
1186
- /**
1187
- * Silero VAD Web Worker implementation
1188
- *
1189
- * Runs Silero VAD inference in a dedicated Web Worker to prevent main thread blocking.
1190
- * Uses inline worker script (Blob URL pattern) to avoid separate file deployment.
1191
- *
1192
- * Key design decisions:
1193
- * - WASM backend only (WebGPU doesn't work in Workers)
1194
- * - LSTM state serialized as Float32Array (Tensors can't cross worker boundary)
1195
- * - Audio copied (not transferred) to retain main thread access for pre-speech buffer
1196
- * - ONNX Runtime loaded from CDN in worker (no bundler complications)
1197
- *
1198
- * @category Inference
1199
- *
1200
- * @example Basic usage
1201
- * ```typescript
1202
- * import { SileroVADWorker } from '@omote/core';
1203
- *
1204
- * const vad = new SileroVADWorker({
1205
- * modelUrl: '/models/silero-vad.onnx'
1206
- * });
1207
- * await vad.load();
1208
- *
1209
- * // Process 32ms chunks (512 samples at 16kHz)
1210
- * const result = await vad.process(audioChunk);
1211
- * if (result.isSpeech) {
1212
- * console.log('Speech detected!', result.probability);
1213
- * }
1214
- * ```
1215
- */
1216
-
1217
1062
  /**
1218
1063
  * Configuration for Silero VAD Worker
1219
1064
  */
@@ -1226,13 +1071,6 @@ interface VADWorkerConfig {
1226
1071
  threshold?: number;
1227
1072
  /**
1228
1073
  * Number of audio chunks to keep in pre-speech buffer.
1229
- * When VAD triggers, these chunks are prepended to the speech buffer
1230
- * to capture the beginning of speech that occurred before detection.
1231
- *
1232
- * At 512 samples/chunk and 16kHz:
1233
- * - 10 chunks = 320ms of pre-speech audio
1234
- * - 15 chunks = 480ms of pre-speech audio
1235
- *
1236
1074
  * Default: 10 chunks (320ms)
1237
1075
  */
1238
1076
  preSpeechBufferChunks?: number;
@@ -1248,85 +1086,45 @@ interface VADWorkerModelInfo {
1248
1086
  sampleRate: number;
1249
1087
  chunkSize: number;
1250
1088
  }
1251
-
1252
1089
  /**
1253
- * Silero VAD Worker - Voice Activity Detection in a Web Worker
1254
- *
1255
- * Runs Silero VAD inference off the main thread to prevent UI blocking.
1256
- * Feature parity with SileroVADInference but runs in dedicated worker.
1257
- *
1258
- * @see SileroVADInference for main-thread version
1090
+ * Common interface for Silero VAD implementations
1259
1091
  */
1260
- declare class SileroVADWorker {
1261
- private worker;
1262
- private config;
1263
- private isLoading;
1264
- private _isLoaded;
1265
- private poisoned;
1266
- private state;
1267
- private context;
1268
- private readonly chunkSize;
1269
- private readonly contextSize;
1270
- private inferenceQueue;
1271
- private preSpeechBuffer;
1272
- private wasSpeaking;
1273
- private pendingResolvers;
1274
- private messageId;
1275
- constructor(config: VADWorkerConfig);
1276
- get isLoaded(): boolean;
1277
- /**
1278
- * Backend type (always 'wasm' for Worker, WebGPU not supported in Workers)
1279
- */
1280
- get backend(): 'wasm' | null;
1281
- get sampleRate(): number;
1282
- get threshold(): number;
1283
- /**
1284
- * Get required chunk size in samples
1285
- */
1286
- getChunkSize(): number;
1287
- /**
1288
- * Get chunk duration in milliseconds
1289
- */
1290
- getChunkDurationMs(): number;
1291
- /**
1292
- * Create the worker from inline script
1293
- */
1294
- private createWorker;
1295
- /**
1296
- * Handle messages from worker
1297
- */
1298
- private handleWorkerMessage;
1299
- /**
1300
- * Send message to worker and wait for response
1301
- */
1302
- private sendMessage;
1303
- /**
1304
- * Load the ONNX model in the worker
1305
- */
1306
- load(): Promise<VADWorkerModelInfo>;
1092
+ interface SileroVADBackend {
1093
+ /** Current backend type (webgpu, wasm, or null if not loaded) */
1094
+ readonly backend: RuntimeBackend | null;
1095
+ /** Whether the model is loaded and ready for inference */
1096
+ readonly isLoaded: boolean;
1097
+ /** Audio sample rate (8000 or 16000 Hz) */
1098
+ readonly sampleRate: number;
1099
+ /** Speech detection threshold (0-1) */
1100
+ readonly threshold: number;
1307
1101
  /**
1308
- * Reset state for new audio stream
1102
+ * Load the ONNX model
1103
+ * @returns Model loading information
1309
1104
  */
1310
- reset(): Promise<void>;
1105
+ load(): Promise<VADModelInfo | VADWorkerModelInfo>;
1311
1106
  /**
1312
1107
  * Process a single audio chunk
1313
- *
1314
- * @param audioChunk - Float32Array of exactly chunkSize samples (512 for 16kHz, 256 for 8kHz)
1108
+ * @param audioChunk - Float32Array of exactly chunkSize samples
1315
1109
  * @returns VAD result with speech probability
1316
1110
  */
1317
1111
  process(audioChunk: Float32Array): Promise<VADResult>;
1318
1112
  /**
1319
- * Queue inference to serialize worker calls
1113
+ * Reset state for new audio stream
1320
1114
  */
1321
- private queueInference;
1115
+ reset(): void | Promise<void>;
1322
1116
  /**
1323
- * Dispose of the worker and free resources
1117
+ * Dispose of the model and free resources
1324
1118
  */
1325
1119
  dispose(): Promise<void>;
1326
1120
  /**
1327
- * Check if Web Workers are supported
1121
+ * Get required chunk size in samples
1328
1122
  */
1329
- static isSupported(): boolean;
1123
+ getChunkSize(): number;
1124
+ /**
1125
+ * Get chunk duration in milliseconds
1126
+ */
1127
+ getChunkDurationMs(): number;
1330
1128
  }
1331
1129
 
1332
1130
  /**
@@ -1454,43 +1252,33 @@ declare class UnifiedInferenceWorker {
1454
1252
 
1455
1253
  /** Base config shared across all inference factory functions */
1456
1254
  interface InferenceFactoryConfig {
1457
- /**
1458
- * Worker mode:
1459
- * - 'auto' (default): Use Worker if supported, else main thread
1460
- * - true: Force Worker (throws if unsupported)
1461
- * - false: Force main thread
1462
- */
1463
- useWorker?: boolean | 'auto';
1464
1255
  /**
1465
1256
  * Unified inference worker instance.
1466
- * When provided, routes inference through the shared worker,
1257
+ * Routes inference through the shared worker,
1467
1258
  * keeping all inference off the main thread.
1468
- * Takes precedence over useWorker setting.
1469
1259
  */
1470
1260
  unifiedWorker?: UnifiedInferenceWorker;
1471
1261
  }
1472
1262
 
1473
1263
  /**
1474
- * Factory function for A2E inference
1264
+ * Factory function for A2E inference via UnifiedInferenceWorker
1475
1265
  *
1476
1266
  * Creates an A2EBackend instance with zero-config defaults (HuggingFace CDN).
1477
- * Supports unified worker mode for iOS off-main-thread inference.
1267
+ * Routes inference through the shared unified worker.
1478
1268
  *
1479
1269
  * @category Inference
1480
1270
  *
1481
- * @example Auto-detect (recommended, zero-config)
1271
+ * @example
1482
1272
  * ```typescript
1483
- * import { createA2E } from '@omote/core';
1273
+ * import { createA2E, UnifiedInferenceWorker } from '@omote/core';
1274
+ *
1275
+ * const worker = new UnifiedInferenceWorker();
1276
+ * await worker.init();
1484
1277
  *
1485
- * const a2e = createA2E(); // uses HF CDN defaults (192MB fp16)
1278
+ * const a2e = createA2E({ unifiedWorker: worker });
1486
1279
  * await a2e.load();
1487
1280
  * const { blendshapes } = await a2e.infer(audioSamples);
1488
1281
  * ```
1489
- *
1490
- * @example Custom model URL
1491
- * ```typescript
1492
- * const a2e = createA2E({ modelUrl: '/models/lam.onnx' });
1493
- * ```
1494
1282
  */
1495
1283
 
1496
1284
  /**
@@ -1506,13 +1294,13 @@ interface CreateA2EConfig extends InferenceFactoryConfig {
1506
1294
  * Set to `false` to skip external data loading (single-file models only).
1507
1295
  */
1508
1296
  externalDataUrl?: string | false;
1509
- /** Backend preference (default: 'auto') */
1510
- backend?: BackendPreference;
1511
1297
  /** Number of identity classes (default: 12) */
1512
1298
  numIdentityClasses?: number;
1513
1299
  }
1514
1300
  /**
1515
- * Create an A2E instance
1301
+ * Create an A2E instance via the unified worker.
1302
+ *
1303
+ * If no `unifiedWorker` is provided, a dedicated worker is created on load().
1516
1304
  *
1517
1305
  * @param config - Factory configuration
1518
1306
  * @returns An A2EBackend instance
@@ -1528,7 +1316,7 @@ declare function createA2E(config?: CreateA2EConfig): A2EBackend;
1528
1316
  /**
1529
1317
  * Generic frame source -- any object that emits 'frame' events with blendshapes.
1530
1318
  *
1531
- * Implemented by PlaybackPipeline, MicLipSync, VoicePipeline, and any custom source.
1319
+ * Implemented by PlaybackPipeline, MicLipSync, and any custom source.
1532
1320
  * Used by OmoteAvatar (all renderer adapters) to receive animation frames.
1533
1321
  */
1534
1322
  interface FrameSource {
@@ -1557,7 +1345,7 @@ interface TranscriptResult {
1557
1345
  inferenceTimeMs?: number;
1558
1346
  }
1559
1347
  /**
1560
- * Consumer's response handler. VoicePipeline calls this with transcribed text.
1348
+ * Consumer's response handler. VoiceOrchestrator calls this with transcribed text.
1561
1349
  * Consumer must stream audio back for playback + lip sync.
1562
1350
  */
1563
1351
  interface ResponseHandler {
@@ -1588,6 +1376,8 @@ interface ResponseHandler {
1588
1376
  */
1589
1377
 
1590
1378
  interface TTSSpeakerConfig {
1379
+ /** Skip LAM download — audio playback only, no lip sync. Default: false. */
1380
+ audioOnly?: boolean;
1591
1381
  /** Per-character expression weight scaling */
1592
1382
  profile?: ExpressionProfile;
1593
1383
  /** Identity/style index for A2E model (default: 0) */
@@ -1600,8 +1390,8 @@ interface TTSSpeakerConfig {
1600
1390
  neutralTransitionMs?: number;
1601
1391
  /** Pre-built A2E backend (skip internal createA2E). */
1602
1392
  lam?: A2EBackend;
1603
- /** LAM model config (only when lam not provided) */
1604
- models?: CreateA2EConfig;
1393
+ /** LAM model config (only when lam not provided). unifiedWorker is supplied by TTSSpeaker. */
1394
+ models?: Omit<CreateA2EConfig, 'unifiedWorker'>;
1605
1395
  /** Shared unified worker (recommended for iOS) */
1606
1396
  unifiedWorker?: UnifiedInferenceWorker;
1607
1397
  }
@@ -1610,6 +1400,7 @@ declare class TTSSpeaker {
1610
1400
  private tts;
1611
1401
  private ownedLam;
1612
1402
  private ownedWorker;
1403
+ private usesSharedWorker;
1613
1404
  private currentAbort;
1614
1405
  private _isSpeaking;
1615
1406
  private _audioOnly;
@@ -1623,11 +1414,8 @@ declare class TTSSpeaker {
1623
1414
  /**
1624
1415
  * Connect a TTS backend.
1625
1416
  *
1626
- * When config includes `lam`, `unifiedWorker`, or `models`, the full lip sync
1627
- * pipeline is created (LAM + TTSPlayback + PlaybackPipeline).
1628
- *
1629
- * When config is omitted or has none of those, audio-only mode is used:
1630
- * TTS → AudioScheduler (speakers only, no blendshapes, no LAM download).
1417
+ * By default, the full lip sync pipeline is created (auto-downloads LAM).
1418
+ * Pass `audioOnly: true` for audio-only mode (no blendshapes, no LAM download).
1631
1419
  *
1632
1420
  * @param tts - TTS backend to use for speech synthesis
1633
1421
  * @param config - Optional configuration for A2E, expression profile, etc.
@@ -1643,6 +1431,8 @@ declare class TTSSpeaker {
1643
1431
  speak(text: string, options?: {
1644
1432
  signal?: AbortSignal;
1645
1433
  voice?: string;
1434
+ speed?: number;
1435
+ language?: string;
1646
1436
  }): Promise<void>;
1647
1437
  /** Audio-only speak: TTS → resample → AudioScheduler (no blendshapes). */
1648
1438
  private speakAudioOnly;
@@ -1662,13 +1452,20 @@ declare class TTSSpeaker {
1662
1452
  streamText(options: {
1663
1453
  signal?: AbortSignal;
1664
1454
  voice?: string;
1455
+ speed?: number;
1456
+ language?: string;
1665
1457
  }): Promise<{
1666
1458
  push: (token: string) => void;
1667
1459
  end: () => Promise<void>;
1668
1460
  }>;
1669
1461
  /** streamText in audio-only mode: TTS → AudioScheduler (no blendshapes). */
1670
1462
  private streamTextAudioOnly;
1671
- /** Abort current speak if any. */
1463
+ /**
1464
+ * Warm up AudioContext for iOS/Safari autoplay policy.
1465
+ * Call from a user gesture handler (click/tap) before speak().
1466
+ */
1467
+ warmup(): Promise<void>;
1468
+ /** Abort current speak if any. Triggers neutral transition on PlaybackPipeline. */
1672
1469
  stop(): void;
1673
1470
  /** Clean teardown of all owned resources. */
1674
1471
  dispose(): Promise<void>;
@@ -1704,11 +1501,13 @@ interface CreateTTSPlayerConfig {
1704
1501
  modelUrl?: string;
1705
1502
  /** Voice data base URL override */
1706
1503
  voiceBaseUrl?: string;
1504
+ /** Shared unified worker (created automatically if not provided) */
1505
+ unifiedWorker?: UnifiedInferenceWorker;
1707
1506
  }
1708
1507
  /**
1709
1508
  * Zero-config TTS player. Speak text through speakers without an avatar.
1710
1509
  *
1711
- * Uses Kokoro TTS (82M q8, ~92MB) with automatic worker selection.
1510
+ * Uses Kokoro TTS (82M q8, ~92MB) with automatic worker creation.
1712
1511
  * No LAM model is downloaded — audio plays directly through AudioScheduler.
1713
1512
  */
1714
1513
  declare function createTTSPlayer(config?: CreateTTSPlayerConfig): TTSPlayer;
@@ -1717,254 +1516,27 @@ declare function createTTSPlayer(config?: CreateTTSPlayerConfig): TTSPlayer;
1717
1516
  */
1718
1517
  declare class TTSPlayer extends TTSSpeaker {
1719
1518
  private backend;
1720
- constructor(tts: TTSBackend);
1519
+ private ttsWorker;
1520
+ private ttsPlayerUsesSharedWorker;
1521
+ private ttsConfig;
1522
+ constructor(config?: CreateTTSPlayerConfig);
1721
1523
  /** Load TTS model and connect in audio-only mode. */
1722
1524
  load(): Promise<void>;
1723
1525
  /** Whether the TTS model is loaded and ready. */
1724
1526
  get isLoaded(): boolean;
1527
+ dispose(): Promise<void>;
1725
1528
  }
1726
1529
 
1727
1530
  /**
1728
- * Factory function for SenseVoice ASR with automatic Worker vs main thread selection
1729
- *
1730
- * Provides a unified API that automatically selects the optimal implementation:
1731
- * - Worker supported: Uses SenseVoiceWorker (off-main-thread inference)
1732
- * - Worker unsupported: Uses SenseVoiceInference (main thread)
1733
- *
1734
- * @category Inference
1735
- *
1736
- * @example Auto-detect (recommended)
1737
- * ```typescript
1738
- * import { createSenseVoice } from '@omote/core';
1531
+ * SpeechListener — Standalone listening primitive.
1739
1532
  *
1740
- * const asr = createSenseVoice({
1741
- * modelUrl: '/models/sensevoice/model.int8.onnx',
1742
- * });
1743
- * await asr.load();
1744
- * const { text, emotion } = await asr.transcribe(audioSamples);
1745
- * ```
1533
+ * Composes: MicrophoneCapture → SileroVAD → SenseVoice ASR → transcript events.
1534
+ * Used independently or alongside TTSSpeaker and VoiceOrchestrator.
1746
1535
  *
1747
- * @example Force worker
1748
- * ```typescript
1749
- * const asr = createSenseVoice({
1750
- * modelUrl: '/models/sensevoice/model.int8.onnx',
1751
- * useWorker: true,
1752
- * });
1753
- * ```
1536
+ * Does NOT handle TTS, LAM, or response routing — those belong to TTSSpeaker
1537
+ * and VoiceOrchestrator respectively.
1754
1538
  *
1755
- * @example Force main thread
1756
- * ```typescript
1757
- * const asr = createSenseVoice({
1758
- * modelUrl: '/models/sensevoice/model.int8.onnx',
1759
- * useWorker: false,
1760
- * });
1761
- * ```
1762
- */
1763
-
1764
- /**
1765
- * Common interface for both SenseVoiceInference and SenseVoiceWorker
1766
- */
1767
- interface SenseVoiceBackend {
1768
- /** Whether the model is loaded and ready for inference */
1769
- readonly isLoaded: boolean;
1770
- /** Current backend type ('wasm' for worker, full backend for main thread, null if not loaded) */
1771
- readonly backend: 'wasm' | 'webgpu' | null;
1772
- /**
1773
- * Load the ONNX model
1774
- * @param onProgress - Optional progress callback (fires once at 100% for worker)
1775
- * @returns Model loading information
1776
- */
1777
- load(onProgress?: (loaded: number, total: number) => void): Promise<SenseVoiceModelInfo>;
1778
- /**
1779
- * Transcribe audio samples to text
1780
- * @param audioSamples - Float32Array of audio samples at 16kHz
1781
- * @returns Transcription result
1782
- */
1783
- transcribe(audioSamples: Float32Array): Promise<SenseVoiceResult>;
1784
- /**
1785
- * Dispose of the model and free resources
1786
- */
1787
- dispose(): Promise<void>;
1788
- }
1789
- /**
1790
- * Configuration for the SenseVoice factory
1791
- */
1792
- interface CreateSenseVoiceConfig extends InferenceFactoryConfig {
1793
- /** Path or URL to model.int8.onnx (239MB). Default: HuggingFace CDN */
1794
- modelUrl?: string;
1795
- /** Path or URL to tokens.txt vocabulary file (default: sibling of modelUrl) */
1796
- tokensUrl?: string;
1797
- /** Language hint (default: 'auto') */
1798
- language?: SenseVoiceLanguage;
1799
- /** Text normalization (default: 'with_itn') */
1800
- textNorm?: 'with_itn' | 'without_itn';
1801
- }
1802
- /**
1803
- * Create a SenseVoice ASR instance with automatic implementation selection
1804
- *
1805
- * @param config - Factory configuration
1806
- * @returns A SenseVoiceBackend instance (either Worker or main thread)
1807
- */
1808
- declare function createSenseVoice(config?: CreateSenseVoiceConfig): SenseVoiceBackend;
1809
-
1810
- /**
1811
- * Factory function for Silero VAD with automatic Worker vs main thread selection
1812
- *
1813
- * Provides a unified API that automatically selects the optimal implementation:
1814
- * - Desktop browsers: Uses SileroVADWorker (off-main-thread inference)
1815
- * - Mobile devices: Uses SileroVADInference (main thread, avoids memory overhead)
1816
- * - Fallback: Gracefully falls back to main thread if Worker fails
1817
- *
1818
- * @category Inference
1819
- *
1820
- * @example Basic usage (auto-detect)
1821
- * ```typescript
1822
- * import { createSileroVAD } from '@omote/core';
1823
- *
1824
- * const vad = createSileroVAD({
1825
- * modelUrl: '/models/silero-vad.onnx',
1826
- * threshold: 0.5,
1827
- * });
1828
- *
1829
- * await vad.load();
1830
- * const result = await vad.process(audioChunk);
1831
- * if (result.isSpeech) {
1832
- * console.log('Speech detected!', result.probability);
1833
- * }
1834
- * ```
1835
- *
1836
- * @example Force worker usage
1837
- * ```typescript
1838
- * const vad = createSileroVAD({
1839
- * modelUrl: '/models/silero-vad.onnx',
1840
- * useWorker: true, // Force Worker even on mobile
1841
- * });
1842
- * ```
1843
- *
1844
- * @example Force main thread
1845
- * ```typescript
1846
- * const vad = createSileroVAD({
1847
- * modelUrl: '/models/silero-vad.onnx',
1848
- * useWorker: false, // Force main thread
1849
- * });
1850
- * ```
1851
- */
1852
-
1853
- /**
1854
- * Common interface for both SileroVADInference and SileroVADWorker
1855
- *
1856
- * This interface defines the shared API that both implementations provide,
1857
- * allowing consumers to use either interchangeably.
1858
- */
1859
- interface SileroVADBackend {
1860
- /** Current backend type (webgpu, wasm, or null if not loaded) */
1861
- readonly backend: RuntimeBackend | null;
1862
- /** Whether the model is loaded and ready for inference */
1863
- readonly isLoaded: boolean;
1864
- /** Audio sample rate (8000 or 16000 Hz) */
1865
- readonly sampleRate: number;
1866
- /** Speech detection threshold (0-1) */
1867
- readonly threshold: number;
1868
- /**
1869
- * Load the ONNX model
1870
- * @returns Model loading information
1871
- */
1872
- load(): Promise<VADModelInfo | VADWorkerModelInfo>;
1873
- /**
1874
- * Process a single audio chunk
1875
- * @param audioChunk - Float32Array of exactly chunkSize samples
1876
- * @returns VAD result with speech probability
1877
- */
1878
- process(audioChunk: Float32Array): Promise<VADResult>;
1879
- /**
1880
- * Reset state for new audio stream
1881
- */
1882
- reset(): void | Promise<void>;
1883
- /**
1884
- * Dispose of the model and free resources
1885
- */
1886
- dispose(): Promise<void>;
1887
- /**
1888
- * Get required chunk size in samples
1889
- */
1890
- getChunkSize(): number;
1891
- /**
1892
- * Get chunk duration in milliseconds
1893
- */
1894
- getChunkDurationMs(): number;
1895
- }
1896
- /**
1897
- * Configuration for the Silero VAD factory
1898
- *
1899
- * Extends SileroVADConfig with worker-specific options.
1900
- */
1901
- interface SileroVADFactoryConfig extends Omit<SileroVADConfig, 'modelUrl'>, InferenceFactoryConfig {
1902
- /** Path or URL to the ONNX model. Default: HuggingFace CDN */
1903
- modelUrl?: string;
1904
- /**
1905
- * Fallback to main thread on worker errors.
1906
- *
1907
- * When true (default), if the Worker fails to load or encounters an error,
1908
- * the factory will automatically create a main thread instance instead.
1909
- *
1910
- * When false, worker errors will propagate as exceptions.
1911
- *
1912
- * Default: true
1913
- */
1914
- fallbackOnError?: boolean;
1915
- }
1916
- /**
1917
- * Check if the current environment supports VAD Web Workers
1918
- *
1919
- * Requirements:
1920
- * - Worker constructor must exist
1921
- * - Blob URL support (for inline worker script)
1922
- *
1923
- * @returns true if VAD Worker is supported
1924
- */
1925
- declare function supportsVADWorker(): boolean;
1926
- /**
1927
- * Create a Silero VAD instance with automatic implementation selection
1928
- *
1929
- * This factory function automatically selects between:
1930
- * - **SileroVADWorker**: Off-main-thread inference (better for desktop)
1931
- * - **SileroVADInference**: Main thread inference (better for mobile)
1932
- *
1933
- * The selection is based on:
1934
- * 1. Explicit `useWorker` config (if provided)
1935
- * 2. Platform detection (mobile vs desktop)
1936
- * 3. Worker API availability
1937
- *
1938
- * Both implementations share the same interface (SileroVADBackend),
1939
- * so consumers can use either interchangeably.
1940
- *
1941
- * @param config - Factory configuration
1942
- * @returns A SileroVAD instance (either Worker or main thread)
1943
- *
1944
- * @example
1945
- * ```typescript
1946
- * // Auto-detect (recommended)
1947
- * const vad = createSileroVAD({ modelUrl: '/models/silero-vad.onnx' });
1948
- *
1949
- * // Force Worker
1950
- * const vadWorker = createSileroVAD({ modelUrl: '/models/silero-vad.onnx', useWorker: true });
1951
- *
1952
- * // Force main thread
1953
- * const vadMain = createSileroVAD({ modelUrl: '/models/silero-vad.onnx', useWorker: false });
1954
- * ```
1955
- */
1956
- declare function createSileroVAD(config?: SileroVADFactoryConfig): SileroVADBackend;
1957
-
1958
- /**
1959
- * SpeechListener — Standalone listening primitive.
1960
- *
1961
- * Composes: MicrophoneCapture → SileroVAD → SenseVoice ASR → transcript events.
1962
- * Extracted from VoicePipeline's listening half so it can be used independently.
1963
- *
1964
- * Does NOT handle TTS, LAM, or response routing — those belong to TTSSpeaker
1965
- * and VoicePipeline respectively.
1966
- *
1967
- * @category Audio
1539
+ * @category Audio
1968
1540
  */
1969
1541
 
1970
1542
  interface SpeechListenerConfig {
@@ -1981,6 +1553,7 @@ interface SpeechListenerConfig {
1981
1553
  modelUrl: string;
1982
1554
  tokensUrl?: string;
1983
1555
  language?: string;
1556
+ textNorm?: 'with_itn' | 'without_itn';
1984
1557
  };
1985
1558
  vad: {
1986
1559
  modelUrl: string;
@@ -2035,6 +1608,7 @@ declare class SpeechListener extends EventEmitter<SpeechListenerEvents> {
2035
1608
  private asr;
2036
1609
  private vad;
2037
1610
  private ownedWorker;
1611
+ private usesSharedWorker;
2038
1612
  private mic;
2039
1613
  private omoteEvents;
2040
1614
  private _unsubChunk;
@@ -2164,240 +1738,80 @@ declare class InterruptionHandler extends EventEmitter<InterruptionEvents> {
2164
1738
  }
2165
1739
 
2166
1740
  /**
2167
- * SenseVoice ASR Web Worker implementation
2168
- *
2169
- * Runs SenseVoice speech recognition in a dedicated Web Worker to prevent
2170
- * main thread blocking. Uses inline worker script (Blob URL pattern) to
2171
- * avoid separate file deployment.
2172
- *
2173
- * Key design decisions:
2174
- * - WASM backend only (WebGPU doesn't work in Workers)
2175
- * - All preprocessing (fbank, LFR, CMVN) and CTC decoding inlined in worker
2176
- * - Audio copied (not transferred) to retain main thread access
2177
- * - ONNX Runtime loaded from CDN in worker (no bundler complications)
2178
- * - iOS: model URL passed as string to ORT (avoids 239MB JS heap allocation)
1741
+ * Factory function for SenseVoice ASR via UnifiedInferenceWorker
2179
1742
  *
2180
1743
  * @category Inference
2181
1744
  *
2182
- * @example Basic usage
1745
+ * @example
2183
1746
  * ```typescript
2184
- * import { SenseVoiceWorker } from '@omote/core';
1747
+ * import { createSenseVoice, UnifiedInferenceWorker } from '@omote/core';
2185
1748
  *
2186
- * const asr = new SenseVoiceWorker({
1749
+ * const worker = new UnifiedInferenceWorker();
1750
+ * await worker.init();
1751
+ *
1752
+ * const asr = createSenseVoice({
2187
1753
  * modelUrl: '/models/sensevoice/model.int8.onnx',
2188
- * tokensUrl: '/models/sensevoice/tokens.txt',
1754
+ * unifiedWorker: worker,
2189
1755
  * });
2190
1756
  * await asr.load();
2191
- *
2192
- * const { text, emotion, language } = await asr.transcribe(audioSamples);
2193
- * console.log(text); // "Hello world"
2194
- * console.log(emotion); // "NEUTRAL"
2195
- * console.log(language); // "en"
1757
+ * const { text, emotion } = await asr.transcribe(audioSamples);
2196
1758
  * ```
2197
1759
  */
2198
1760
 
2199
1761
  /**
2200
- * Configuration for SenseVoice Worker
1762
+ * Configuration for the SenseVoice factory
2201
1763
  */
2202
- interface SenseVoiceWorkerConfig {
2203
- /** Path or URL to model.int8.onnx (239MB) */
2204
- modelUrl: string;
1764
+ interface CreateSenseVoiceConfig extends InferenceFactoryConfig {
1765
+ /** Path or URL to model.int8.onnx (239MB). Default: HuggingFace CDN */
1766
+ modelUrl?: string;
2205
1767
  /** Path or URL to tokens.txt vocabulary file (default: sibling of modelUrl) */
2206
1768
  tokensUrl?: string;
2207
- /** Language hint (default: 'auto' for auto-detection) */
1769
+ /** Language hint (default: 'auto') */
2208
1770
  language?: SenseVoiceLanguage;
2209
- /** Text normalization: 'with_itn' applies inverse text normalization (default: 'with_itn') */
1771
+ /** Text normalization (default: 'with_itn') */
2210
1772
  textNorm?: 'with_itn' | 'without_itn';
2211
1773
  }
2212
1774
  /**
2213
- * SenseVoice ASR Worker - Speech Recognition in a Web Worker
1775
+ * Create a SenseVoice ASR instance via the unified worker.
2214
1776
  *
2215
- * Runs SenseVoice inference off the main thread to prevent UI blocking.
2216
- * All preprocessing (fbank, LFR, CMVN) and CTC decoding run in the worker.
1777
+ * If no `unifiedWorker` is provided, a dedicated worker is created on load().
2217
1778
  *
2218
- * @see SenseVoiceInference for main-thread version
1779
+ * @param config - Factory configuration
1780
+ * @returns A SenseVoiceBackend instance
2219
1781
  */
2220
- declare class SenseVoiceWorker {
2221
- private worker;
2222
- private config;
2223
- private isLoading;
2224
- private _isLoaded;
2225
- private inferenceQueue;
2226
- private poisoned;
2227
- private pendingResolvers;
2228
- private languageId;
2229
- private textNormId;
2230
- constructor(config: SenseVoiceWorkerConfig);
2231
- get isLoaded(): boolean;
2232
- /**
2233
- * Backend type (always 'wasm' for Worker, WebGPU not supported in Workers)
2234
- */
2235
- get backend(): 'wasm' | null;
2236
- /**
2237
- * Create the worker from inline script
2238
- */
2239
- private createWorker;
2240
- /**
2241
- * Handle messages from worker
2242
- */
2243
- private handleWorkerMessage;
2244
- /**
2245
- * Send message to worker and wait for response
2246
- */
2247
- private sendMessage;
2248
- /**
2249
- * Load the ONNX model in the worker
2250
- *
2251
- * @param onProgress - Optional progress callback. Fires once at 100% when load completes
2252
- * (the worker downloads and loads the model internally, so granular progress is not available).
2253
- */
2254
- load(onProgress?: (loaded: number, total: number) => void): Promise<SenseVoiceModelInfo>;
2255
- /**
2256
- * Transcribe audio samples to text
2257
- *
2258
- * @param audioSamples Float32Array of audio samples at 16kHz, [-1, 1] range
2259
- * @returns Transcription result with text, emotion, language, and event
2260
- */
2261
- transcribe(audioSamples: Float32Array): Promise<SenseVoiceResult>;
2262
- /**
2263
- * Queue inference to serialize worker calls
2264
- */
2265
- private queueInference;
2266
- /**
2267
- * Dispose of the worker and free resources
2268
- */
2269
- dispose(): Promise<void>;
2270
- /**
2271
- * Check if Web Workers are supported
2272
- */
2273
- static isSupported(): boolean;
2274
- }
1782
+ declare function createSenseVoice(config?: CreateSenseVoiceConfig): SenseVoiceBackend;
2275
1783
 
2276
1784
  /**
2277
1785
  * Shared blendshape constants and utilities for lip sync inference
2278
1786
  *
2279
1787
  * Contains ARKIT_BLENDSHAPES (canonical 52-blendshape ordering), symmetrization,
2280
- * and interpolation utilities used by A2EInference and all renderer adapters.
2281
- *
2282
- * This module is the single source of truth for blendshape ordering to
2283
- * avoid circular dependencies between inference classes.
2284
- *
2285
- * @category Inference
2286
- */
2287
- /**
2288
- * ARKit blendshape names in alphabetical order (52 total)
2289
- * This is the canonical ordering used by all A2E models in the SDK.
2290
- */
2291
- declare const ARKIT_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
2292
- /** @deprecated Use ARKIT_BLENDSHAPES instead */
2293
- declare const LAM_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
2294
- /**
2295
- * Linearly interpolate between two blendshape weight arrays.
2296
- *
2297
- * Pure math utility with zero renderer dependency — used by all renderer
2298
- * adapters (@omote/three, @omote/babylon, @omote/r3f) for smooth frame
2299
- * transitions.
2300
- *
2301
- * @param current - Current blendshape weights
2302
- * @param target - Target blendshape weights
2303
- * @param factor - Interpolation factor (0 = no change, 1 = snap to target). Default: 0.3
2304
- * @returns Interpolated weights as number[]
2305
- */
2306
- declare function lerpBlendshapes(current: Float32Array | number[], target: Float32Array | number[], factor?: number): number[];
2307
-
2308
- /**
2309
- * A2E inference engine for Audio-to-Expression (LAM model)
2310
- *
2311
- * Runs entirely in the browser using WebGPU or WASM.
2312
- * Takes raw 16kHz audio and outputs 52 ARKit blendshapes for lip sync.
2313
- * Uses the LAM (Large Animation Model) — see {@link A2EBackend} for the interface.
2314
- *
2315
- * @see {@link createA2E} for the recommended zero-config factory
2316
- * @see {@link A2EBackend} for the common interface
2317
- * @category Inference
2318
- *
2319
- * @example Basic usage
2320
- * ```typescript
2321
- * import { A2EInference } from '@omote/core';
2322
- *
2323
- * const a2e = new A2EInference({ modelUrl: '/models/lam.onnx' });
2324
- * await a2e.load();
2325
- *
2326
- * // Process 1 second of audio (16kHz = 16000 samples)
2327
- * const result = await a2e.infer(audioSamples);
2328
- * console.log('Blendshapes:', result.blendshapes); // [30, 52] for 30fps
2329
- * ```
2330
- */
2331
-
2332
- interface A2EInferenceConfig {
2333
- /** Path or URL to the ONNX model */
2334
- modelUrl: string;
2335
- /**
2336
- * Path or URL to external model data file (.onnx.data weights).
2337
- * Default: `${modelUrl}.data` (e.g., /models/model.onnx.data)
2338
- *
2339
- * Set to `false` to skip external data loading (single-file models only).
2340
- */
2341
- externalDataUrl?: string | false;
2342
- /** Preferred backend (auto will try WebGPU first, fallback to WASM) */
2343
- backend?: BackendPreference;
2344
- /** Number of identity classes (default: 12 for streaming model) */
2345
- numIdentityClasses?: number;
2346
- /**
2347
- * Number of audio samples per inference chunk (default: 16000).
2348
- * Model supports variable chunk sizes. Smaller chunks = lower latency,
2349
- * more inference overhead. 8000 (500ms) is recommended for real-time lip sync.
2350
- */
2351
- chunkSize?: number;
2352
- }
2353
-
2354
- declare class A2EInference implements A2EBackend {
2355
- readonly modelId: "a2e";
2356
- private session;
2357
- private ort;
2358
- private config;
2359
- private _backend;
2360
- private isLoading;
2361
- private numIdentityClasses;
2362
- readonly chunkSize: number;
2363
- private inferenceQueue;
2364
- private poisoned;
2365
- private static readonly INFERENCE_TIMEOUT_MS;
2366
- constructor(config: A2EInferenceConfig);
2367
- /**
2368
- * Check if WebGPU is available and working
2369
- * (iOS returns false even if navigator.gpu exists due to ONNX Runtime bugs)
2370
- */
2371
- static isWebGPUAvailable: typeof isWebGPUAvailable;
2372
- get backend(): 'webgpu' | 'wasm' | null;
2373
- get isLoaded(): boolean;
2374
- /** True if inference timed out and the session is permanently unusable */
2375
- get isSessionPoisoned(): boolean;
2376
- /**
2377
- * Load the ONNX model
2378
- */
2379
- load(): Promise<A2EModelInfo>;
2380
- /**
2381
- * Run inference on raw audio
2382
- * @param audioSamples - Float32Array of raw audio at 16kHz
2383
- * @param identityIndex - Optional identity index (0-11, default 0 = neutral)
2384
- *
2385
- * Audio will be zero-padded or truncated to chunkSize samples.
2386
- */
2387
- infer(audioSamples: Float32Array, identityIndex?: number): Promise<A2EResult>;
2388
- /**
2389
- * Queue inference to serialize ONNX session calls
2390
- */
2391
- private queueInference;
2392
- /**
2393
- * Get blendshape value by name for a specific frame
2394
- */
2395
- getBlendshape(blendshapes: Float32Array, name: typeof LAM_BLENDSHAPES[number]): number;
2396
- /**
2397
- * Dispose of the model and free resources
2398
- */
2399
- dispose(): Promise<void>;
2400
- }
1788
+ * and interpolation utilities used by A2EInference and all renderer adapters.
1789
+ *
1790
+ * This module is the single source of truth for blendshape ordering to
1791
+ * avoid circular dependencies between inference classes.
1792
+ *
1793
+ * @category Inference
1794
+ */
1795
+ /**
1796
+ * ARKit blendshape names in alphabetical order (52 total)
1797
+ * This is the canonical ordering used by all A2E models in the SDK.
1798
+ */
1799
+ declare const ARKIT_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
1800
+ /** @deprecated Use ARKIT_BLENDSHAPES instead */
1801
+ declare const LAM_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
1802
+ /**
1803
+ * Linearly interpolate between two blendshape weight arrays.
1804
+ *
1805
+ * Pure math utility with zero renderer dependency — used by all renderer
1806
+ * adapters (@omote/three, @omote/babylon, @omote/r3f) for smooth frame
1807
+ * transitions.
1808
+ *
1809
+ * @param current - Current blendshape weights
1810
+ * @param target - Target blendshape weights
1811
+ * @param factor - Interpolation factor (0 = no change, 1 = snap to target). Default: 0.3
1812
+ * @returns Interpolated weights as number[]
1813
+ */
1814
+ declare function lerpBlendshapes(current: Float32Array | number[], target: Float32Array | number[], factor?: number): number[];
2401
1815
 
2402
1816
  /**
2403
1817
  * Default and user-configurable model URLs for all ONNX models
@@ -2434,7 +1848,7 @@ type ModelUrlKey = 'lam' | 'senseVoice' | 'sileroVad' | 'kokoroTTS' | 'kokoroVoi
2434
1848
  * Resolved model URLs — user overrides take priority, HuggingFace CDN is fallback.
2435
1849
  *
2436
1850
  * All SDK factories (`createA2E`, `createSenseVoice`, `createSileroVAD`) and
2437
- * orchestrators (`VoicePipeline`) read from this object. Call
1851
+ * orchestrators (`VoiceOrchestrator`) read from this object. Call
2438
1852
  * {@link configureModelUrls} before constructing any pipelines to point
2439
1853
  * models at your own CDN.
2440
1854
  */
@@ -2704,6 +2118,44 @@ declare class BlendshapeSmoother {
2704
2118
  reset(): void;
2705
2119
  }
2706
2120
 
2121
+ /**
2122
+ * Factory function for Silero VAD via UnifiedInferenceWorker
2123
+ *
2124
+ * @category Inference
2125
+ *
2126
+ * @example
2127
+ * ```typescript
2128
+ * import { createSileroVAD, UnifiedInferenceWorker } from '@omote/core';
2129
+ *
2130
+ * const worker = new UnifiedInferenceWorker();
2131
+ * await worker.init();
2132
+ *
2133
+ * const vad = createSileroVAD({
2134
+ * modelUrl: '/models/silero-vad.onnx',
2135
+ * unifiedWorker: worker,
2136
+ * });
2137
+ * await vad.load();
2138
+ * const result = await vad.process(audioChunk);
2139
+ * ```
2140
+ */
2141
+
2142
+ /**
2143
+ * Configuration for the Silero VAD factory
2144
+ */
2145
+ interface SileroVADFactoryConfig extends Omit<SileroVADConfig, 'modelUrl'>, InferenceFactoryConfig {
2146
+ /** Path or URL to the ONNX model. Default: HuggingFace CDN */
2147
+ modelUrl?: string;
2148
+ }
2149
+ /**
2150
+ * Create a Silero VAD instance via the unified worker.
2151
+ *
2152
+ * If no `unifiedWorker` is provided, a dedicated worker is created on load().
2153
+ *
2154
+ * @param config - Factory configuration
2155
+ * @returns A SileroVADBackend instance
2156
+ */
2157
+ declare function createSileroVAD(config?: SileroVADFactoryConfig): SileroVADBackend;
2158
+
2707
2159
  /**
2708
2160
  * SenseVoice adapter backed by UnifiedInferenceWorker
2709
2161
  *
@@ -2762,34 +2214,9 @@ declare class A2EUnifiedAdapter implements A2EBackend {
2762
2214
  }
2763
2215
 
2764
2216
  /**
2765
- * Kokoro TTS inference using ONNX Runtime Web
2766
- *
2767
- * Pure ONNX pipeline for browser-based text-to-speech. No transformers.js dependency.
2768
- * Uses eSpeak-NG WASM for phonemization and Kokoro-82M (q8, 92MB) for synthesis.
2769
- *
2770
- * Pipeline: Text → Normalize → Phonemize (eSpeak WASM) → Tokenize → Voice Style → ONNX → Audio
2217
+ * Kokoro TTS type definitions
2771
2218
  *
2772
2219
  * @category Inference
2773
- *
2774
- * @example Basic usage
2775
- * ```typescript
2776
- * import { KokoroTTSInference } from '@omote/core';
2777
- *
2778
- * const tts = new KokoroTTSInference({ defaultVoice: 'af_heart' });
2779
- * await tts.load();
2780
- *
2781
- * const { audio, duration } = await tts.synthesize("Hello world");
2782
- * // audio: Float32Array @ 24kHz
2783
- * ```
2784
- *
2785
- * @example Streaming (sentence-by-sentence)
2786
- * ```typescript
2787
- * for await (const chunk of tts.stream("First sentence. Second sentence.")) {
2788
- * playbackPipeline.feedBuffer(chunk.audio);
2789
- * }
2790
- * ```
2791
- *
2792
- * @module inference/KokoroTTSInference
2793
2220
  */
2794
2221
 
2795
2222
  interface KokoroTTSConfig {
@@ -2803,6 +2230,8 @@ interface KokoroTTSConfig {
2803
2230
  backend?: BackendPreference;
2804
2231
  /** Speech speed multiplier (default: 1.0) */
2805
2232
  speed?: number;
2233
+ /** Eagerly load phonemizer + default voice during load() instead of first speak(). Default: true. */
2234
+ eagerLoad?: boolean;
2806
2235
  }
2807
2236
  interface KokoroTTSResult {
2808
2237
  /** Audio samples at 24kHz */
@@ -2841,67 +2270,6 @@ interface SynthesizeOptions {
2841
2270
  * Returns trimmed text on success, throws on invalid input.
2842
2271
  */
2843
2272
  declare function validateTTSInput(text: unknown, voiceName: string, speed: number, availableVoices?: string[]): string;
2844
- declare class KokoroTTSInference implements TTSBackend {
2845
- private readonly config;
2846
- private readonly modelUrl;
2847
- private readonly voiceBaseUrl;
2848
- private ort;
2849
- private session;
2850
- private _backend;
2851
- private isLoading;
2852
- private poisoned;
2853
- private inferenceQueue;
2854
- private phonemizerReady;
2855
- private defaultVoiceLoaded;
2856
- /** Cached voice data (voice name → Float32Array) */
2857
- private loadedVoices;
2858
- constructor(config?: KokoroTTSConfig);
2859
- get isLoaded(): boolean;
2860
- get sampleRate(): number;
2861
- /**
2862
- * Load the ONNX model, phonemizer WASM, and default voice.
2863
- * Safe to call multiple times (no-ops after first successful load).
2864
- */
2865
- load(): Promise<KokoroTTSModelInfo>;
2866
- /**
2867
- * Lazily initialize phonemizer and default voice on first use.
2868
- * Moves 100-200ms of main-thread blocking out of load() into first synthesis.
2869
- */
2870
- private ensureReady;
2871
- /**
2872
- * Synthesize speech from text (one-shot, full audio output).
2873
- *
2874
- * @param text - Input text to synthesize
2875
- * @param options - Voice and speed overrides
2876
- * @returns Audio Float32Array at 24kHz with duration
2877
- */
2878
- synthesize(text: string, options?: SynthesizeOptions): Promise<KokoroTTSResult>;
2879
- /**
2880
- * Stream synthesis sentence-by-sentence (async generator).
2881
- * Splits text on sentence boundaries and yields audio for each.
2882
- *
2883
- * Compatible with both `SynthesizeOptions` (legacy) and `TTSStreamOptions` (TTSBackend).
2884
- *
2885
- * @param text - Input text (can be multiple sentences)
2886
- * @param options - Voice, speed, and abort signal overrides
2887
- */
2888
- stream(text: string, options?: SynthesizeOptions & TTSStreamOptions): AsyncGenerator<KokoroStreamChunk & TTSChunk>;
2889
- /**
2890
- * Preload a voice (fetches and caches the .bin file).
2891
- */
2892
- preloadVoice(voiceName: string): Promise<void>;
2893
- /**
2894
- * List available voice names.
2895
- */
2896
- listVoices(): string[];
2897
- /**
2898
- * Release the ONNX session and clear cached voices.
2899
- */
2900
- dispose(): Promise<void>;
2901
- private ensureVoice;
2902
- private queueInference;
2903
- private runInference;
2904
- }
2905
2273
 
2906
2274
  /**
2907
2275
  * Kokoro TTS adapter backed by UnifiedInferenceWorker
@@ -2917,6 +2285,7 @@ declare class KokoroTTSUnifiedAdapter implements TTSBackend {
2917
2285
  private readonly modelUrl;
2918
2286
  private readonly voiceBaseUrl;
2919
2287
  private _isLoaded;
2288
+ private _backend;
2920
2289
  private loadedGeneration;
2921
2290
  /** Per-adapter inference queue — ensures sequential state updates. */
2922
2291
  private inferenceQueue;
@@ -3184,102 +2553,15 @@ declare class SafariSpeechRecognition {
3184
2553
  }
3185
2554
 
3186
2555
  /**
3187
- * Kokoro TTS Web Worker implementation
3188
- *
3189
- * Moves the heavy ONNX `session.run()` to a dedicated Web Worker to prevent
3190
- * main thread blocking (~1-2s per sentence on WASM). Phonemizer, tokenizer,
3191
- * and voice logic stay on the main thread (fast, <10ms combined).
3192
- *
3193
- * Architecture:
3194
- * ```
3195
- * Main Thread (KokoroTTSWorker): Worker (WORKER_SCRIPT):
3196
- * stream(text) →
3197
- * splitSentences(text)
3198
- * for each sentence:
3199
- * phonemize(sentence) → phonemes
3200
- * tokenize(phonemes) → tokens
3201
- * ensureVoice() → style
3202
- * postMessage(tokens, style, speed) ──→ session.run(feeds)
3203
- * await result ←── postMessage(audio)
3204
- * yield {audio, text, phonemes, duration}
3205
- * ```
3206
- *
3207
- * @category Inference
3208
- *
3209
- * @example Basic usage
3210
- * ```typescript
3211
- * import { KokoroTTSWorker } from '@omote/core';
3212
- *
3213
- * const tts = new KokoroTTSWorker({ defaultVoice: 'af_heart' });
3214
- * await tts.load();
3215
- *
3216
- * for await (const chunk of tts.stream("Hello world!")) {
3217
- * playbackPipeline.feedBuffer(chunk.audio);
3218
- * }
3219
- * ```
3220
- *
3221
- * @module inference/KokoroTTSWorker
3222
- */
3223
-
3224
- /**
3225
- * Kokoro TTS Worker — off-main-thread ONNX inference for non-blocking TTS.
3226
- *
3227
- * Phonemizer/tokenizer/voice logic run on the main thread (fast, <10ms).
3228
- * Only the heavy ONNX `session.run()` is delegated to the worker.
3229
- *
3230
- * Implements the same TTSBackend interface as KokoroTTSInference.
3231
- *
3232
- * @see KokoroTTSInference for main-thread version
3233
- */
3234
- declare class KokoroTTSWorker implements TTSBackend {
3235
- private readonly config;
3236
- private readonly modelUrl;
3237
- private readonly voiceBaseUrl;
3238
- private worker;
3239
- private _isLoaded;
3240
- private isLoading;
3241
- private poisoned;
3242
- /** Serializes all worker calls (stream sentence chunks + synthesize) */
3243
- private inferenceQueue;
3244
- /** Cached voice data (voice name → Float32Array) */
3245
- private loadedVoices;
3246
- /** Pending message handlers */
3247
- private pendingResolvers;
3248
- constructor(config?: KokoroTTSConfig);
3249
- get isLoaded(): boolean;
3250
- get sampleRate(): number;
3251
- load(): Promise<KokoroTTSModelInfo>;
3252
- synthesize(text: string, options?: SynthesizeOptions): Promise<KokoroTTSResult>;
3253
- stream(text: string, options?: SynthesizeOptions & TTSStreamOptions): AsyncGenerator<KokoroStreamChunk & TTSChunk>;
3254
- preloadVoice(voiceName: string): Promise<void>;
3255
- listVoices(): string[];
3256
- dispose(): Promise<void>;
3257
- static isSupported(): boolean;
3258
- private ensureVoice;
3259
- private createWorker;
3260
- private handleWorkerMessage;
3261
- private sendMessage;
3262
- /**
3263
- * Queue worker inference through the serialization queue.
3264
- * Sends pre-computed tokens + style to worker, returns audio.
3265
- */
3266
- private runWorkerInference;
3267
- /**
3268
- * One-shot synthesis (phonemize + tokenize + worker inference).
3269
- */
3270
- private queueInference;
3271
- }
3272
-
3273
- /**
3274
- * Factory function for Kokoro TTS with automatic Worker vs main thread selection
2556
+ * Factory function for Kokoro TTS via UnifiedInferenceWorker
3275
2557
  *
3276
- * Provides a unified API that automatically selects the optimal implementation:
3277
- * - Desktop: Uses KokoroTTSWorker (off-main-thread inference, no render hitching)
3278
- * - iOS: Uses KokoroTTSInference (main thread, shared ORT instance to avoid OOM)
2558
+ * When called without a `unifiedWorker`, a dedicated worker is created
2559
+ * automatically on the first `load()` call. Pass a shared worker when using
2560
+ * VoiceOrchestrator or multiple models to avoid extra WASM instances.
3279
2561
  *
3280
2562
  * @category Inference
3281
2563
  *
3282
- * @example Auto-detect (recommended)
2564
+ * @example Standalone (auto-creates worker)
3283
2565
  * ```typescript
3284
2566
  * import { createKokoroTTS } from '@omote/core';
3285
2567
  *
@@ -3291,14 +2573,9 @@ declare class KokoroTTSWorker implements TTSBackend {
3291
2573
  * }
3292
2574
  * ```
3293
2575
  *
3294
- * @example Force worker
2576
+ * @example With shared worker
3295
2577
  * ```typescript
3296
- * const tts = createKokoroTTS({ defaultVoice: 'af_heart', useWorker: true });
3297
- * ```
3298
- *
3299
- * @example Force main thread
3300
- * ```typescript
3301
- * const tts = createKokoroTTS({ defaultVoice: 'af_heart', useWorker: false });
2578
+ * const tts = createKokoroTTS({ defaultVoice: 'af_heart', unifiedWorker: worker });
3302
2579
  * ```
3303
2580
  */
3304
2581
 
@@ -3308,10 +2585,12 @@ declare class KokoroTTSWorker implements TTSBackend {
3308
2585
  interface CreateKokoroTTSConfig extends KokoroTTSConfig, InferenceFactoryConfig {
3309
2586
  }
3310
2587
  /**
3311
- * Create a Kokoro TTS instance with automatic implementation selection.
2588
+ * Create a Kokoro TTS instance via the unified worker.
2589
+ *
2590
+ * If no `unifiedWorker` is provided, a dedicated worker is created on load().
3312
2591
  *
3313
2592
  * @param config - Factory configuration
3314
- * @returns A TTSBackend instance (either Worker or main thread)
2593
+ * @returns A TTSBackend instance
3315
2594
  */
3316
2595
  declare function createKokoroTTS(config?: CreateKokoroTTSConfig): TTSBackend;
3317
2596
 
@@ -3360,7 +2639,7 @@ declare function listVoices(): string[];
3360
2639
  * ElevenLabs TTS Backend — Cloud text-to-speech via ElevenLabs REST API.
3361
2640
  *
3362
2641
  * Implements the TTSBackend interface so it can be used anywhere Kokoro TTS is used
3363
- * (TTSPlayback, TTSSpeaker, VoicePipeline, PlaybackPipeline, etc.)
2642
+ * (TTSPlayback, TTSSpeaker, VoiceOrchestrator, PlaybackPipeline, etc.)
3364
2643
  *
3365
2644
  * Zero external dependencies — uses fetch() directly.
3366
2645
  *
@@ -3438,141 +2717,6 @@ declare class ElevenLabsTTSBackend implements TTSBackend {
3438
2717
  private getHttpErrorMessage;
3439
2718
  }
3440
2719
 
3441
- /**
3442
- * AWS Polly TTS Backend — Cloud text-to-speech via consumer-provided AWS SDK call.
3443
- *
3444
- * Implements the TTSBackend interface. Keeps @omote/core free of AWS SDK dependencies
3445
- * by delegating the actual Polly API call to a consumer-provided function.
3446
- *
3447
- * @category Inference
3448
- *
3449
- * @example Basic usage with AWS SDK v3
3450
- * ```typescript
3451
- * import { PollyTTSBackend } from '@omote/core';
3452
- * import { PollyClient, SynthesizeSpeechCommand } from '@aws-sdk/client-polly';
3453
- *
3454
- * const polly = new PollyClient({ region: 'us-east-1' });
3455
- *
3456
- * const tts = new PollyTTSBackend({
3457
- * synthesizeFn: async (text, voice, sampleRate) => {
3458
- * const cmd = new SynthesizeSpeechCommand({
3459
- * Text: text,
3460
- * VoiceId: voice,
3461
- * Engine: 'neural',
3462
- * OutputFormat: 'pcm',
3463
- * SampleRate: String(sampleRate),
3464
- * });
3465
- * const result = await polly.send(cmd);
3466
- * const stream = result.AudioStream;
3467
- * // Convert stream to ArrayBuffer (Node or browser)
3468
- * const chunks: Uint8Array[] = [];
3469
- * for await (const chunk of stream as AsyncIterable<Uint8Array>) {
3470
- * chunks.push(chunk);
3471
- * }
3472
- * const totalLength = chunks.reduce((sum, c) => sum + c.length, 0);
3473
- * const merged = new Uint8Array(totalLength);
3474
- * let offset = 0;
3475
- * for (const chunk of chunks) {
3476
- * merged.set(chunk, offset);
3477
- * offset += chunk.length;
3478
- * }
3479
- * return {
3480
- * audio: merged.buffer,
3481
- * contentType: result.ContentType ?? 'audio/pcm',
3482
- * };
3483
- * },
3484
- * });
3485
- *
3486
- * await tts.load();
3487
- * for await (const chunk of tts.stream("Hello world!")) {
3488
- * playbackPipeline.feedBuffer(chunk.audio);
3489
- * }
3490
- * ```
3491
- */
3492
-
3493
- /**
3494
- * Result from the consumer-provided synthesize function.
3495
- */
3496
- interface PollySynthesizeResult {
3497
- /** Raw PCM audio bytes (Int16 LE) */
3498
- audio: ArrayBuffer;
3499
- /** Content type from Polly response (e.g., 'audio/pcm') */
3500
- contentType: string;
3501
- }
3502
- /**
3503
- * Configuration for PollyTTSBackend.
3504
- *
3505
- * The `synthesizeFn` callback lets consumers use their own AWS SDK setup
3506
- * (credentials, region, SDK version) without @omote/core depending on `@aws-sdk/client-polly`.
3507
- */
3508
- interface PollyConfig {
3509
- /**
3510
- * Consumer-provided function that calls AWS Polly.
3511
- * Must return PCM audio (Int16 LE) at the requested sample rate.
3512
- *
3513
- * @param text - Text to synthesize
3514
- * @param voice - Polly voice ID (e.g., 'Joanna')
3515
- * @param sampleRate - Requested output sample rate (e.g., 16000)
3516
- * @returns PCM audio buffer and content type
3517
- */
3518
- synthesizeFn: (text: string, voice: string, sampleRate: number) => Promise<PollySynthesizeResult>;
3519
- /** Polly voice ID (default: 'Joanna') */
3520
- voice?: string;
3521
- /** Output sample rate in Hz (default: 16000) */
3522
- sampleRate?: number;
3523
- /** Polly engine type (default: 'neural') */
3524
- engine?: 'neural' | 'standard' | 'generative' | 'long-form';
3525
- }
3526
- declare class PollyTTSBackend implements TTSBackend {
3527
- private readonly synthesizeFn;
3528
- private readonly voice;
3529
- private readonly _sampleRate;
3530
- private readonly engine;
3531
- private _isLoaded;
3532
- constructor(config: PollyConfig);
3533
- get sampleRate(): number;
3534
- get isLoaded(): boolean;
3535
- /**
3536
- * No-op for cloud TTS (no model to load).
3537
- * Marks backend as ready.
3538
- */
3539
- load(): Promise<void>;
3540
- /**
3541
- * Synthesize audio via consumer's Polly function.
3542
- *
3543
- * Polly's SynthesizeSpeech is request/response (not streaming for PCM),
3544
- * so this yields a single chunk per call. For long text, consider splitting
3545
- * into sentences on the consumer side.
3546
- */
3547
- stream(text: string, options?: TTSStreamOptions): AsyncGenerator<TTSChunk>;
3548
- dispose(): Promise<void>;
3549
- }
3550
-
3551
- /**
3552
- * ORT CDN configuration
3553
- *
3554
- * Allows consumers to override the CDN base URL used for loading
3555
- * ONNX Runtime WASM/WebGPU binaries. By default, ORT loads from
3556
- * its bundled CDN path. Use {@link configureOrtCdn} to point at
3557
- * a self-hosted or enterprise CDN.
3558
- *
3559
- * @category Inference
3560
- */
3561
- /**
3562
- * Override the CDN base URL for ONNX Runtime WASM/WebGPU binaries.
3563
- *
3564
- * Must be an HTTPS URL or a relative path (starts with `/` or `./`).
3565
- * Call this once at app startup, before loading any models.
3566
- *
3567
- * @param cdnPath - HTTPS URL or relative path to ORT binaries directory
3568
- * @throws If cdnPath is not HTTPS or a relative path
3569
- */
3570
- declare function configureOrtCdn(cdnPath: string): void;
3571
- /**
3572
- * Get the current ORT CDN base URL override, or null if using defaults.
3573
- */
3574
- declare function getOrtCdnBase(): string | null;
3575
-
3576
2720
  /**
3577
2721
  * Emotion - Helper for creating emotion vectors for avatar animation
3578
2722
  *
@@ -3987,13 +3131,90 @@ interface FetchWithCacheOptions {
3987
3131
  */
3988
3132
  declare function fetchWithCache(url: string, optionsOrProgress?: FetchWithCacheOptions | ((loaded: number, total: number) => void)): Promise<ArrayBuffer>;
3989
3133
  /**
3990
- * Preload models into cache without creating sessions
3134
+ * Preload models into cache without creating sessions
3135
+ */
3136
+ declare function preloadModels(urls: string[], onProgress?: (current: number, total: number, url: string) => void): Promise<void>;
3137
+ /**
3138
+ * Format bytes as human readable string
3139
+ */
3140
+ declare function formatBytes(bytes: number): string;
3141
+
3142
+ /**
3143
+ * Console Exporter
3144
+ *
3145
+ * Exports telemetry data to the browser console for development/debugging.
3146
+ *
3147
+ * @category Telemetry
3148
+ */
3149
+
3150
+ /**
3151
+ * Span data structure for export
3152
+ */
3153
+ interface SpanData {
3154
+ name: string;
3155
+ traceId: string;
3156
+ spanId: string;
3157
+ parentSpanId?: string;
3158
+ startTime: number;
3159
+ endTime: number;
3160
+ durationMs: number;
3161
+ /** Epoch timestamp in ms for OTLP export (start) */
3162
+ epochMs: number;
3163
+ /** Epoch timestamp in ms for OTLP export (end) */
3164
+ endEpochMs: number;
3165
+ status: 'ok' | 'error';
3166
+ attributes: SpanAttributes;
3167
+ error?: Error;
3168
+ }
3169
+ /**
3170
+ * Metric data structure for export
3171
+ */
3172
+ interface MetricData {
3173
+ name: string;
3174
+ type: 'counter' | 'histogram';
3175
+ value: number;
3176
+ attributes: Record<string, string | number | boolean>;
3177
+ timestamp: number;
3178
+ /** Histogram bucket data for OTLP export */
3179
+ histogramData?: {
3180
+ count: number;
3181
+ sum: number;
3182
+ min: number;
3183
+ max: number;
3184
+ bucketBoundaries: number[];
3185
+ bucketCounts: number[];
3186
+ };
3187
+ }
3188
+ /**
3189
+ * Exporter interface that all exporters must implement
3991
3190
  */
3992
- declare function preloadModels(urls: string[], onProgress?: (current: number, total: number, url: string) => void): Promise<void>;
3191
+ interface TelemetryExporterInterface {
3192
+ /** Export a completed span */
3193
+ exportSpan(span: SpanData): void;
3194
+ /** Export a metric */
3195
+ exportMetric(metric: MetricData): void;
3196
+ /** Flush any buffered data */
3197
+ flush(): Promise<void>;
3198
+ /** Shutdown the exporter */
3199
+ shutdown(): Promise<void>;
3200
+ }
3993
3201
  /**
3994
- * Format bytes as human readable string
3202
+ * Console exporter for development/debugging
3203
+ *
3204
+ * Outputs spans and metrics to the browser console with formatting.
3995
3205
  */
3996
- declare function formatBytes(bytes: number): string;
3206
+ declare class ConsoleExporter implements TelemetryExporterInterface {
3207
+ private enabled;
3208
+ private prefix;
3209
+ constructor(options?: {
3210
+ enabled?: boolean;
3211
+ prefix?: string;
3212
+ });
3213
+ exportSpan(span: SpanData): void;
3214
+ exportMetric(metric: MetricData): void;
3215
+ flush(): Promise<void>;
3216
+ shutdown(): Promise<void>;
3217
+ }
3997
3218
 
3998
3219
  /**
3999
3220
  * Telemetry Types
@@ -4046,6 +3267,8 @@ interface TelemetryConfig {
4046
3267
  metricsEnabled?: boolean;
4047
3268
  /** Metrics export interval in ms. Default: 60000 */
4048
3269
  metricsIntervalMs?: number;
3270
+ /** Custom exporter instance (overrides `exporter` when provided) */
3271
+ customExporter?: TelemetryExporterInterface;
4049
3272
  }
4050
3273
  /**
4051
3274
  * Span attributes for model operations
@@ -4118,7 +3341,7 @@ declare const MetricNames: {
4118
3341
  readonly CACHE_QUOTA_WARNING: "omote.cache.quota_warning";
4119
3342
  /** Counter: Cache eviction (LRU) */
4120
3343
  readonly CACHE_EVICTION: "omote.cache.eviction";
4121
- /** Histogram: VoicePipeline turn latency (speech end → transcript ready, excludes playback) */
3344
+ /** Histogram: Voice turn latency (speech end → transcript ready, excludes playback) */
4122
3345
  readonly VOICE_TURN_LATENCY: "omote.voice.turn.latency";
4123
3346
  /** Histogram: ASR transcription latency in ms */
4124
3347
  readonly VOICE_TRANSCRIPTION_LATENCY: "omote.voice.transcription.latency";
@@ -4146,20 +3369,9 @@ declare const MetricNames: {
4146
3369
  readonly COMPOSITOR_COMPOSE_LATENCY: "omote.compositor.compose.latency_us";
4147
3370
  /** Counter: Frames exceeding budget threshold */
4148
3371
  readonly AVATAR_FRAME_DROPS: "omote.avatar.frame.drops";
3372
+ /** Counter: Audio scheduling gaps (playback fell behind) */
3373
+ readonly AUDIO_SCHEDULE_GAP: "omote.audio.schedule_gap";
4149
3374
  };
4150
- /**
4151
- * Centralized error type taxonomy for structured error reporting.
4152
- */
4153
- declare const ErrorTypes: {
4154
- readonly INFERENCE: "inference_error";
4155
- readonly NETWORK: "network_error";
4156
- readonly TIMEOUT: "timeout";
4157
- readonly USER: "user_error";
4158
- readonly RUNTIME: "runtime_error";
4159
- readonly MEDIA: "media_error";
4160
- readonly MODEL: "model_error";
4161
- };
4162
- type ErrorType = typeof ErrorTypes[keyof typeof ErrorTypes];
4163
3375
  /**
4164
3376
  * Histogram buckets for inference latency (ms)
4165
3377
  */
@@ -4235,6 +3447,7 @@ declare function getTelemetry(): OmoteTelemetry | null;
4235
3447
  declare class OmoteTelemetry {
4236
3448
  private config;
4237
3449
  private exporter;
3450
+ private exporterReady;
4238
3451
  private activeTraceId;
4239
3452
  private metricsIntervalId;
4240
3453
  private spanStack;
@@ -4310,7 +3523,7 @@ declare class OmoteTelemetry {
4310
3523
  * });
4311
3524
  * ```
4312
3525
  */
4313
- recordHistogram(name: string, value: number, attributes?: Record<string, string | number | boolean>): void;
3526
+ recordHistogram(name: string, value: number, attributes?: Record<string, string | number | boolean>, bucketBoundaries?: number[]): void;
4314
3527
  /**
4315
3528
  * Generate unique key for metric with attributes
4316
3529
  */
@@ -4345,70 +3558,6 @@ declare class OmoteTelemetry {
4345
3558
  } | null;
4346
3559
  }
4347
3560
 
4348
- /**
4349
- * Console Exporter
4350
- *
4351
- * Exports telemetry data to the browser console for development/debugging.
4352
- *
4353
- * @category Telemetry
4354
- */
4355
-
4356
- /**
4357
- * Span data structure for export
4358
- */
4359
- interface SpanData {
4360
- name: string;
4361
- traceId: string;
4362
- spanId: string;
4363
- parentSpanId?: string;
4364
- startTime: number;
4365
- endTime: number;
4366
- durationMs: number;
4367
- status: 'ok' | 'error';
4368
- attributes: SpanAttributes;
4369
- error?: Error;
4370
- }
4371
- /**
4372
- * Metric data structure for export
4373
- */
4374
- interface MetricData {
4375
- name: string;
4376
- type: 'counter' | 'histogram';
4377
- value: number;
4378
- attributes: Record<string, string | number | boolean>;
4379
- timestamp: number;
4380
- }
4381
- /**
4382
- * Exporter interface that all exporters must implement
4383
- */
4384
- interface TelemetryExporterInterface {
4385
- /** Export a completed span */
4386
- exportSpan(span: SpanData): void;
4387
- /** Export a metric */
4388
- exportMetric(metric: MetricData): void;
4389
- /** Flush any buffered data */
4390
- flush(): Promise<void>;
4391
- /** Shutdown the exporter */
4392
- shutdown(): Promise<void>;
4393
- }
4394
- /**
4395
- * Console exporter for development/debugging
4396
- *
4397
- * Outputs spans and metrics to the browser console with formatting.
4398
- */
4399
- declare class ConsoleExporter implements TelemetryExporterInterface {
4400
- private enabled;
4401
- private prefix;
4402
- constructor(options?: {
4403
- enabled?: boolean;
4404
- prefix?: string;
4405
- });
4406
- exportSpan(span: SpanData): void;
4407
- exportMetric(metric: MetricData): void;
4408
- flush(): Promise<void>;
4409
- shutdown(): Promise<void>;
4410
- }
4411
-
4412
3561
  /**
4413
3562
  * OTLP Exporter
4414
3563
  *
@@ -4966,7 +4115,7 @@ declare class ProceduralLifeLayer {
4966
4115
  */
4967
4116
  update(delta: number, input?: LifeLayerInput): LifeLayerOutput;
4968
4117
  /**
4969
- * Write life layer output directly to a Float32Array[52] in LAM_BLENDSHAPES order.
4118
+ * Write life layer output directly to a Float32Array[52] in ARKIT_BLENDSHAPES order.
4970
4119
  *
4971
4120
  * Includes micro-jitter (0.4% amplitude simplex noise on all channels) to
4972
4121
  * break uncanny stillness on undriven channels.
@@ -5301,7 +4450,7 @@ declare class FaceCompositor {
5301
4450
  /**
5302
4451
  * Compose a single output frame from the 5-stage signal chain.
5303
4452
  *
5304
- * @param base - A2E raw output (Float32Array[52], LAM_BLENDSHAPES order)
4453
+ * @param base - A2E raw output (Float32Array[52], ARKIT_BLENDSHAPES order)
5305
4454
  * @param input - Per-frame input (deltaTime, emotion, life layer params)
5306
4455
  * @param target - Optional pre-allocated output buffer (avoids per-frame allocation).
5307
4456
  * When omitted, an internal buffer is used (valid until next compose() call).
@@ -5583,216 +4732,6 @@ declare class MicLipSync extends EventEmitter<MicLipSyncEvents> {
5583
4732
  private setState;
5584
4733
  }
5585
4734
 
5586
- /**
5587
- * VoicePipeline - Full conversational agent loop
5588
- *
5589
- * Composes: MicrophoneCapture → SileroVAD → SenseVoice ASR → PlaybackPipeline (A2E)
5590
- *
5591
- * State machine: idle → loading → ready → listening → thinking → speaking → listening → ...
5592
- *
5593
- * The consumer provides an `onResponse` callback that receives transcribed text
5594
- * and streams audio back for playback + lip sync. VoicePipeline is backend-agnostic.
5595
- *
5596
- * @category Orchestration
5597
- */
5598
-
5599
- /** Shared config options for all VoicePipeline modes */
5600
- interface VoicePipelineBaseConfig {
5601
- /** Pre-built backends — skip internal factory creation. Takes precedence over `models`. */
5602
- backends?: {
5603
- asr: SenseVoiceBackend;
5604
- lam: A2EBackend;
5605
- vad: SileroVADBackend;
5606
- tts?: TTSBackend;
5607
- };
5608
- /** External unified worker (reuse across pipelines). Takes precedence over internal creation. */
5609
- unifiedWorker?: UnifiedInferenceWorker;
5610
- /** URLs and options for model loading. Required if `backends` not provided. */
5611
- models?: {
5612
- senseVoice: {
5613
- modelUrl: string;
5614
- tokensUrl?: string;
5615
- language?: string;
5616
- };
5617
- lam: {
5618
- modelUrl: string;
5619
- externalDataUrl?: string | false;
5620
- backend?: 'auto' | 'webgpu' | 'wasm';
5621
- };
5622
- vad: {
5623
- modelUrl: string;
5624
- threshold?: number;
5625
- preSpeechBufferChunks?: number;
5626
- };
5627
- };
5628
- /** Per-character expression weight scaling */
5629
- profile?: ExpressionProfile;
5630
- /** Identity/style index for A2E model (default: 0) */
5631
- identityIndex?: number;
5632
- /** Base silence timeout in ms (default: 500) */
5633
- silenceTimeoutMs?: number;
5634
- /** Extended silence timeout for long utterances (default: 700) */
5635
- silenceTimeoutExtendedMs?: number;
5636
- /** Enable adaptive timeout based on speech duration (default: true) */
5637
- adaptiveTimeout?: boolean;
5638
- /** Minimum audio duration in seconds (default: 0.3) */
5639
- minAudioDurationSec?: number;
5640
- /** Minimum audio energy (default: 0.02) */
5641
- minAudioEnergy?: number;
5642
- /** Enable audio normalization for quiet audio (default: true) */
5643
- normalizeAudio?: boolean;
5644
- /** Progressive transcription interval — desktop (default: 500ms) */
5645
- progressiveIntervalMs?: number;
5646
- /** Progressive transcription interval — iOS (default: 800ms) */
5647
- progressiveIntervalIosMs?: number;
5648
- /** Coverage threshold to use progressive result (default: 0.8) */
5649
- progressiveCoverageThreshold?: number;
5650
- /** Minimum samples before progressive transcription starts (default: 8000) */
5651
- progressiveMinSamples?: number;
5652
- /** Timeout for individual SenseVoice.transcribe() calls (default: 10000ms) */
5653
- transcriptionTimeoutMs?: number;
5654
- /** Enable barge-in detection (default: true) */
5655
- interruptionEnabled?: boolean;
5656
- /** Minimum speech duration for interruption (default: 200ms) */
5657
- interruptionMinSpeechMs?: number;
5658
- /** Audio playback delay (default: auto-detected) */
5659
- audioDelayMs?: number;
5660
- /** Coalescer target duration (default: 200ms) */
5661
- chunkTargetMs?: number;
5662
- /** Enable neutral transition on playback complete (default: true) */
5663
- neutralTransitionEnabled?: boolean;
5664
- /** Duration of neutral fade-out (default: 250ms) */
5665
- neutralTransitionMs?: number;
5666
- }
5667
- /** Cloud TTS mode: consumer handles response + audio streaming */
5668
- interface VoicePipelineCloudConfig extends VoicePipelineBaseConfig {
5669
- mode: 'cloud';
5670
- /** Consumer's response handler (streams audio back) */
5671
- onResponse: ResponseHandler;
5672
- }
5673
- /** Local TTS mode: SDK handles synthesis internally via TTSBackend */
5674
- interface VoicePipelineLocalConfig extends VoicePipelineBaseConfig {
5675
- mode: 'local';
5676
- /**
5677
- * TTS backend (e.g., KokoroTTSInference). Provide either `tts` or `ttsConfig`.
5678
- *
5679
- * When `tts` is provided, VoicePipeline uses it as-is. On iOS, this means
5680
- * inference runs on the main thread (may cause UI freezes).
5681
- *
5682
- * Prefer `ttsConfig` for automatic unified worker integration on iOS.
5683
- */
5684
- tts?: TTSBackend;
5685
- /**
5686
- * Kokoro TTS configuration. When provided, VoicePipeline creates the TTS
5687
- * internally and passes the unified worker on iOS for off-main-thread inference.
5688
- *
5689
- * Takes precedence over `tts` if both are provided.
5690
- */
5691
- ttsConfig?: {
5692
- defaultVoice?: string;
5693
- speed?: number;
5694
- modelUrl?: string;
5695
- voiceBaseUrl?: string;
5696
- };
5697
- /** Optional text transform (e.g., LLM call). Receives transcript, returns response text. */
5698
- onTranscript?: (text: string) => string | Promise<string>;
5699
- }
5700
- type VoicePipelineConfig = VoicePipelineCloudConfig | VoicePipelineLocalConfig;
5701
- interface VoicePipelineEvents {
5702
- 'state': VoicePipelineState;
5703
- 'loading:progress': LoadingProgress;
5704
- 'transcript': TranscriptResult;
5705
- 'frame': FullFaceFrame;
5706
- 'frame:raw': Float32Array;
5707
- 'speech:start': void;
5708
- 'speech:end': {
5709
- durationMs: number;
5710
- };
5711
- 'playback:start': {
5712
- time: number;
5713
- };
5714
- 'playback:complete': void;
5715
- 'interruption': void;
5716
- 'audio:level': {
5717
- rms: number;
5718
- peak: number;
5719
- };
5720
- 'error': Error;
5721
- [key: string]: unknown;
5722
- }
5723
- declare class VoicePipeline extends EventEmitter<VoicePipelineEvents> {
5724
- private readonly config;
5725
- private readonly isLocalMode;
5726
- private _state;
5727
- private stopped;
5728
- private epoch;
5729
- private _sessionId;
5730
- private asr;
5731
- private lam;
5732
- private vad;
5733
- private unifiedWorker;
5734
- private playback;
5735
- private interruption;
5736
- private omoteEvents;
5737
- private mic;
5738
- private static readonly MAX_AUDIO_BUFFER_SAMPLES;
5739
- private audioBuffer;
5740
- private audioBufferSamples;
5741
- private speechStartTime;
5742
- private silenceTimer;
5743
- private isSpeaking;
5744
- private progressiveTimer;
5745
- private progressivePromise;
5746
- private lastProgressiveResult;
5747
- private lastProgressiveSamples;
5748
- private asrErrorCount;
5749
- private progressiveErrorCount;
5750
- private responseAbortController;
5751
- private _unsubChunk;
5752
- private _unsubLevel;
5753
- private _currentFrame;
5754
- /** Current pipeline state */
5755
- get state(): VoicePipelineState;
5756
- /** Latest blendshape frame */
5757
- get currentFrame(): Float32Array | null;
5758
- /** Whether user is currently speaking */
5759
- get isSpeechActive(): boolean;
5760
- /** Session ID (generated on start(), null before) */
5761
- get sessionId(): string | null;
5762
- constructor(config: VoicePipelineConfig);
5763
- loadModels(): Promise<void>;
5764
- /**
5765
- * Load from pre-built backends (dependency injection path).
5766
- * Loads any backends that aren't loaded yet.
5767
- */
5768
- private loadFromBackends;
5769
- /**
5770
- * Load from factories (original path). Loads SenseVoice, LAM, and VAD in parallel.
5771
- */
5772
- private loadFromFactories;
5773
- start(): Promise<void>;
5774
- stop(): void;
5775
- setProfile(profile: ExpressionProfile): void;
5776
- dispose(): Promise<void>;
5777
- private processAudioChunk;
5778
- private getSilenceTimeout;
5779
- private onSilenceDetected;
5780
- private processEndOfSpeech;
5781
- private callResponseHandler;
5782
- /** Cloud mode: delegate to consumer's onResponse handler */
5783
- private handleCloudResponse;
5784
- /** Local mode: synthesize text with TTSBackend, stream to PlaybackPipeline */
5785
- private handleLocalResponse;
5786
- private handleInterruption;
5787
- private startProgressiveTranscription;
5788
- private stopProgressiveTranscription;
5789
- private transcribeWithTimeout;
5790
- private normalizeAudio;
5791
- private setState;
5792
- private emitProgress;
5793
- private clearSilenceTimer;
5794
- }
5795
-
5796
4735
  /**
5797
4736
  * VoiceOrchestrator — Shared voice wiring for OmoteAvatar adapters.
5798
4737
  *
@@ -5810,6 +4749,11 @@ interface VoiceOrchestratorBaseConfig {
5810
4749
  listener?: SpeechListenerConfig;
5811
4750
  interruptionEnabled?: boolean;
5812
4751
  profile?: ExpressionProfile;
4752
+ onStateChange?: (state: ConversationalState) => void;
4753
+ onLoadingProgress?: (progress: LoadingProgress) => void;
4754
+ onError?: (error: Error) => void;
4755
+ onTranscriptEvent?: (result: TranscriptResult) => void;
4756
+ onInterruption?: () => void;
5813
4757
  }
5814
4758
  interface VoiceOrchestratorLocalConfig extends VoiceOrchestratorBaseConfig {
5815
4759
  mode?: 'local';
@@ -5823,12 +4767,23 @@ interface VoiceOrchestratorCloudConfig extends VoiceOrchestratorBaseConfig {
5823
4767
  lam?: {
5824
4768
  modelUrl?: string;
5825
4769
  externalDataUrl?: string | false;
4770
+ unifiedWorker?: UnifiedInferenceWorker;
5826
4771
  };
4772
+ identityIndex?: number;
4773
+ neutralTransitionEnabled?: boolean;
5827
4774
  }
5828
4775
  type VoiceOrchestratorConfig = VoiceOrchestratorLocalConfig | VoiceOrchestratorCloudConfig;
5829
4776
  interface VoiceOrchestratorEvents {
5830
4777
  'state': ConversationalState;
5831
4778
  'transcript': TranscriptResult;
4779
+ 'interruption': void;
4780
+ 'loading:progress': LoadingProgress;
4781
+ 'error': Error;
4782
+ 'audio:level': {
4783
+ rms: number;
4784
+ peak: number;
4785
+ };
4786
+ 'playback:complete': void;
5832
4787
  [key: string]: unknown;
5833
4788
  }
5834
4789
  declare class VoiceOrchestrator extends EventEmitter<VoiceOrchestratorEvents> {
@@ -5837,6 +4792,8 @@ declare class VoiceOrchestrator extends EventEmitter<VoiceOrchestratorEvents> {
5837
4792
  private ttsSpeaker;
5838
4793
  private playbackPipeline;
5839
4794
  private ownedLam;
4795
+ private ownedWorker;
4796
+ private usesSharedWorker;
5840
4797
  private transcriptUnsub;
5841
4798
  private audioChunkUnsub;
5842
4799
  private connectEpoch;
@@ -5860,10 +4817,14 @@ declare class VoiceOrchestrator extends EventEmitter<VoiceOrchestratorEvents> {
5860
4817
  speak(text: string, options?: {
5861
4818
  signal?: AbortSignal;
5862
4819
  voice?: string;
4820
+ speed?: number;
4821
+ language?: string;
5863
4822
  }): Promise<void>;
5864
4823
  streamText(options?: {
5865
4824
  signal?: AbortSignal;
5866
4825
  voice?: string;
4826
+ speed?: number;
4827
+ language?: string;
5867
4828
  }): Promise<{
5868
4829
  push: (token: string) => void;
5869
4830
  end: () => Promise<void>;
@@ -5875,4 +4836,4 @@ declare class VoiceOrchestrator extends EventEmitter<VoiceOrchestratorEvents> {
5875
4836
  private setState;
5876
4837
  }
5877
4838
 
5878
- export { type A2EBackend, A2EInference, type A2EInferenceConfig, type A2EModelInfo, A2EProcessor, type A2EProcessorConfig, type A2EResult, A2EUnifiedAdapter, ALL_AUS, ARKIT_BLENDSHAPES, type AUActivation, AU_TO_ARKIT, type ActiveSpan, type AnimationClip, type AnimationController, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationSource, type AnimationSourceOptions, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, BLENDSHAPE_TO_GROUP, type BackendPreference, type BlendWeight, type BlendshapeGroup, BlendshapeSmoother, type BlendshapeSmootherConfig, type BoneFilterConfig, type CacheConfig, type CacheSpanAttributes, CharacterController, type CharacterControllerConfig, type CharacterProfile, type CharacterUpdateInput, type CharacterUpdateOutput, ConsoleExporter, type ConversationalState, type CreateA2EConfig, type CreateKokoroTTSConfig, type CreateSenseVoiceConfig, type CreateTTSPlayerConfig, DEFAULT_ANIMATION_CONFIG, DEFAULT_BONE_FILTER, DEFAULT_MODEL_URLS, EMOTION_NAMES, EMOTION_TO_AU, EMOTION_VECTOR_SIZE, EXPLICIT_EMOTION_COUNT, type ElevenLabsConfig, ElevenLabsTTSBackend, type EmotionAnimationMap, EmotionController, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, EmotionResolver, type EmotionWeights, EmphasisDetector, type ErrorType, ErrorTypes, EventEmitter, type ExpressionProfile, FaceCompositor, type FaceCompositorConfig, type FaceCompositorInput, type FaceCompositorOutput, type FetchWithCacheOptions, type FrameSource, type FullFaceFrame, HF_CDN_URLS, INFERENCE_LATENCY_BUCKETS, type InferenceFactoryConfig, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, KOKORO_VOICES, type KokoroStreamChunk, type KokoroTTSConfig, KokoroTTSInference, type KokoroTTSModelInfo, type 
KokoroTTSResult, KokoroTTSUnifiedAdapter, KokoroTTSWorker, type KokoroVoiceName, LAM_BLENDSHAPES, type LifeLayerConfig, type LifeLayerInput, type LifeLayerOutput, type LoadingProgress, MIXAMO_PREFIX, MODEL_LOAD_TIME_BUCKETS, type MetricData, MetricNames, MicLipSync, type MicLipSyncConfig, type MicLipSyncEvents, type MicLipSyncFrame, type MicLipSyncState, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, type ModelUrlKey, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, PRESERVE_POSITION_BONES, PlaybackPipeline, type PlaybackPipelineConfig, type PlaybackPipelineEvents, type PlaybackState, type PollyConfig, type PollySynthesizeResult, PollyTTSBackend, ProceduralLifeLayer, type Quat, type QuotaInfo, type ResolvedEmotion, type ResponseHandler, RingBuffer, type RuntimeBackend, type SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SenseVoiceBackend, type SenseVoiceConfig, SenseVoiceInference, type SenseVoiceLanguage, type SenseVoiceModelInfo, type SenseVoiceResult, SenseVoiceUnifiedAdapter, SenseVoiceWorker, type SenseVoiceWorkerConfig, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADUnifiedAdapter, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, SpeechListener, type SpeechListenerConfig, type SpeechListenerEvents, type SpeechListenerState, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, type SynthesizeOptions, type TTSBackend, type TTSChunk, TTSPlayback, type TTSPlaybackConfig, type TTSPlaybackEvents, TTSPlayer, TTSSpeaker, type TTSSpeakerConfig, type TTSStreamOptions, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TrackDescriptor, type TranscriptResult, type Transition, UnifiedInferenceWorker, type VADBackend, type VADModelInfo, type VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, type Vec3, VoiceOrchestrator, 
type VoiceOrchestratorCloudConfig, type VoiceOrchestratorConfig, type VoiceOrchestratorEvents, type VoiceOrchestratorLocalConfig, VoicePipeline, type VoicePipelineCloudConfig, type VoicePipelineConfig, type VoicePipelineEvents, type VoicePipelineLocalConfig, type VoicePipelineState, A2EInference as Wav2Vec2Inference, type WorkerHealthState, analyzeTextEmotion, applyProfile, blendEmotions, calculatePeak, calculateRMS, configureCacheLimit, configureModelUrls, configureOrtCdn, configureTelemetry, createA2E, createEmotionVector, createKokoroTTS, createSenseVoice, createSileroVAD, createTTSPlayer, fetchWithCache, float32ToPcm16, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getModelCache, getOptimalWasmThreads, getOrtCdnBase, getRecommendedBackend, getTelemetry, hasWebGPUApi, int16ToFloat32, isAndroid, isIOS, isIOSSafari, isMobile, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpBlendshapes, lerpEmotion, listVoices as listKokoroVoices, parseEmotionTags, pcm16ToFloat32, preloadModels, resampleLinear, resetModelUrls, resolveBackend, resolveEmotion, shouldEnableWasmProxy, shouldKeepTrack, shouldUseNativeASR, shouldUseServerA2E, stripMixamoPrefix, supportsVADWorker, ttsToPlaybackFormat, validateTTSInput };
4839
+ export { type A2EBackend, type A2EModelInfo, A2EProcessor, type A2EProcessorConfig, type A2EResult, A2EUnifiedAdapter, ALL_AUS, ARKIT_BLENDSHAPES, type AUActivation, AU_TO_ARKIT, type ActiveSpan, type AnimationClip, type AnimationController, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationSource, type AnimationSourceOptions, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, BLENDSHAPE_TO_GROUP, type BackendPreference, type BlendWeight, type BlendshapeGroup, BlendshapeSmoother, type BlendshapeSmootherConfig, type BoneFilterConfig, type CacheConfig, type CacheSpanAttributes, CharacterController, type CharacterControllerConfig, type CharacterProfile, type CharacterUpdateInput, type CharacterUpdateOutput, ConsoleExporter, type ConversationalState, type CreateA2EConfig, type CreateKokoroTTSConfig, type CreateSenseVoiceConfig, type CreateTTSPlayerConfig, DEFAULT_ANIMATION_CONFIG, DEFAULT_BONE_FILTER, DEFAULT_MODEL_URLS, EMOTION_NAMES, EMOTION_TO_AU, EMOTION_VECTOR_SIZE, EXPLICIT_EMOTION_COUNT, type ElevenLabsConfig, ElevenLabsTTSBackend, type EmotionAnimationMap, EmotionController, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, EmotionResolver, type EmotionWeights, EmphasisDetector, EventEmitter, type ExpressionProfile, FaceCompositor, type FaceCompositorConfig, type FaceCompositorInput, type FaceCompositorOutput, type FetchWithCacheOptions, type FrameSource, type FullFaceFrame, HF_CDN_URLS, INFERENCE_LATENCY_BUCKETS, type InferenceFactoryConfig, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, KOKORO_VOICES, type KokoroStreamChunk, type KokoroTTSConfig, type KokoroTTSModelInfo, type KokoroTTSResult, KokoroTTSUnifiedAdapter, type KokoroVoiceName, LAM_BLENDSHAPES, type 
LifeLayerConfig, type LifeLayerInput, type LifeLayerOutput, type LoadingProgress, MIXAMO_PREFIX, MODEL_LOAD_TIME_BUCKETS, type MetricData, MetricNames, MicLipSync, type MicLipSyncConfig, type MicLipSyncEvents, type MicLipSyncFrame, type MicLipSyncState, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, type ModelUrlKey, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, PRESERVE_POSITION_BONES, PlaybackPipeline, type PlaybackPipelineConfig, type PlaybackPipelineEvents, type PlaybackState, ProceduralLifeLayer, type Quat, type QuotaInfo, type ResolvedEmotion, type ResponseHandler, RingBuffer, type RuntimeBackend, type SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SenseVoiceBackend, type SenseVoiceConfig, type SenseVoiceLanguage, type SenseVoiceModelInfo, type SenseVoiceResult, SenseVoiceUnifiedAdapter, type SenseVoiceWorkerConfig, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADUnifiedAdapter, type SpanAttributes, type SpanData, type SpeechErrorCallback, SpeechListener, type SpeechListenerConfig, type SpeechListenerEvents, type SpeechListenerState, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, type SynthesizeOptions, type TTSBackend, type TTSChunk, TTSPlayback, type TTSPlaybackConfig, type TTSPlaybackEvents, TTSPlayer, TTSSpeaker, type TTSSpeakerConfig, type TTSStreamOptions, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TrackDescriptor, type TranscriptResult, type Transition, UnifiedInferenceWorker, type VADBackend, type VADModelInfo, type VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, type Vec3, VoiceOrchestrator, type VoiceOrchestratorCloudConfig, type VoiceOrchestratorConfig, type VoiceOrchestratorEvents, type VoiceOrchestratorLocalConfig, type VoicePipelineState, type WorkerHealthState, analyzeTextEmotion, applyProfile, blendEmotions, calculatePeak, 
calculateRMS, configureCacheLimit, configureModelUrls, configureTelemetry, createA2E, createEmotionVector, createKokoroTTS, createSenseVoice, createSileroVAD, createTTSPlayer, fetchWithCache, float32ToPcm16, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getModelCache, getOptimalWasmThreads, getRecommendedBackend, getTelemetry, hasWebGPUApi, int16ToFloat32, isAndroid, isIOS, isIOSSafari, isMobile, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpBlendshapes, lerpEmotion, listVoices as listKokoroVoices, parseEmotionTags, pcm16ToFloat32, preloadModels, resampleLinear, resetModelUrls, resolveBackend, resolveEmotion, shouldEnableWasmProxy, shouldKeepTrack, shouldUseNativeASR, shouldUseServerA2E, stripMixamoPrefix, ttsToPlaybackFormat, validateTTSInput };