@omote/core 0.9.7 → 0.10.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/{chunk-Y3DTP5P3.mjs → chunk-VSYYT4HO.mjs} +1 -1
- package/dist/{chunk-X5OTUOE6.mjs.map → chunk-VSYYT4HO.mjs.map} +1 -1
- package/dist/index.d.mts +261 -1305
- package/dist/index.d.ts +261 -1305
- package/dist/index.js +6380 -11034
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +6379 -11033
- package/dist/index.mjs.map +1 -1
- package/dist/logging/index.js.map +1 -1
- package/dist/logging/index.mjs +1 -1
- package/package.json +1 -2
- package/dist/Logger-BeUI6jG7.d.mts +0 -145
- package/dist/Logger-BeUI6jG7.d.ts +0 -145
- package/dist/Logger-DSoGAYJu.d.mts +0 -141
- package/dist/Logger-DSoGAYJu.d.ts +0 -141
- package/dist/chunk-3NDJA3I4.mjs +0 -853
- package/dist/chunk-3NDJA3I4.mjs.map +0 -1
- package/dist/chunk-CYBTTLG7.mjs +0 -927
- package/dist/chunk-CYBTTLG7.mjs.map +0 -1
- package/dist/chunk-ESU52TDS.mjs +0 -287
- package/dist/chunk-ESU52TDS.mjs.map +0 -1
- package/dist/chunk-MXKJOF4I.mjs +0 -38
- package/dist/chunk-MXKJOF4I.mjs.map +0 -1
- package/dist/chunk-X5OTUOE6.mjs +0 -927
- package/dist/chunk-XK22BRG4.mjs +0 -38
- package/dist/chunk-XK22BRG4.mjs.map +0 -1
- package/dist/chunk-Y3DTP5P3.mjs.map +0 -1
package/dist/index.d.ts
CHANGED
|
@@ -470,7 +470,7 @@ declare function shouldUseServerA2E(): boolean;
|
|
|
470
470
|
/**
|
|
471
471
|
* Common interface for audio-to-expression (A2E) inference backends
|
|
472
472
|
*
|
|
473
|
-
* Implemented by
|
|
473
|
+
* Implemented by A2EUnifiedAdapter, allowing PlaybackPipeline
|
|
474
474
|
* and A2EProcessor to work with either implementation transparently.
|
|
475
475
|
*
|
|
476
476
|
* @category Inference
|
|
@@ -488,11 +488,11 @@ interface A2EModelInfo {
|
|
|
488
488
|
/**
|
|
489
489
|
* Result from A2E inference
|
|
490
490
|
*
|
|
491
|
-
* All implementations must return blendshapes in
|
|
491
|
+
* All implementations must return blendshapes in ARKIT_BLENDSHAPES order (alphabetical).
|
|
492
492
|
* Models with different native orderings must remap internally before returning.
|
|
493
493
|
*/
|
|
494
494
|
interface A2EResult {
|
|
495
|
-
/** Blendshape weights [frames, 52] in
|
|
495
|
+
/** Blendshape weights [frames, 52] in ARKIT_BLENDSHAPES order - 30fps */
|
|
496
496
|
blendshapes: Float32Array[];
|
|
497
497
|
/** Number of blendshape frames */
|
|
498
498
|
numFrames: number;
|
|
@@ -507,10 +507,8 @@ interface A2EResult {
|
|
|
507
507
|
* pipeline — A2E is the interface abstraction, LAM is the model.
|
|
508
508
|
*
|
|
509
509
|
* Implemented by:
|
|
510
|
-
* - {@link
|
|
511
|
-
* - A2EUnifiedAdapter (shared unified worker)
|
|
510
|
+
* - {@link A2EUnifiedAdapter} (shared unified worker)
|
|
512
511
|
*
|
|
513
|
-
* @see {@link A2EInference} for direct usage
|
|
514
512
|
* @see {@link createA2E} for the recommended factory API
|
|
515
513
|
*/
|
|
516
514
|
interface A2EBackend {
|
|
@@ -531,7 +529,7 @@ interface A2EBackend {
|
|
|
531
529
|
* Run inference on raw audio
|
|
532
530
|
* @param audioSamples - Float32Array of raw audio at 16kHz
|
|
533
531
|
* @param identityIndex - Optional identity index (ignored by CPU model)
|
|
534
|
-
* @returns A2E result with blendshapes in
|
|
532
|
+
* @returns A2E result with blendshapes in ARKIT_BLENDSHAPES order
|
|
535
533
|
*/
|
|
536
534
|
infer(audioSamples: Float32Array, identityIndex?: number): Promise<A2EResult>;
|
|
537
535
|
/**
|
|
@@ -544,7 +542,7 @@ interface A2EBackend {
|
|
|
544
542
|
* ExpressionProfile - Per-character weight scaling for A2E blendshape output
|
|
545
543
|
*
|
|
546
544
|
* Maps blendshape groups (eyes, brows, jaw, mouth, cheeks, nose, tongue)
|
|
547
|
-
* to weight scalers. Used by PlaybackPipeline, MicLipSync, and
|
|
545
|
+
* to weight scalers. Used by PlaybackPipeline, MicLipSync, and VoiceOrchestrator.
|
|
548
546
|
*
|
|
549
547
|
* @category Audio
|
|
550
548
|
*/
|
|
@@ -575,7 +573,7 @@ interface ExpressionProfile {
|
|
|
575
573
|
overrides?: Partial<Record<string, number>>;
|
|
576
574
|
}
|
|
577
575
|
/**
|
|
578
|
-
* Map each
|
|
576
|
+
* Map each ARKIT_BLENDSHAPES entry to its BlendshapeGroup.
|
|
579
577
|
* Built once at module load from prefix matching.
|
|
580
578
|
*/
|
|
581
579
|
declare const BLENDSHAPE_TO_GROUP: Map<string, BlendshapeGroup>;
|
|
@@ -698,6 +696,8 @@ declare class PlaybackPipeline extends EventEmitter<PlaybackPipelineEvents> {
|
|
|
698
696
|
constructor(config: PlaybackPipelineConfig);
|
|
699
697
|
/** Initialize AudioContext (lazy, call after user gesture) */
|
|
700
698
|
initialize(): Promise<void>;
|
|
699
|
+
/** Eagerly create AudioContext. Call from user gesture for iOS. */
|
|
700
|
+
warmup(): Promise<void>;
|
|
701
701
|
/** Update ExpressionProfile at runtime */
|
|
702
702
|
setProfile(profile: ExpressionProfile): void;
|
|
703
703
|
/** Set the emotion label to include in emitted frames */
|
|
@@ -744,7 +744,7 @@ declare class PlaybackPipeline extends EventEmitter<PlaybackPipelineEvents> {
|
|
|
744
744
|
* TTSBackend — Streaming text-to-speech backend interface.
|
|
745
745
|
*
|
|
746
746
|
* Any TTS engine (Kokoro, ElevenLabs, etc.) can implement this contract
|
|
747
|
-
* to integrate with TTSPlayback and
|
|
747
|
+
* to integrate with TTSPlayback and VoiceOrchestrator.
|
|
748
748
|
*
|
|
749
749
|
* @category Inference
|
|
750
750
|
*/
|
|
@@ -788,6 +788,10 @@ interface TTSStreamOptions {
|
|
|
788
788
|
voice?: string;
|
|
789
789
|
/** Speed multiplier override per-call */
|
|
790
790
|
speed?: number;
|
|
791
|
+
/** Language override per-call (e.g. 'en-us', 'ja'). Default: derived from voice name. */
|
|
792
|
+
language?: string;
|
|
793
|
+
/** When true, emit the entire text as a single chunk (no sentence splitting). */
|
|
794
|
+
singleShot?: boolean;
|
|
791
795
|
}
|
|
792
796
|
/**
|
|
793
797
|
* A single chunk of TTS audio output
|
|
@@ -863,7 +867,11 @@ declare class TTSPlayback extends EventEmitter<TTSPlaybackEvents> {
|
|
|
863
867
|
speak(text: string, options?: {
|
|
864
868
|
signal?: AbortSignal;
|
|
865
869
|
voice?: string;
|
|
870
|
+
speed?: number;
|
|
871
|
+
language?: string;
|
|
866
872
|
}): Promise<void>;
|
|
873
|
+
/** Eagerly create AudioContext. Call from user gesture for iOS. */
|
|
874
|
+
warmup(): Promise<void>;
|
|
867
875
|
/** Dispose of all resources. */
|
|
868
876
|
dispose(): Promise<void>;
|
|
869
877
|
private speakWithPrefetch;
|
|
@@ -900,34 +908,9 @@ declare class TTSPlayback extends EventEmitter<TTSPlaybackEvents> {
|
|
|
900
908
|
declare function isWebGPUAvailable(): Promise<boolean>;
|
|
901
909
|
|
|
902
910
|
/**
|
|
903
|
-
* SenseVoice
|
|
904
|
-
*
|
|
905
|
-
* Non-autoregressive CTC-based ASR that is 5x faster than Whisper-Small.
|
|
906
|
-
* Runs entirely in browser via WebGPU or WASM. No transformers.js dependency.
|
|
907
|
-
*
|
|
908
|
-
* Uses the sherpa-onnx SenseVoice export (model.int8.onnx, 239MB int8 quantized).
|
|
909
|
-
* Also provides emotion detection, language identification, and audio event detection
|
|
910
|
-
* from the same forward pass.
|
|
911
|
+
* SenseVoice type definitions
|
|
911
912
|
*
|
|
912
913
|
* @category Inference
|
|
913
|
-
*
|
|
914
|
-
* @example Basic usage
|
|
915
|
-
* ```typescript
|
|
916
|
-
* import { SenseVoiceInference } from '@omote/core';
|
|
917
|
-
*
|
|
918
|
-
* const asr = new SenseVoiceInference({
|
|
919
|
-
* modelUrl: '/models/sensevoice/model.int8.onnx',
|
|
920
|
-
* tokensUrl: '/models/sensevoice/tokens.txt',
|
|
921
|
-
* });
|
|
922
|
-
* await asr.load();
|
|
923
|
-
*
|
|
924
|
-
* const { text, emotion, language } = await asr.transcribe(audioSamples);
|
|
925
|
-
* console.log(text); // "Hello world"
|
|
926
|
-
* console.log(emotion); // "NEUTRAL"
|
|
927
|
-
* console.log(language); // "en"
|
|
928
|
-
* ```
|
|
929
|
-
*
|
|
930
|
-
* @module inference/SenseVoiceInference
|
|
931
914
|
*/
|
|
932
915
|
|
|
933
916
|
type SenseVoiceLanguage = 'auto' | 'zh' | 'en' | 'ja' | 'ko' | 'yue';
|
|
@@ -964,76 +947,49 @@ interface SenseVoiceModelInfo {
|
|
|
964
947
|
outputNames: string[];
|
|
965
948
|
vocabSize: number;
|
|
966
949
|
}
|
|
967
|
-
|
|
968
|
-
|
|
969
|
-
|
|
970
|
-
|
|
971
|
-
|
|
972
|
-
|
|
973
|
-
|
|
974
|
-
|
|
975
|
-
|
|
976
|
-
|
|
977
|
-
|
|
978
|
-
|
|
979
|
-
|
|
980
|
-
|
|
981
|
-
|
|
982
|
-
|
|
983
|
-
|
|
984
|
-
|
|
985
|
-
|
|
950
|
+
/**
|
|
951
|
+
* Configuration for SenseVoice Worker (used by SenseVoiceUnifiedAdapter)
|
|
952
|
+
*/
|
|
953
|
+
interface SenseVoiceWorkerConfig {
|
|
954
|
+
/** Path or URL to model.int8.onnx (239MB) */
|
|
955
|
+
modelUrl: string;
|
|
956
|
+
/** Path or URL to tokens.txt vocabulary file (default: sibling of modelUrl) */
|
|
957
|
+
tokensUrl?: string;
|
|
958
|
+
/** Language hint (default: 'auto' for auto-detection) */
|
|
959
|
+
language?: SenseVoiceLanguage;
|
|
960
|
+
/** Text normalization: 'with_itn' applies inverse text normalization (default: 'with_itn') */
|
|
961
|
+
textNorm?: 'with_itn' | 'without_itn';
|
|
962
|
+
}
|
|
963
|
+
/**
|
|
964
|
+
* Common interface for SenseVoice implementations
|
|
965
|
+
*/
|
|
966
|
+
interface SenseVoiceBackend {
|
|
967
|
+
/** Whether the model is loaded and ready for inference */
|
|
968
|
+
readonly isLoaded: boolean;
|
|
969
|
+
/** Current backend type ('wasm' for worker, full backend for main thread, null if not loaded) */
|
|
970
|
+
readonly backend: 'wasm' | 'webgpu' | null;
|
|
971
|
+
/**
|
|
972
|
+
* Load the ONNX model
|
|
973
|
+
* @param onProgress - Optional progress callback (fires once at 100% for worker)
|
|
974
|
+
* @returns Model loading information
|
|
975
|
+
*/
|
|
986
976
|
load(onProgress?: (loaded: number, total: number) => void): Promise<SenseVoiceModelInfo>;
|
|
987
977
|
/**
|
|
988
978
|
* Transcribe audio samples to text
|
|
989
|
-
*
|
|
990
|
-
* @
|
|
991
|
-
* @returns Transcription result with text, emotion, language, and event
|
|
979
|
+
* @param audioSamples - Float32Array of audio samples at 16kHz
|
|
980
|
+
* @returns Transcription result
|
|
992
981
|
*/
|
|
993
982
|
transcribe(audioSamples: Float32Array): Promise<SenseVoiceResult>;
|
|
994
|
-
|
|
983
|
+
/**
|
|
984
|
+
* Dispose of the model and free resources
|
|
985
|
+
*/
|
|
995
986
|
dispose(): Promise<void>;
|
|
996
987
|
}
|
|
997
988
|
|
|
998
989
|
/**
|
|
999
|
-
* Silero VAD
|
|
1000
|
-
*
|
|
1001
|
-
* Neural network-based VAD running in browser via ONNX Runtime Web.
|
|
1002
|
-
* Much more accurate than RMS-based energy detection.
|
|
1003
|
-
*
|
|
1004
|
-
* Uses lazy loading to conditionally load WebGPU or WASM-only bundle:
|
|
1005
|
-
* - iOS: Loads WASM-only bundle (WebGPU crashes due to Safari bugs)
|
|
1006
|
-
* - Android/Desktop: Loads WebGPU bundle (with WASM fallback)
|
|
990
|
+
* Silero VAD type definitions
|
|
1007
991
|
*
|
|
1008
992
|
* @category Inference
|
|
1009
|
-
*
|
|
1010
|
-
* @example Basic usage
|
|
1011
|
-
* ```typescript
|
|
1012
|
-
* import { SileroVADInference } from '@omote/core';
|
|
1013
|
-
*
|
|
1014
|
-
* const vad = new SileroVADInference({
|
|
1015
|
-
* modelUrl: '/models/silero-vad.onnx'
|
|
1016
|
-
* });
|
|
1017
|
-
* await vad.load();
|
|
1018
|
-
*
|
|
1019
|
-
* // Process 32ms chunks (512 samples at 16kHz)
|
|
1020
|
-
* const probability = await vad.process(audioChunk);
|
|
1021
|
-
* if (probability > 0.5) {
|
|
1022
|
-
* console.log('Speech detected!');
|
|
1023
|
-
* }
|
|
1024
|
-
* ```
|
|
1025
|
-
*
|
|
1026
|
-
* @example Streaming with state management
|
|
1027
|
-
* ```typescript
|
|
1028
|
-
* // State is automatically maintained between process() calls
|
|
1029
|
-
* // Call reset() when starting a new audio stream
|
|
1030
|
-
* vad.reset();
|
|
1031
|
-
*
|
|
1032
|
-
* for (const chunk of audioChunks) {
|
|
1033
|
-
* const prob = await vad.process(chunk);
|
|
1034
|
-
* // prob is speech probability [0, 1]
|
|
1035
|
-
* }
|
|
1036
|
-
* ```
|
|
1037
993
|
*/
|
|
1038
994
|
|
|
1039
995
|
type VADBackend = BackendPreference;
|
|
@@ -1103,117 +1059,6 @@ interface SpeechSegment {
|
|
|
1103
1059
|
/** Average probability during segment */
|
|
1104
1060
|
avgProbability: number;
|
|
1105
1061
|
}
|
|
1106
|
-
/**
|
|
1107
|
-
* Silero VAD - Neural network voice activity detection
|
|
1108
|
-
*
|
|
1109
|
-
* Based on snakers4/silero-vad ONNX model.
|
|
1110
|
-
* Processes 32ms chunks (512 samples at 16kHz) with LSTM state.
|
|
1111
|
-
*
|
|
1112
|
-
* @see https://github.com/snakers4/silero-vad
|
|
1113
|
-
*/
|
|
1114
|
-
declare class SileroVADInference {
|
|
1115
|
-
private session;
|
|
1116
|
-
private ort;
|
|
1117
|
-
private config;
|
|
1118
|
-
private _backend;
|
|
1119
|
-
private isLoading;
|
|
1120
|
-
private state;
|
|
1121
|
-
private context;
|
|
1122
|
-
private readonly chunkSize;
|
|
1123
|
-
private readonly contextSize;
|
|
1124
|
-
private inferenceQueue;
|
|
1125
|
-
private preSpeechBuffer;
|
|
1126
|
-
private wasSpeaking;
|
|
1127
|
-
private srTensor;
|
|
1128
|
-
constructor(config: SileroVADConfig);
|
|
1129
|
-
get backend(): RuntimeBackend | null;
|
|
1130
|
-
get isLoaded(): boolean;
|
|
1131
|
-
get sampleRate(): number;
|
|
1132
|
-
get threshold(): number;
|
|
1133
|
-
/**
|
|
1134
|
-
* Get required chunk size in samples
|
|
1135
|
-
*/
|
|
1136
|
-
getChunkSize(): number;
|
|
1137
|
-
/**
|
|
1138
|
-
* Get chunk duration in milliseconds
|
|
1139
|
-
*/
|
|
1140
|
-
getChunkDurationMs(): number;
|
|
1141
|
-
/**
|
|
1142
|
-
* Check if WebGPU is available and working
|
|
1143
|
-
* (iOS returns false even if navigator.gpu exists due to ONNX Runtime bugs)
|
|
1144
|
-
*/
|
|
1145
|
-
static isWebGPUAvailable: typeof isWebGPUAvailable;
|
|
1146
|
-
/**
|
|
1147
|
-
* Load the ONNX model
|
|
1148
|
-
*/
|
|
1149
|
-
load(): Promise<VADModelInfo>;
|
|
1150
|
-
/**
|
|
1151
|
-
* Reset state for new audio stream
|
|
1152
|
-
*/
|
|
1153
|
-
reset(): void;
|
|
1154
|
-
/**
|
|
1155
|
-
* Process a single audio chunk
|
|
1156
|
-
*
|
|
1157
|
-
* @param audioChunk - Float32Array of exactly chunkSize samples (512 for 16kHz, 256 for 8kHz)
|
|
1158
|
-
* @returns VAD result with speech probability
|
|
1159
|
-
*/
|
|
1160
|
-
process(audioChunk: Float32Array): Promise<VADResult>;
|
|
1161
|
-
/**
|
|
1162
|
-
* Process audio and detect speech segments
|
|
1163
|
-
*
|
|
1164
|
-
* @param audio - Complete audio buffer
|
|
1165
|
-
* @param options - Detection options
|
|
1166
|
-
* @returns Array of speech segments
|
|
1167
|
-
*/
|
|
1168
|
-
detectSpeech(audio: Float32Array, options?: {
|
|
1169
|
-
/** Minimum speech duration in ms (default: 250) */
|
|
1170
|
-
minSpeechDurationMs?: number;
|
|
1171
|
-
/** Minimum silence duration to end segment in ms (default: 300) */
|
|
1172
|
-
minSilenceDurationMs?: number;
|
|
1173
|
-
/** Padding to add before/after speech in ms (default: 30) */
|
|
1174
|
-
speechPadMs?: number;
|
|
1175
|
-
}): Promise<SpeechSegment[]>;
|
|
1176
|
-
/**
|
|
1177
|
-
* Queue inference to serialize ONNX session calls
|
|
1178
|
-
*/
|
|
1179
|
-
private queueInference;
|
|
1180
|
-
/**
|
|
1181
|
-
* Dispose of the model and free resources
|
|
1182
|
-
*/
|
|
1183
|
-
dispose(): Promise<void>;
|
|
1184
|
-
}
|
|
1185
|
-
|
|
1186
|
-
/**
|
|
1187
|
-
* Silero VAD Web Worker implementation
|
|
1188
|
-
*
|
|
1189
|
-
* Runs Silero VAD inference in a dedicated Web Worker to prevent main thread blocking.
|
|
1190
|
-
* Uses inline worker script (Blob URL pattern) to avoid separate file deployment.
|
|
1191
|
-
*
|
|
1192
|
-
* Key design decisions:
|
|
1193
|
-
* - WASM backend only (WebGPU doesn't work in Workers)
|
|
1194
|
-
* - LSTM state serialized as Float32Array (Tensors can't cross worker boundary)
|
|
1195
|
-
* - Audio copied (not transferred) to retain main thread access for pre-speech buffer
|
|
1196
|
-
* - ONNX Runtime loaded from CDN in worker (no bundler complications)
|
|
1197
|
-
*
|
|
1198
|
-
* @category Inference
|
|
1199
|
-
*
|
|
1200
|
-
* @example Basic usage
|
|
1201
|
-
* ```typescript
|
|
1202
|
-
* import { SileroVADWorker } from '@omote/core';
|
|
1203
|
-
*
|
|
1204
|
-
* const vad = new SileroVADWorker({
|
|
1205
|
-
* modelUrl: '/models/silero-vad.onnx'
|
|
1206
|
-
* });
|
|
1207
|
-
* await vad.load();
|
|
1208
|
-
*
|
|
1209
|
-
* // Process 32ms chunks (512 samples at 16kHz)
|
|
1210
|
-
* const result = await vad.process(audioChunk);
|
|
1211
|
-
* if (result.isSpeech) {
|
|
1212
|
-
* console.log('Speech detected!', result.probability);
|
|
1213
|
-
* }
|
|
1214
|
-
* ```
|
|
1215
|
-
*/
|
|
1216
|
-
|
|
1217
1062
|
/**
|
|
1218
1063
|
* Configuration for Silero VAD Worker
|
|
1219
1064
|
*/
|
|
@@ -1226,13 +1071,6 @@ interface VADWorkerConfig {
|
|
|
1226
1071
|
threshold?: number;
|
|
1227
1072
|
/**
|
|
1228
1073
|
* Number of audio chunks to keep in pre-speech buffer.
|
|
1229
|
-
* When VAD triggers, these chunks are prepended to the speech buffer
|
|
1230
|
-
* to capture the beginning of speech that occurred before detection.
|
|
1231
|
-
*
|
|
1232
|
-
* At 512 samples/chunk and 16kHz:
|
|
1233
|
-
* - 10 chunks = 320ms of pre-speech audio
|
|
1234
|
-
* - 15 chunks = 480ms of pre-speech audio
|
|
1235
|
-
*
|
|
1236
1074
|
* Default: 10 chunks (320ms)
|
|
1237
1075
|
*/
|
|
1238
1076
|
preSpeechBufferChunks?: number;
|
|
@@ -1248,85 +1086,45 @@ interface VADWorkerModelInfo {
|
|
|
1248
1086
|
sampleRate: number;
|
|
1249
1087
|
chunkSize: number;
|
|
1250
1088
|
}
|
|
1251
|
-
|
|
1252
1089
|
/**
|
|
1253
|
-
*
|
|
1254
|
-
*
|
|
1255
|
-
* Runs Silero VAD inference off the main thread to prevent UI blocking.
|
|
1256
|
-
* Feature parity with SileroVADInference but runs in dedicated worker.
|
|
1257
|
-
*
|
|
1258
|
-
* @see SileroVADInference for main-thread version
|
|
1090
|
+
* Common interface for Silero VAD implementations
|
|
1259
1091
|
*/
|
|
1260
|
-
|
|
1261
|
-
|
|
1262
|
-
|
|
1263
|
-
|
|
1264
|
-
|
|
1265
|
-
|
|
1266
|
-
|
|
1267
|
-
|
|
1268
|
-
|
|
1269
|
-
private readonly contextSize;
|
|
1270
|
-
private inferenceQueue;
|
|
1271
|
-
private preSpeechBuffer;
|
|
1272
|
-
private wasSpeaking;
|
|
1273
|
-
private pendingResolvers;
|
|
1274
|
-
private messageId;
|
|
1275
|
-
constructor(config: VADWorkerConfig);
|
|
1276
|
-
get isLoaded(): boolean;
|
|
1277
|
-
/**
|
|
1278
|
-
* Backend type (always 'wasm' for Worker, WebGPU not supported in Workers)
|
|
1279
|
-
*/
|
|
1280
|
-
get backend(): 'wasm' | null;
|
|
1281
|
-
get sampleRate(): number;
|
|
1282
|
-
get threshold(): number;
|
|
1283
|
-
/**
|
|
1284
|
-
* Get required chunk size in samples
|
|
1285
|
-
*/
|
|
1286
|
-
getChunkSize(): number;
|
|
1287
|
-
/**
|
|
1288
|
-
* Get chunk duration in milliseconds
|
|
1289
|
-
*/
|
|
1290
|
-
getChunkDurationMs(): number;
|
|
1291
|
-
/**
|
|
1292
|
-
* Create the worker from inline script
|
|
1293
|
-
*/
|
|
1294
|
-
private createWorker;
|
|
1295
|
-
/**
|
|
1296
|
-
* Handle messages from worker
|
|
1297
|
-
*/
|
|
1298
|
-
private handleWorkerMessage;
|
|
1299
|
-
/**
|
|
1300
|
-
* Send message to worker and wait for response
|
|
1301
|
-
*/
|
|
1302
|
-
private sendMessage;
|
|
1303
|
-
/**
|
|
1304
|
-
* Load the ONNX model in the worker
|
|
1305
|
-
*/
|
|
1306
|
-
load(): Promise<VADWorkerModelInfo>;
|
|
1092
|
+
interface SileroVADBackend {
|
|
1093
|
+
/** Current backend type (webgpu, wasm, or null if not loaded) */
|
|
1094
|
+
readonly backend: RuntimeBackend | null;
|
|
1095
|
+
/** Whether the model is loaded and ready for inference */
|
|
1096
|
+
readonly isLoaded: boolean;
|
|
1097
|
+
/** Audio sample rate (8000 or 16000 Hz) */
|
|
1098
|
+
readonly sampleRate: number;
|
|
1099
|
+
/** Speech detection threshold (0-1) */
|
|
1100
|
+
readonly threshold: number;
|
|
1307
1101
|
/**
|
|
1308
|
-
*
|
|
1102
|
+
* Load the ONNX model
|
|
1103
|
+
* @returns Model loading information
|
|
1309
1104
|
*/
|
|
1310
|
-
|
|
1105
|
+
load(): Promise<VADModelInfo | VADWorkerModelInfo>;
|
|
1311
1106
|
/**
|
|
1312
1107
|
* Process a single audio chunk
|
|
1313
|
-
*
|
|
1314
|
-
* @param audioChunk - Float32Array of exactly chunkSize samples (512 for 16kHz, 256 for 8kHz)
|
|
1108
|
+
* @param audioChunk - Float32Array of exactly chunkSize samples
|
|
1315
1109
|
* @returns VAD result with speech probability
|
|
1316
1110
|
*/
|
|
1317
1111
|
process(audioChunk: Float32Array): Promise<VADResult>;
|
|
1318
1112
|
/**
|
|
1319
|
-
*
|
|
1113
|
+
* Reset state for new audio stream
|
|
1320
1114
|
*/
|
|
1321
|
-
|
|
1115
|
+
reset(): void | Promise<void>;
|
|
1322
1116
|
/**
|
|
1323
|
-
* Dispose of the
|
|
1117
|
+
* Dispose of the model and free resources
|
|
1324
1118
|
*/
|
|
1325
1119
|
dispose(): Promise<void>;
|
|
1326
1120
|
/**
|
|
1327
|
-
*
|
|
1121
|
+
* Get required chunk size in samples
|
|
1328
1122
|
*/
|
|
1329
|
-
|
|
1123
|
+
getChunkSize(): number;
|
|
1124
|
+
/**
|
|
1125
|
+
* Get chunk duration in milliseconds
|
|
1126
|
+
*/
|
|
1127
|
+
getChunkDurationMs(): number;
|
|
1330
1128
|
}
|
|
1331
1129
|
|
|
1332
1130
|
/**
|
|
@@ -1454,43 +1252,33 @@ declare class UnifiedInferenceWorker {
|
|
|
1454
1252
|
|
|
1455
1253
|
/** Base config shared across all inference factory functions */
|
|
1456
1254
|
interface InferenceFactoryConfig {
|
|
1457
|
-
/**
|
|
1458
|
-
* Worker mode:
|
|
1459
|
-
* - 'auto' (default): Use Worker if supported, else main thread
|
|
1460
|
-
* - true: Force Worker (throws if unsupported)
|
|
1461
|
-
* - false: Force main thread
|
|
1462
|
-
*/
|
|
1463
|
-
useWorker?: boolean | 'auto';
|
|
1464
1255
|
/**
|
|
1465
1256
|
* Unified inference worker instance.
|
|
1466
|
-
*
|
|
1257
|
+
* Routes inference through the shared worker,
|
|
1467
1258
|
* keeping all inference off the main thread.
|
|
1468
|
-
* Takes precedence over useWorker setting.
|
|
1469
1259
|
*/
|
|
1470
1260
|
unifiedWorker?: UnifiedInferenceWorker;
|
|
1471
1261
|
}
|
|
1472
1262
|
|
|
1473
1263
|
/**
|
|
1474
|
-
* Factory function for A2E inference
|
|
1264
|
+
* Factory function for A2E inference via UnifiedInferenceWorker
|
|
1475
1265
|
*
|
|
1476
1266
|
* Creates an A2EBackend instance with zero-config defaults (HuggingFace CDN).
|
|
1477
|
-
*
|
|
1267
|
+
* Routes inference through the shared unified worker.
|
|
1478
1268
|
*
|
|
1479
1269
|
* @category Inference
|
|
1480
1270
|
*
|
|
1481
|
-
* @example
|
|
1271
|
+
* @example
|
|
1482
1272
|
* ```typescript
|
|
1483
|
-
* import { createA2E } from '@omote/core';
|
|
1273
|
+
* import { createA2E, UnifiedInferenceWorker } from '@omote/core';
|
|
1274
|
+
*
|
|
1275
|
+
* const worker = new UnifiedInferenceWorker();
|
|
1276
|
+
* await worker.init();
|
|
1484
1277
|
*
|
|
1485
|
-
* const a2e = createA2E(
|
|
1278
|
+
* const a2e = createA2E({ unifiedWorker: worker });
|
|
1486
1279
|
* await a2e.load();
|
|
1487
1280
|
* const { blendshapes } = await a2e.infer(audioSamples);
|
|
1488
1281
|
* ```
|
|
1489
|
-
*
|
|
1490
|
-
* @example Custom model URL
|
|
1491
|
-
* ```typescript
|
|
1492
|
-
* const a2e = createA2E({ modelUrl: '/models/lam.onnx' });
|
|
1493
|
-
* ```
|
|
1494
1282
|
*/
|
|
1495
1283
|
|
|
1496
1284
|
/**
|
|
@@ -1506,13 +1294,13 @@ interface CreateA2EConfig extends InferenceFactoryConfig {
|
|
|
1506
1294
|
* Set to `false` to skip external data loading (single-file models only).
|
|
1507
1295
|
*/
|
|
1508
1296
|
externalDataUrl?: string | false;
|
|
1509
|
-
/** Backend preference (default: 'auto') */
|
|
1510
|
-
backend?: BackendPreference;
|
|
1511
1297
|
/** Number of identity classes (default: 12) */
|
|
1512
1298
|
numIdentityClasses?: number;
|
|
1513
1299
|
}
|
|
1514
1300
|
/**
|
|
1515
|
-
* Create an A2E instance
|
|
1301
|
+
* Create an A2E instance via the unified worker.
|
|
1302
|
+
*
|
|
1303
|
+
* If no `unifiedWorker` is provided, a dedicated worker is created on load().
|
|
1516
1304
|
*
|
|
1517
1305
|
* @param config - Factory configuration
|
|
1518
1306
|
* @returns An A2EBackend instance
|
|
@@ -1528,7 +1316,7 @@ declare function createA2E(config?: CreateA2EConfig): A2EBackend;
|
|
|
1528
1316
|
/**
|
|
1529
1317
|
* Generic frame source -- any object that emits 'frame' events with blendshapes.
|
|
1530
1318
|
*
|
|
1531
|
-
* Implemented by PlaybackPipeline, MicLipSync,
|
|
1319
|
+
* Implemented by PlaybackPipeline, MicLipSync, and any custom source.
|
|
1532
1320
|
* Used by OmoteAvatar (all renderer adapters) to receive animation frames.
|
|
1533
1321
|
*/
|
|
1534
1322
|
interface FrameSource {
|
|
@@ -1557,7 +1345,7 @@ interface TranscriptResult {
|
|
|
1557
1345
|
inferenceTimeMs?: number;
|
|
1558
1346
|
}
|
|
1559
1347
|
/**
|
|
1560
|
-
* Consumer's response handler.
|
|
1348
|
+
* Consumer's response handler. VoiceOrchestrator calls this with transcribed text.
|
|
1561
1349
|
* Consumer must stream audio back for playback + lip sync.
|
|
1562
1350
|
*/
|
|
1563
1351
|
interface ResponseHandler {
|
|
@@ -1588,6 +1376,8 @@ interface ResponseHandler {
|
|
|
1588
1376
|
*/
|
|
1589
1377
|
|
|
1590
1378
|
interface TTSSpeakerConfig {
|
|
1379
|
+
/** Skip LAM download — audio playback only, no lip sync. Default: false. */
|
|
1380
|
+
audioOnly?: boolean;
|
|
1591
1381
|
/** Per-character expression weight scaling */
|
|
1592
1382
|
profile?: ExpressionProfile;
|
|
1593
1383
|
/** Identity/style index for A2E model (default: 0) */
|
|
@@ -1600,8 +1390,8 @@ interface TTSSpeakerConfig {
|
|
|
1600
1390
|
neutralTransitionMs?: number;
|
|
1601
1391
|
/** Pre-built A2E backend (skip internal createA2E). */
|
|
1602
1392
|
lam?: A2EBackend;
|
|
1603
|
-
/** LAM model config (only when lam not provided) */
|
|
1604
|
-
models?: CreateA2EConfig
|
|
1393
|
+
/** LAM model config (only when lam not provided). unifiedWorker is supplied by TTSSpeaker. */
|
|
1394
|
+
models?: Omit<CreateA2EConfig, 'unifiedWorker'>;
|
|
1605
1395
|
/** Shared unified worker (recommended for iOS) */
|
|
1606
1396
|
unifiedWorker?: UnifiedInferenceWorker;
|
|
1607
1397
|
}
|
|
@@ -1610,6 +1400,7 @@ declare class TTSSpeaker {
|
|
|
1610
1400
|
private tts;
|
|
1611
1401
|
private ownedLam;
|
|
1612
1402
|
private ownedWorker;
|
|
1403
|
+
private usesSharedWorker;
|
|
1613
1404
|
private currentAbort;
|
|
1614
1405
|
private _isSpeaking;
|
|
1615
1406
|
private _audioOnly;
|
|
@@ -1623,11 +1414,8 @@ declare class TTSSpeaker {
|
|
|
1623
1414
|
/**
|
|
1624
1415
|
* Connect a TTS backend.
|
|
1625
1416
|
*
|
|
1626
|
-
*
|
|
1627
|
-
*
|
|
1628
|
-
*
|
|
1629
|
-
* When config is omitted or has none of those, audio-only mode is used:
|
|
1630
|
-
* TTS → AudioScheduler (speakers only, no blendshapes, no LAM download).
|
|
1417
|
+
* By default, the full lip sync pipeline is created (auto-downloads LAM).
|
|
1418
|
+
* Pass `audioOnly: true` for audio-only mode (no blendshapes, no LAM download).
|
|
1631
1419
|
*
|
|
1632
1420
|
* @param tts - TTS backend to use for speech synthesis
|
|
1633
1421
|
* @param config - Optional configuration for A2E, expression profile, etc.
|
|
@@ -1643,6 +1431,8 @@ declare class TTSSpeaker {
|
|
|
1643
1431
|
speak(text: string, options?: {
|
|
1644
1432
|
signal?: AbortSignal;
|
|
1645
1433
|
voice?: string;
|
|
1434
|
+
speed?: number;
|
|
1435
|
+
language?: string;
|
|
1646
1436
|
}): Promise<void>;
|
|
1647
1437
|
/** Audio-only speak: TTS → resample → AudioScheduler (no blendshapes). */
|
|
1648
1438
|
private speakAudioOnly;
|
|
@@ -1662,13 +1452,20 @@ declare class TTSSpeaker {
|
|
|
1662
1452
|
streamText(options: {
|
|
1663
1453
|
signal?: AbortSignal;
|
|
1664
1454
|
voice?: string;
|
|
1455
|
+
speed?: number;
|
|
1456
|
+
language?: string;
|
|
1665
1457
|
}): Promise<{
|
|
1666
1458
|
push: (token: string) => void;
|
|
1667
1459
|
end: () => Promise<void>;
|
|
1668
1460
|
}>;
|
|
1669
1461
|
/** streamText in audio-only mode: TTS → AudioScheduler (no blendshapes). */
|
|
1670
1462
|
private streamTextAudioOnly;
|
|
1671
|
-
/**
|
|
1463
|
+
/**
|
|
1464
|
+
* Warm up AudioContext for iOS/Safari autoplay policy.
|
|
1465
|
+
* Call from a user gesture handler (click/tap) before speak().
|
|
1466
|
+
*/
|
|
1467
|
+
warmup(): Promise<void>;
|
|
1468
|
+
/** Abort current speak if any. Triggers neutral transition on PlaybackPipeline. */
|
|
1672
1469
|
stop(): void;
|
|
1673
1470
|
/** Clean teardown of all owned resources. */
|
|
1674
1471
|
dispose(): Promise<void>;
|
|
@@ -1704,11 +1501,13 @@ interface CreateTTSPlayerConfig {
|
|
|
1704
1501
|
modelUrl?: string;
|
|
1705
1502
|
/** Voice data base URL override */
|
|
1706
1503
|
voiceBaseUrl?: string;
|
|
1504
|
+
/** Shared unified worker (created automatically if not provided) */
|
|
1505
|
+
unifiedWorker?: UnifiedInferenceWorker;
|
|
1707
1506
|
}
|
|
1708
1507
|
/**
|
|
1709
1508
|
* Zero-config TTS player. Speak text through speakers without an avatar.
|
|
1710
1509
|
*
|
|
1711
|
-
* Uses Kokoro TTS (82M q8, ~92MB) with automatic worker
|
|
1510
|
+
* Uses Kokoro TTS (82M q8, ~92MB) with automatic worker creation.
|
|
1712
1511
|
* No LAM model is downloaded — audio plays directly through AudioScheduler.
|
|
1713
1512
|
*/
|
|
1714
1513
|
declare function createTTSPlayer(config?: CreateTTSPlayerConfig): TTSPlayer;
|
|
@@ -1717,254 +1516,27 @@ declare function createTTSPlayer(config?: CreateTTSPlayerConfig): TTSPlayer;
|
|
|
1717
1516
|
*/
|
|
1718
1517
|
declare class TTSPlayer extends TTSSpeaker {
|
|
1719
1518
|
private backend;
|
|
1720
|
-
|
|
1519
|
+
private ttsWorker;
|
|
1520
|
+
private ttsPlayerUsesSharedWorker;
|
|
1521
|
+
private ttsConfig;
|
|
1522
|
+
constructor(config?: CreateTTSPlayerConfig);
|
|
1721
1523
|
/** Load TTS model and connect in audio-only mode. */
|
|
1722
1524
|
load(): Promise<void>;
|
|
1723
1525
|
/** Whether the TTS model is loaded and ready. */
|
|
1724
1526
|
get isLoaded(): boolean;
|
|
1527
|
+
dispose(): Promise<void>;
|
|
1725
1528
|
}
|
|
1726
1529
|
|
|
1727
1530
|
/**
|
|
1728
|
-
*
|
|
1729
|
-
*
|
|
1730
|
-
* Provides a unified API that automatically selects the optimal implementation:
|
|
1731
|
-
* - Worker supported: Uses SenseVoiceWorker (off-main-thread inference)
|
|
1732
|
-
* - Worker unsupported: Uses SenseVoiceInference (main thread)
|
|
1733
|
-
*
|
|
1734
|
-
* @category Inference
|
|
1735
|
-
*
|
|
1736
|
-
* @example Auto-detect (recommended)
|
|
1737
|
-
* ```typescript
|
|
1738
|
-
* import { createSenseVoice } from '@omote/core';
|
|
1531
|
+
* SpeechListener — Standalone listening primitive.
|
|
1739
1532
|
*
|
|
1740
|
-
*
|
|
1741
|
-
*
|
|
1742
|
-
* });
|
|
1743
|
-
* await asr.load();
|
|
1744
|
-
* const { text, emotion } = await asr.transcribe(audioSamples);
|
|
1745
|
-
* ```
|
|
1533
|
+
* Composes: MicrophoneCapture → SileroVAD → SenseVoice ASR → transcript events.
|
|
1534
|
+
* Used independently or alongside TTSSpeaker and VoiceOrchestrator.
|
|
1746
1535
|
*
|
|
1747
|
-
*
|
|
1748
|
-
*
|
|
1749
|
-
* const asr = createSenseVoice({
|
|
1750
|
-
* modelUrl: '/models/sensevoice/model.int8.onnx',
|
|
1751
|
-
* useWorker: true,
|
|
1752
|
-
* });
|
|
1753
|
-
* ```
|
|
1536
|
+
* Does NOT handle TTS, LAM, or response routing — those belong to TTSSpeaker
|
|
1537
|
+
* and VoiceOrchestrator respectively.
|
|
1754
1538
|
*
|
|
1755
|
-
* @
|
|
1756
|
-
* ```typescript
|
|
1757
|
-
* const asr = createSenseVoice({
|
|
1758
|
-
* modelUrl: '/models/sensevoice/model.int8.onnx',
|
|
1759
|
-
* useWorker: false,
|
|
1760
|
-
* });
|
|
1761
|
-
* ```
|
|
1762
|
-
*/
|
|
1763
|
-
|
|
1764
|
-
/**
|
|
1765
|
-
* Common interface for both SenseVoiceInference and SenseVoiceWorker
|
|
1766
|
-
*/
|
|
1767
|
-
interface SenseVoiceBackend {
|
|
1768
|
-
/** Whether the model is loaded and ready for inference */
|
|
1769
|
-
readonly isLoaded: boolean;
|
|
1770
|
-
/** Current backend type ('wasm' for worker, full backend for main thread, null if not loaded) */
|
|
1771
|
-
readonly backend: 'wasm' | 'webgpu' | null;
|
|
1772
|
-
/**
|
|
1773
|
-
* Load the ONNX model
|
|
1774
|
-
* @param onProgress - Optional progress callback (fires once at 100% for worker)
|
|
1775
|
-
* @returns Model loading information
|
|
1776
|
-
*/
|
|
1777
|
-
load(onProgress?: (loaded: number, total: number) => void): Promise<SenseVoiceModelInfo>;
|
|
1778
|
-
/**
|
|
1779
|
-
* Transcribe audio samples to text
|
|
1780
|
-
* @param audioSamples - Float32Array of audio samples at 16kHz
|
|
1781
|
-
* @returns Transcription result
|
|
1782
|
-
*/
|
|
1783
|
-
transcribe(audioSamples: Float32Array): Promise<SenseVoiceResult>;
|
|
1784
|
-
/**
|
|
1785
|
-
* Dispose of the model and free resources
|
|
1786
|
-
*/
|
|
1787
|
-
dispose(): Promise<void>;
|
|
1788
|
-
}
|
|
1789
|
-
/**
|
|
1790
|
-
* Configuration for the SenseVoice factory
|
|
1791
|
-
*/
|
|
1792
|
-
interface CreateSenseVoiceConfig extends InferenceFactoryConfig {
|
|
1793
|
-
/** Path or URL to model.int8.onnx (239MB). Default: HuggingFace CDN */
|
|
1794
|
-
modelUrl?: string;
|
|
1795
|
-
/** Path or URL to tokens.txt vocabulary file (default: sibling of modelUrl) */
|
|
1796
|
-
tokensUrl?: string;
|
|
1797
|
-
/** Language hint (default: 'auto') */
|
|
1798
|
-
language?: SenseVoiceLanguage;
|
|
1799
|
-
/** Text normalization (default: 'with_itn') */
|
|
1800
|
-
textNorm?: 'with_itn' | 'without_itn';
|
|
1801
|
-
}
|
|
1802
|
-
/**
|
|
1803
|
-
* Create a SenseVoice ASR instance with automatic implementation selection
|
|
1804
|
-
*
|
|
1805
|
-
* @param config - Factory configuration
|
|
1806
|
-
* @returns A SenseVoiceBackend instance (either Worker or main thread)
|
|
1807
|
-
*/
|
|
1808
|
-
declare function createSenseVoice(config?: CreateSenseVoiceConfig): SenseVoiceBackend;
|
|
1809
|
-
|
|
1810
|
-
/**
|
|
1811
|
-
* Factory function for Silero VAD with automatic Worker vs main thread selection
|
|
1812
|
-
*
|
|
1813
|
-
* Provides a unified API that automatically selects the optimal implementation:
|
|
1814
|
-
* - Desktop browsers: Uses SileroVADWorker (off-main-thread inference)
|
|
1815
|
-
* - Mobile devices: Uses SileroVADInference (main thread, avoids memory overhead)
|
|
1816
|
-
* - Fallback: Gracefully falls back to main thread if Worker fails
|
|
1817
|
-
*
|
|
1818
|
-
* @category Inference
|
|
1819
|
-
*
|
|
1820
|
-
* @example Basic usage (auto-detect)
|
|
1821
|
-
* ```typescript
|
|
1822
|
-
* import { createSileroVAD } from '@omote/core';
|
|
1823
|
-
*
|
|
1824
|
-
* const vad = createSileroVAD({
|
|
1825
|
-
* modelUrl: '/models/silero-vad.onnx',
|
|
1826
|
-
* threshold: 0.5,
|
|
1827
|
-
* });
|
|
1828
|
-
*
|
|
1829
|
-
* await vad.load();
|
|
1830
|
-
* const result = await vad.process(audioChunk);
|
|
1831
|
-
* if (result.isSpeech) {
|
|
1832
|
-
* console.log('Speech detected!', result.probability);
|
|
1833
|
-
* }
|
|
1834
|
-
* ```
|
|
1835
|
-
*
|
|
1836
|
-
* @example Force worker usage
|
|
1837
|
-
* ```typescript
|
|
1838
|
-
* const vad = createSileroVAD({
|
|
1839
|
-
* modelUrl: '/models/silero-vad.onnx',
|
|
1840
|
-
* useWorker: true, // Force Worker even on mobile
|
|
1841
|
-
* });
|
|
1842
|
-
* ```
|
|
1843
|
-
*
|
|
1844
|
-
* @example Force main thread
|
|
1845
|
-
* ```typescript
|
|
1846
|
-
* const vad = createSileroVAD({
|
|
1847
|
-
* modelUrl: '/models/silero-vad.onnx',
|
|
1848
|
-
* useWorker: false, // Force main thread
|
|
1849
|
-
* });
|
|
1850
|
-
* ```
|
|
1851
|
-
*/
|
|
1852
|
-
|
|
1853
|
-
/**
|
|
1854
|
-
* Common interface for both SileroVADInference and SileroVADWorker
|
|
1855
|
-
*
|
|
1856
|
-
* This interface defines the shared API that both implementations provide,
|
|
1857
|
-
* allowing consumers to use either interchangeably.
|
|
1858
|
-
*/
|
|
1859
|
-
interface SileroVADBackend {
|
|
1860
|
-
/** Current backend type (webgpu, wasm, or null if not loaded) */
|
|
1861
|
-
readonly backend: RuntimeBackend | null;
|
|
1862
|
-
/** Whether the model is loaded and ready for inference */
|
|
1863
|
-
readonly isLoaded: boolean;
|
|
1864
|
-
/** Audio sample rate (8000 or 16000 Hz) */
|
|
1865
|
-
readonly sampleRate: number;
|
|
1866
|
-
/** Speech detection threshold (0-1) */
|
|
1867
|
-
readonly threshold: number;
|
|
1868
|
-
/**
|
|
1869
|
-
* Load the ONNX model
|
|
1870
|
-
* @returns Model loading information
|
|
1871
|
-
*/
|
|
1872
|
-
load(): Promise<VADModelInfo | VADWorkerModelInfo>;
|
|
1873
|
-
/**
|
|
1874
|
-
* Process a single audio chunk
|
|
1875
|
-
* @param audioChunk - Float32Array of exactly chunkSize samples
|
|
1876
|
-
* @returns VAD result with speech probability
|
|
1877
|
-
*/
|
|
1878
|
-
process(audioChunk: Float32Array): Promise<VADResult>;
|
|
1879
|
-
/**
|
|
1880
|
-
* Reset state for new audio stream
|
|
1881
|
-
*/
|
|
1882
|
-
reset(): void | Promise<void>;
|
|
1883
|
-
/**
|
|
1884
|
-
* Dispose of the model and free resources
|
|
1885
|
-
*/
|
|
1886
|
-
dispose(): Promise<void>;
|
|
1887
|
-
/**
|
|
1888
|
-
* Get required chunk size in samples
|
|
1889
|
-
*/
|
|
1890
|
-
getChunkSize(): number;
|
|
1891
|
-
/**
|
|
1892
|
-
* Get chunk duration in milliseconds
|
|
1893
|
-
*/
|
|
1894
|
-
getChunkDurationMs(): number;
|
|
1895
|
-
}
|
|
1896
|
-
/**
|
|
1897
|
-
* Configuration for the Silero VAD factory
|
|
1898
|
-
*
|
|
1899
|
-
* Extends SileroVADConfig with worker-specific options.
|
|
1900
|
-
*/
|
|
1901
|
-
interface SileroVADFactoryConfig extends Omit<SileroVADConfig, 'modelUrl'>, InferenceFactoryConfig {
|
|
1902
|
-
/** Path or URL to the ONNX model. Default: HuggingFace CDN */
|
|
1903
|
-
modelUrl?: string;
|
|
1904
|
-
/**
|
|
1905
|
-
* Fallback to main thread on worker errors.
|
|
1906
|
-
*
|
|
1907
|
-
* When true (default), if the Worker fails to load or encounters an error,
|
|
1908
|
-
* the factory will automatically create a main thread instance instead.
|
|
1909
|
-
*
|
|
1910
|
-
* When false, worker errors will propagate as exceptions.
|
|
1911
|
-
*
|
|
1912
|
-
* Default: true
|
|
1913
|
-
*/
|
|
1914
|
-
fallbackOnError?: boolean;
|
|
1915
|
-
}
|
|
1916
|
-
/**
|
|
1917
|
-
* Check if the current environment supports VAD Web Workers
|
|
1918
|
-
*
|
|
1919
|
-
* Requirements:
|
|
1920
|
-
* - Worker constructor must exist
|
|
1921
|
-
* - Blob URL support (for inline worker script)
|
|
1922
|
-
*
|
|
1923
|
-
* @returns true if VAD Worker is supported
|
|
1924
|
-
*/
|
|
1925
|
-
declare function supportsVADWorker(): boolean;
|
|
1926
|
-
/**
|
|
1927
|
-
* Create a Silero VAD instance with automatic implementation selection
|
|
1928
|
-
*
|
|
1929
|
-
* This factory function automatically selects between:
|
|
1930
|
-
* - **SileroVADWorker**: Off-main-thread inference (better for desktop)
|
|
1931
|
-
* - **SileroVADInference**: Main thread inference (better for mobile)
|
|
1932
|
-
*
|
|
1933
|
-
* The selection is based on:
|
|
1934
|
-
* 1. Explicit `useWorker` config (if provided)
|
|
1935
|
-
* 2. Platform detection (mobile vs desktop)
|
|
1936
|
-
* 3. Worker API availability
|
|
1937
|
-
*
|
|
1938
|
-
* Both implementations share the same interface (SileroVADBackend),
|
|
1939
|
-
* so consumers can use either interchangeably.
|
|
1940
|
-
*
|
|
1941
|
-
* @param config - Factory configuration
|
|
1942
|
-
* @returns A SileroVAD instance (either Worker or main thread)
|
|
1943
|
-
*
|
|
1944
|
-
* @example
|
|
1945
|
-
* ```typescript
|
|
1946
|
-
* // Auto-detect (recommended)
|
|
1947
|
-
* const vad = createSileroVAD({ modelUrl: '/models/silero-vad.onnx' });
|
|
1948
|
-
*
|
|
1949
|
-
* // Force Worker
|
|
1950
|
-
* const vadWorker = createSileroVAD({ modelUrl: '/models/silero-vad.onnx', useWorker: true });
|
|
1951
|
-
*
|
|
1952
|
-
* // Force main thread
|
|
1953
|
-
* const vadMain = createSileroVAD({ modelUrl: '/models/silero-vad.onnx', useWorker: false });
|
|
1954
|
-
* ```
|
|
1955
|
-
*/
|
|
1956
|
-
declare function createSileroVAD(config?: SileroVADFactoryConfig): SileroVADBackend;
|
|
1957
|
-
|
|
1958
|
-
/**
|
|
1959
|
-
* SpeechListener — Standalone listening primitive.
|
|
1960
|
-
*
|
|
1961
|
-
* Composes: MicrophoneCapture → SileroVAD → SenseVoice ASR → transcript events.
|
|
1962
|
-
* Extracted from VoicePipeline's listening half so it can be used independently.
|
|
1963
|
-
*
|
|
1964
|
-
* Does NOT handle TTS, LAM, or response routing — those belong to TTSSpeaker
|
|
1965
|
-
* and VoicePipeline respectively.
|
|
1966
|
-
*
|
|
1967
|
-
* @category Audio
|
|
1539
|
+
* @category Audio
|
|
1968
1540
|
*/
|
|
1969
1541
|
|
|
1970
1542
|
interface SpeechListenerConfig {
|
|
@@ -1981,6 +1553,7 @@ interface SpeechListenerConfig {
|
|
|
1981
1553
|
modelUrl: string;
|
|
1982
1554
|
tokensUrl?: string;
|
|
1983
1555
|
language?: string;
|
|
1556
|
+
textNorm?: 'with_itn' | 'without_itn';
|
|
1984
1557
|
};
|
|
1985
1558
|
vad: {
|
|
1986
1559
|
modelUrl: string;
|
|
@@ -2035,6 +1608,7 @@ declare class SpeechListener extends EventEmitter<SpeechListenerEvents> {
|
|
|
2035
1608
|
private asr;
|
|
2036
1609
|
private vad;
|
|
2037
1610
|
private ownedWorker;
|
|
1611
|
+
private usesSharedWorker;
|
|
2038
1612
|
private mic;
|
|
2039
1613
|
private omoteEvents;
|
|
2040
1614
|
private _unsubChunk;
|
|
@@ -2164,114 +1738,48 @@ declare class InterruptionHandler extends EventEmitter<InterruptionEvents> {
|
|
|
2164
1738
|
}
|
|
2165
1739
|
|
|
2166
1740
|
/**
|
|
2167
|
-
* SenseVoice ASR
|
|
2168
|
-
*
|
|
2169
|
-
* Runs SenseVoice speech recognition in a dedicated Web Worker to prevent
|
|
2170
|
-
* main thread blocking. Uses inline worker script (Blob URL pattern) to
|
|
2171
|
-
* avoid separate file deployment.
|
|
2172
|
-
*
|
|
2173
|
-
* Key design decisions:
|
|
2174
|
-
* - WASM backend only (WebGPU doesn't work in Workers)
|
|
2175
|
-
* - All preprocessing (fbank, LFR, CMVN) and CTC decoding inlined in worker
|
|
2176
|
-
* - Audio copied (not transferred) to retain main thread access
|
|
2177
|
-
* - ONNX Runtime loaded from CDN in worker (no bundler complications)
|
|
2178
|
-
* - iOS: model URL passed as string to ORT (avoids 239MB JS heap allocation)
|
|
1741
|
+
* Factory function for SenseVoice ASR via UnifiedInferenceWorker
|
|
2179
1742
|
*
|
|
2180
1743
|
* @category Inference
|
|
2181
1744
|
*
|
|
2182
|
-
* @example
|
|
1745
|
+
* @example
|
|
2183
1746
|
* ```typescript
|
|
2184
|
-
* import {
|
|
1747
|
+
* import { createSenseVoice, UnifiedInferenceWorker } from '@omote/core';
|
|
2185
1748
|
*
|
|
2186
|
-
* const
|
|
1749
|
+
* const worker = new UnifiedInferenceWorker();
|
|
1750
|
+
* await worker.init();
|
|
1751
|
+
*
|
|
1752
|
+
* const asr = createSenseVoice({
|
|
2187
1753
|
* modelUrl: '/models/sensevoice/model.int8.onnx',
|
|
2188
|
-
*
|
|
1754
|
+
* unifiedWorker: worker,
|
|
2189
1755
|
* });
|
|
2190
1756
|
* await asr.load();
|
|
2191
|
-
*
|
|
2192
|
-
* const { text, emotion, language } = await asr.transcribe(audioSamples);
|
|
2193
|
-
* console.log(text); // "Hello world"
|
|
2194
|
-
* console.log(emotion); // "NEUTRAL"
|
|
2195
|
-
* console.log(language); // "en"
|
|
1757
|
+
* const { text, emotion } = await asr.transcribe(audioSamples);
|
|
2196
1758
|
* ```
|
|
2197
1759
|
*/
|
|
2198
1760
|
|
|
2199
1761
|
/**
|
|
2200
|
-
* Configuration for SenseVoice
|
|
1762
|
+
* Configuration for the SenseVoice factory
|
|
2201
1763
|
*/
|
|
2202
|
-
interface
|
|
2203
|
-
/** Path or URL to model.int8.onnx (239MB) */
|
|
2204
|
-
modelUrl
|
|
1764
|
+
interface CreateSenseVoiceConfig extends InferenceFactoryConfig {
|
|
1765
|
+
/** Path or URL to model.int8.onnx (239MB). Default: HuggingFace CDN */
|
|
1766
|
+
modelUrl?: string;
|
|
2205
1767
|
/** Path or URL to tokens.txt vocabulary file (default: sibling of modelUrl) */
|
|
2206
1768
|
tokensUrl?: string;
|
|
2207
|
-
/** Language hint (default: 'auto'
|
|
1769
|
+
/** Language hint (default: 'auto') */
|
|
2208
1770
|
language?: SenseVoiceLanguage;
|
|
2209
|
-
/** Text normalization
|
|
1771
|
+
/** Text normalization (default: 'with_itn') */
|
|
2210
1772
|
textNorm?: 'with_itn' | 'without_itn';
|
|
2211
1773
|
}
|
|
2212
1774
|
/**
|
|
2213
|
-
* SenseVoice ASR
|
|
1775
|
+
* Create a SenseVoice ASR instance via the unified worker.
|
|
2214
1776
|
*
|
|
2215
|
-
*
|
|
2216
|
-
* All preprocessing (fbank, LFR, CMVN) and CTC decoding run in the worker.
|
|
1777
|
+
* If no `unifiedWorker` is provided, a dedicated worker is created on load().
|
|
2217
1778
|
*
|
|
2218
|
-
* @
|
|
1779
|
+
* @param config - Factory configuration
|
|
1780
|
+
* @returns A SenseVoiceBackend instance
|
|
2219
1781
|
*/
|
|
2220
|
-
declare
|
|
2221
|
-
private worker;
|
|
2222
|
-
private config;
|
|
2223
|
-
private isLoading;
|
|
2224
|
-
private _isLoaded;
|
|
2225
|
-
private inferenceQueue;
|
|
2226
|
-
private poisoned;
|
|
2227
|
-
private pendingResolvers;
|
|
2228
|
-
private languageId;
|
|
2229
|
-
private textNormId;
|
|
2230
|
-
constructor(config: SenseVoiceWorkerConfig);
|
|
2231
|
-
get isLoaded(): boolean;
|
|
2232
|
-
/**
|
|
2233
|
-
* Backend type (always 'wasm' for Worker, WebGPU not supported in Workers)
|
|
2234
|
-
*/
|
|
2235
|
-
get backend(): 'wasm' | null;
|
|
2236
|
-
/**
|
|
2237
|
-
* Create the worker from inline script
|
|
2238
|
-
*/
|
|
2239
|
-
private createWorker;
|
|
2240
|
-
/**
|
|
2241
|
-
* Handle messages from worker
|
|
2242
|
-
*/
|
|
2243
|
-
private handleWorkerMessage;
|
|
2244
|
-
/**
|
|
2245
|
-
* Send message to worker and wait for response
|
|
2246
|
-
*/
|
|
2247
|
-
private sendMessage;
|
|
2248
|
-
/**
|
|
2249
|
-
* Load the ONNX model in the worker
|
|
2250
|
-
*
|
|
2251
|
-
* @param onProgress - Optional progress callback. Fires once at 100% when load completes
|
|
2252
|
-
* (the worker downloads and loads the model internally, so granular progress is not available).
|
|
2253
|
-
*/
|
|
2254
|
-
load(onProgress?: (loaded: number, total: number) => void): Promise<SenseVoiceModelInfo>;
|
|
2255
|
-
/**
|
|
2256
|
-
* Transcribe audio samples to text
|
|
2257
|
-
*
|
|
2258
|
-
* @param audioSamples Float32Array of audio samples at 16kHz, [-1, 1] range
|
|
2259
|
-
* @returns Transcription result with text, emotion, language, and event
|
|
2260
|
-
*/
|
|
2261
|
-
transcribe(audioSamples: Float32Array): Promise<SenseVoiceResult>;
|
|
2262
|
-
/**
|
|
2263
|
-
* Queue inference to serialize worker calls
|
|
2264
|
-
*/
|
|
2265
|
-
private queueInference;
|
|
2266
|
-
/**
|
|
2267
|
-
* Dispose of the worker and free resources
|
|
2268
|
-
*/
|
|
2269
|
-
dispose(): Promise<void>;
|
|
2270
|
-
/**
|
|
2271
|
-
* Check if Web Workers are supported
|
|
2272
|
-
*/
|
|
2273
|
-
static isSupported(): boolean;
|
|
2274
|
-
}
|
|
1782
|
+
declare function createSenseVoice(config?: CreateSenseVoiceConfig): SenseVoiceBackend;
|
|
2275
1783
|
|
|
2276
1784
|
/**
|
|
2277
1785
|
* Shared blendshape constants and utilities for lip sync inference
|
|
@@ -2305,100 +1813,6 @@ declare const LAM_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browI
|
|
|
2305
1813
|
*/
|
|
2306
1814
|
declare function lerpBlendshapes(current: Float32Array | number[], target: Float32Array | number[], factor?: number): number[];
|
|
2307
1815
|
|
|
2308
|
-
/**
|
|
2309
|
-
* A2E inference engine for Audio-to-Expression (LAM model)
|
|
2310
|
-
*
|
|
2311
|
-
* Runs entirely in the browser using WebGPU or WASM.
|
|
2312
|
-
* Takes raw 16kHz audio and outputs 52 ARKit blendshapes for lip sync.
|
|
2313
|
-
* Uses the LAM (Large Animation Model) — see {@link A2EBackend} for the interface.
|
|
2314
|
-
*
|
|
2315
|
-
* @see {@link createA2E} for the recommended zero-config factory
|
|
2316
|
-
* @see {@link A2EBackend} for the common interface
|
|
2317
|
-
* @category Inference
|
|
2318
|
-
*
|
|
2319
|
-
* @example Basic usage
|
|
2320
|
-
* ```typescript
|
|
2321
|
-
* import { A2EInference } from '@omote/core';
|
|
2322
|
-
*
|
|
2323
|
-
* const a2e = new A2EInference({ modelUrl: '/models/lam.onnx' });
|
|
2324
|
-
* await a2e.load();
|
|
2325
|
-
*
|
|
2326
|
-
* // Process 1 second of audio (16kHz = 16000 samples)
|
|
2327
|
-
* const result = await a2e.infer(audioSamples);
|
|
2328
|
-
* console.log('Blendshapes:', result.blendshapes); // [30, 52] for 30fps
|
|
2329
|
-
* ```
|
|
2330
|
-
*/
|
|
2331
|
-
|
|
2332
|
-
interface A2EInferenceConfig {
|
|
2333
|
-
/** Path or URL to the ONNX model */
|
|
2334
|
-
modelUrl: string;
|
|
2335
|
-
/**
|
|
2336
|
-
* Path or URL to external model data file (.onnx.data weights).
|
|
2337
|
-
* Default: `${modelUrl}.data` (e.g., /models/model.onnx.data)
|
|
2338
|
-
*
|
|
2339
|
-
* Set to `false` to skip external data loading (single-file models only).
|
|
2340
|
-
*/
|
|
2341
|
-
externalDataUrl?: string | false;
|
|
2342
|
-
/** Preferred backend (auto will try WebGPU first, fallback to WASM) */
|
|
2343
|
-
backend?: BackendPreference;
|
|
2344
|
-
/** Number of identity classes (default: 12 for streaming model) */
|
|
2345
|
-
numIdentityClasses?: number;
|
|
2346
|
-
/**
|
|
2347
|
-
* Number of audio samples per inference chunk (default: 16000).
|
|
2348
|
-
* Model supports variable chunk sizes. Smaller chunks = lower latency,
|
|
2349
|
-
* more inference overhead. 8000 (500ms) is recommended for real-time lip sync.
|
|
2350
|
-
*/
|
|
2351
|
-
chunkSize?: number;
|
|
2352
|
-
}
|
|
2353
|
-
|
|
2354
|
-
declare class A2EInference implements A2EBackend {
|
|
2355
|
-
readonly modelId: "a2e";
|
|
2356
|
-
private session;
|
|
2357
|
-
private ort;
|
|
2358
|
-
private config;
|
|
2359
|
-
private _backend;
|
|
2360
|
-
private isLoading;
|
|
2361
|
-
private numIdentityClasses;
|
|
2362
|
-
readonly chunkSize: number;
|
|
2363
|
-
private inferenceQueue;
|
|
2364
|
-
private poisoned;
|
|
2365
|
-
private static readonly INFERENCE_TIMEOUT_MS;
|
|
2366
|
-
constructor(config: A2EInferenceConfig);
|
|
2367
|
-
/**
|
|
2368
|
-
* Check if WebGPU is available and working
|
|
2369
|
-
* (iOS returns false even if navigator.gpu exists due to ONNX Runtime bugs)
|
|
2370
|
-
*/
|
|
2371
|
-
static isWebGPUAvailable: typeof isWebGPUAvailable;
|
|
2372
|
-
get backend(): 'webgpu' | 'wasm' | null;
|
|
2373
|
-
get isLoaded(): boolean;
|
|
2374
|
-
/** True if inference timed out and the session is permanently unusable */
|
|
2375
|
-
get isSessionPoisoned(): boolean;
|
|
2376
|
-
/**
|
|
2377
|
-
* Load the ONNX model
|
|
2378
|
-
*/
|
|
2379
|
-
load(): Promise<A2EModelInfo>;
|
|
2380
|
-
/**
|
|
2381
|
-
* Run inference on raw audio
|
|
2382
|
-
* @param audioSamples - Float32Array of raw audio at 16kHz
|
|
2383
|
-
* @param identityIndex - Optional identity index (0-11, default 0 = neutral)
|
|
2384
|
-
*
|
|
2385
|
-
* Audio will be zero-padded or truncated to chunkSize samples.
|
|
2386
|
-
*/
|
|
2387
|
-
infer(audioSamples: Float32Array, identityIndex?: number): Promise<A2EResult>;
|
|
2388
|
-
/**
|
|
2389
|
-
* Queue inference to serialize ONNX session calls
|
|
2390
|
-
*/
|
|
2391
|
-
private queueInference;
|
|
2392
|
-
/**
|
|
2393
|
-
* Get blendshape value by name for a specific frame
|
|
2394
|
-
*/
|
|
2395
|
-
getBlendshape(blendshapes: Float32Array, name: typeof LAM_BLENDSHAPES[number]): number;
|
|
2396
|
-
/**
|
|
2397
|
-
* Dispose of the model and free resources
|
|
2398
|
-
*/
|
|
2399
|
-
dispose(): Promise<void>;
|
|
2400
|
-
}
|
|
2401
|
-
|
|
2402
1816
|
/**
|
|
2403
1817
|
* Default and user-configurable model URLs for all ONNX models
|
|
2404
1818
|
*
|
|
@@ -2434,7 +1848,7 @@ type ModelUrlKey = 'lam' | 'senseVoice' | 'sileroVad' | 'kokoroTTS' | 'kokoroVoi
|
|
|
2434
1848
|
* Resolved model URLs — user overrides take priority, HuggingFace CDN is fallback.
|
|
2435
1849
|
*
|
|
2436
1850
|
* All SDK factories (`createA2E`, `createSenseVoice`, `createSileroVAD`) and
|
|
2437
|
-
* orchestrators (`
|
|
1851
|
+
* orchestrators (`VoiceOrchestrator`) read from this object. Call
|
|
2438
1852
|
* {@link configureModelUrls} before constructing any pipelines to point
|
|
2439
1853
|
* models at your own CDN.
|
|
2440
1854
|
*/
|
|
@@ -2704,6 +2118,44 @@ declare class BlendshapeSmoother {
|
|
|
2704
2118
|
reset(): void;
|
|
2705
2119
|
}
|
|
2706
2120
|
|
|
2121
|
+
/**
|
|
2122
|
+
* Factory function for Silero VAD via UnifiedInferenceWorker
|
|
2123
|
+
*
|
|
2124
|
+
* @category Inference
|
|
2125
|
+
*
|
|
2126
|
+
* @example
|
|
2127
|
+
* ```typescript
|
|
2128
|
+
* import { createSileroVAD, UnifiedInferenceWorker } from '@omote/core';
|
|
2129
|
+
*
|
|
2130
|
+
* const worker = new UnifiedInferenceWorker();
|
|
2131
|
+
* await worker.init();
|
|
2132
|
+
*
|
|
2133
|
+
* const vad = createSileroVAD({
|
|
2134
|
+
* modelUrl: '/models/silero-vad.onnx',
|
|
2135
|
+
* unifiedWorker: worker,
|
|
2136
|
+
* });
|
|
2137
|
+
* await vad.load();
|
|
2138
|
+
* const result = await vad.process(audioChunk);
|
|
2139
|
+
* ```
|
|
2140
|
+
*/
|
|
2141
|
+
|
|
2142
|
+
/**
|
|
2143
|
+
* Configuration for the Silero VAD factory
|
|
2144
|
+
*/
|
|
2145
|
+
interface SileroVADFactoryConfig extends Omit<SileroVADConfig, 'modelUrl'>, InferenceFactoryConfig {
|
|
2146
|
+
/** Path or URL to the ONNX model. Default: HuggingFace CDN */
|
|
2147
|
+
modelUrl?: string;
|
|
2148
|
+
}
|
|
2149
|
+
/**
|
|
2150
|
+
* Create a Silero VAD instance via the unified worker.
|
|
2151
|
+
*
|
|
2152
|
+
* If no `unifiedWorker` is provided, a dedicated worker is created on load().
|
|
2153
|
+
*
|
|
2154
|
+
* @param config - Factory configuration
|
|
2155
|
+
* @returns A SileroVADBackend instance
|
|
2156
|
+
*/
|
|
2157
|
+
declare function createSileroVAD(config?: SileroVADFactoryConfig): SileroVADBackend;
|
|
2158
|
+
|
|
2707
2159
|
/**
|
|
2708
2160
|
* SenseVoice adapter backed by UnifiedInferenceWorker
|
|
2709
2161
|
*
|
|
@@ -2762,34 +2214,9 @@ declare class A2EUnifiedAdapter implements A2EBackend {
|
|
|
2762
2214
|
}
|
|
2763
2215
|
|
|
2764
2216
|
/**
|
|
2765
|
-
* Kokoro TTS
|
|
2766
|
-
*
|
|
2767
|
-
* Pure ONNX pipeline for browser-based text-to-speech. No transformers.js dependency.
|
|
2768
|
-
* Uses eSpeak-NG WASM for phonemization and Kokoro-82M (q8, 92MB) for synthesis.
|
|
2769
|
-
*
|
|
2770
|
-
* Pipeline: Text → Normalize → Phonemize (eSpeak WASM) → Tokenize → Voice Style → ONNX → Audio
|
|
2217
|
+
* Kokoro TTS type definitions
|
|
2771
2218
|
*
|
|
2772
2219
|
* @category Inference
|
|
2773
|
-
*
|
|
2774
|
-
* @example Basic usage
|
|
2775
|
-
* ```typescript
|
|
2776
|
-
* import { KokoroTTSInference } from '@omote/core';
|
|
2777
|
-
*
|
|
2778
|
-
* const tts = new KokoroTTSInference({ defaultVoice: 'af_heart' });
|
|
2779
|
-
* await tts.load();
|
|
2780
|
-
*
|
|
2781
|
-
* const { audio, duration } = await tts.synthesize("Hello world");
|
|
2782
|
-
* // audio: Float32Array @ 24kHz
|
|
2783
|
-
* ```
|
|
2784
|
-
*
|
|
2785
|
-
* @example Streaming (sentence-by-sentence)
|
|
2786
|
-
* ```typescript
|
|
2787
|
-
* for await (const chunk of tts.stream("First sentence. Second sentence.")) {
|
|
2788
|
-
* playbackPipeline.feedBuffer(chunk.audio);
|
|
2789
|
-
* }
|
|
2790
|
-
* ```
|
|
2791
|
-
*
|
|
2792
|
-
* @module inference/KokoroTTSInference
|
|
2793
2220
|
*/
|
|
2794
2221
|
|
|
2795
2222
|
interface KokoroTTSConfig {
|
|
@@ -2803,6 +2230,8 @@ interface KokoroTTSConfig {
|
|
|
2803
2230
|
backend?: BackendPreference;
|
|
2804
2231
|
/** Speech speed multiplier (default: 1.0) */
|
|
2805
2232
|
speed?: number;
|
|
2233
|
+
/** Eagerly load phonemizer + default voice during load() instead of first speak(). Default: true. */
|
|
2234
|
+
eagerLoad?: boolean;
|
|
2806
2235
|
}
|
|
2807
2236
|
interface KokoroTTSResult {
|
|
2808
2237
|
/** Audio samples at 24kHz */
|
|
@@ -2841,67 +2270,6 @@ interface SynthesizeOptions {
|
|
|
2841
2270
|
* Returns trimmed text on success, throws on invalid input.
|
|
2842
2271
|
*/
|
|
2843
2272
|
declare function validateTTSInput(text: unknown, voiceName: string, speed: number, availableVoices?: string[]): string;
|
|
2844
|
-
declare class KokoroTTSInference implements TTSBackend {
|
|
2845
|
-
private readonly config;
|
|
2846
|
-
private readonly modelUrl;
|
|
2847
|
-
private readonly voiceBaseUrl;
|
|
2848
|
-
private ort;
|
|
2849
|
-
private session;
|
|
2850
|
-
private _backend;
|
|
2851
|
-
private isLoading;
|
|
2852
|
-
private poisoned;
|
|
2853
|
-
private inferenceQueue;
|
|
2854
|
-
private phonemizerReady;
|
|
2855
|
-
private defaultVoiceLoaded;
|
|
2856
|
-
/** Cached voice data (voice name → Float32Array) */
|
|
2857
|
-
private loadedVoices;
|
|
2858
|
-
constructor(config?: KokoroTTSConfig);
|
|
2859
|
-
get isLoaded(): boolean;
|
|
2860
|
-
get sampleRate(): number;
|
|
2861
|
-
/**
|
|
2862
|
-
* Load the ONNX model, phonemizer WASM, and default voice.
|
|
2863
|
-
* Safe to call multiple times (no-ops after first successful load).
|
|
2864
|
-
*/
|
|
2865
|
-
load(): Promise<KokoroTTSModelInfo>;
|
|
2866
|
-
/**
|
|
2867
|
-
* Lazily initialize phonemizer and default voice on first use.
|
|
2868
|
-
* Moves 100-200ms of main-thread blocking out of load() into first synthesis.
|
|
2869
|
-
*/
|
|
2870
|
-
private ensureReady;
|
|
2871
|
-
/**
|
|
2872
|
-
* Synthesize speech from text (one-shot, full audio output).
|
|
2873
|
-
*
|
|
2874
|
-
* @param text - Input text to synthesize
|
|
2875
|
-
* @param options - Voice and speed overrides
|
|
2876
|
-
* @returns Audio Float32Array at 24kHz with duration
|
|
2877
|
-
*/
|
|
2878
|
-
synthesize(text: string, options?: SynthesizeOptions): Promise<KokoroTTSResult>;
|
|
2879
|
-
/**
|
|
2880
|
-
* Stream synthesis sentence-by-sentence (async generator).
|
|
2881
|
-
* Splits text on sentence boundaries and yields audio for each.
|
|
2882
|
-
*
|
|
2883
|
-
* Compatible with both `SynthesizeOptions` (legacy) and `TTSStreamOptions` (TTSBackend).
|
|
2884
|
-
*
|
|
2885
|
-
* @param text - Input text (can be multiple sentences)
|
|
2886
|
-
* @param options - Voice, speed, and abort signal overrides
|
|
2887
|
-
*/
|
|
2888
|
-
stream(text: string, options?: SynthesizeOptions & TTSStreamOptions): AsyncGenerator<KokoroStreamChunk & TTSChunk>;
|
|
2889
|
-
/**
|
|
2890
|
-
* Preload a voice (fetches and caches the .bin file).
|
|
2891
|
-
*/
|
|
2892
|
-
preloadVoice(voiceName: string): Promise<void>;
|
|
2893
|
-
/**
|
|
2894
|
-
* List available voice names.
|
|
2895
|
-
*/
|
|
2896
|
-
listVoices(): string[];
|
|
2897
|
-
/**
|
|
2898
|
-
* Release the ONNX session and clear cached voices.
|
|
2899
|
-
*/
|
|
2900
|
-
dispose(): Promise<void>;
|
|
2901
|
-
private ensureVoice;
|
|
2902
|
-
private queueInference;
|
|
2903
|
-
private runInference;
|
|
2904
|
-
}
|
|
2905
2273
|
|
|
2906
2274
|
/**
|
|
2907
2275
|
* Kokoro TTS adapter backed by UnifiedInferenceWorker
|
|
@@ -2917,6 +2285,7 @@ declare class KokoroTTSUnifiedAdapter implements TTSBackend {
|
|
|
2917
2285
|
private readonly modelUrl;
|
|
2918
2286
|
private readonly voiceBaseUrl;
|
|
2919
2287
|
private _isLoaded;
|
|
2288
|
+
private _backend;
|
|
2920
2289
|
private loadedGeneration;
|
|
2921
2290
|
/** Per-adapter inference queue — ensures sequential state updates. */
|
|
2922
2291
|
private inferenceQueue;
|
|
@@ -3138,148 +2507,61 @@ declare class SafariSpeechRecognition {
|
|
|
3138
2507
|
/**
|
|
3139
2508
|
* Remove an error callback
|
|
3140
2509
|
*/
|
|
3141
|
-
offError(callback: SpeechErrorCallback): void;
|
|
3142
|
-
/**
|
|
3143
|
-
* Start listening for speech
|
|
3144
|
-
*
|
|
3145
|
-
* On iOS Safari, this will trigger the microphone permission prompt
|
|
3146
|
-
* if not already granted.
|
|
3147
|
-
*/
|
|
3148
|
-
start(): Promise<void>;
|
|
3149
|
-
/**
|
|
3150
|
-
* Stop listening and return the final transcript
|
|
3151
|
-
*/
|
|
3152
|
-
stop(): Promise<SpeechRecognitionResult>;
|
|
3153
|
-
/**
|
|
3154
|
-
* Abort recognition without waiting for final result
|
|
3155
|
-
*/
|
|
3156
|
-
abort(): void;
|
|
3157
|
-
/**
|
|
3158
|
-
* NOT SUPPORTED: Transcribe audio buffer
|
|
3159
|
-
*
|
|
3160
|
-
* Safari Speech API does not support transcribing pre-recorded audio.
|
|
3161
|
-
* It only works with live microphone input.
|
|
3162
|
-
*
|
|
3163
|
-
* For batch transcription on iOS, use server-side Whisper or a cloud ASR service.
|
|
3164
|
-
*
|
|
3165
|
-
* @throws Error always - this method is not supported
|
|
3166
|
-
*/
|
|
3167
|
-
transcribe(_audio: Float32Array): Promise<SpeechRecognitionResult>;
|
|
3168
|
-
/**
|
|
3169
|
-
* Dispose of recognition resources
|
|
3170
|
-
*/
|
|
3171
|
-
dispose(): void;
|
|
3172
|
-
/**
|
|
3173
|
-
* Set up event handlers for the recognition instance
|
|
3174
|
-
*/
|
|
3175
|
-
private setupEventHandlers;
|
|
3176
|
-
/**
|
|
3177
|
-
* Emit result to all registered callbacks
|
|
3178
|
-
*/
|
|
3179
|
-
private emitResult;
|
|
3180
|
-
/**
|
|
3181
|
-
* Emit error to all registered callbacks
|
|
3182
|
-
*/
|
|
3183
|
-
private emitError;
|
|
3184
|
-
}
|
|
3185
|
-
|
|
3186
|
-
/**
|
|
3187
|
-
* Kokoro TTS Web Worker implementation
|
|
3188
|
-
*
|
|
3189
|
-
* Moves the heavy ONNX `session.run()` to a dedicated Web Worker to prevent
|
|
3190
|
-
* main thread blocking (~1-2s per sentence on WASM). Phonemizer, tokenizer,
|
|
3191
|
-
* and voice logic stay on the main thread (fast, <10ms combined).
|
|
3192
|
-
*
|
|
3193
|
-
* Architecture:
|
|
3194
|
-
* ```
|
|
3195
|
-
* Main Thread (KokoroTTSWorker): Worker (WORKER_SCRIPT):
|
|
3196
|
-
* stream(text) →
|
|
3197
|
-
* splitSentences(text)
|
|
3198
|
-
* for each sentence:
|
|
3199
|
-
* phonemize(sentence) → phonemes
|
|
3200
|
-
* tokenize(phonemes) → tokens
|
|
3201
|
-
* ensureVoice() → style
|
|
3202
|
-
* postMessage(tokens, style, speed) ──→ session.run(feeds)
|
|
3203
|
-
* await result ←── postMessage(audio)
|
|
3204
|
-
* yield {audio, text, phonemes, duration}
|
|
3205
|
-
* ```
|
|
3206
|
-
*
|
|
3207
|
-
* @category Inference
|
|
3208
|
-
*
|
|
3209
|
-
* @example Basic usage
|
|
3210
|
-
* ```typescript
|
|
3211
|
-
* import { KokoroTTSWorker } from '@omote/core';
|
|
3212
|
-
*
|
|
3213
|
-
* const tts = new KokoroTTSWorker({ defaultVoice: 'af_heart' });
|
|
3214
|
-
* await tts.load();
|
|
3215
|
-
*
|
|
3216
|
-
* for await (const chunk of tts.stream("Hello world!")) {
|
|
3217
|
-
* playbackPipeline.feedBuffer(chunk.audio);
|
|
3218
|
-
* }
|
|
3219
|
-
* ```
|
|
3220
|
-
*
|
|
3221
|
-
* @module inference/KokoroTTSWorker
|
|
3222
|
-
*/
|
|
3223
|
-
|
|
3224
|
-
/**
|
|
3225
|
-
* Kokoro TTS Worker — off-main-thread ONNX inference for non-blocking TTS.
|
|
3226
|
-
*
|
|
3227
|
-
* Phonemizer/tokenizer/voice logic run on the main thread (fast, <10ms).
|
|
3228
|
-
* Only the heavy ONNX `session.run()` is delegated to the worker.
|
|
3229
|
-
*
|
|
3230
|
-
* Implements the same TTSBackend interface as KokoroTTSInference.
|
|
3231
|
-
*
|
|
3232
|
-
* @see KokoroTTSInference for main-thread version
|
|
3233
|
-
*/
|
|
3234
|
-
declare class KokoroTTSWorker implements TTSBackend {
|
|
3235
|
-
private readonly config;
|
|
3236
|
-
private readonly modelUrl;
|
|
3237
|
-
private readonly voiceBaseUrl;
|
|
3238
|
-
private worker;
|
|
3239
|
-
private _isLoaded;
|
|
3240
|
-
private isLoading;
|
|
3241
|
-
private poisoned;
|
|
3242
|
-
/** Serializes all worker calls (stream sentence chunks + synthesize) */
|
|
3243
|
-
private inferenceQueue;
|
|
3244
|
-
/** Cached voice data (voice name → Float32Array) */
|
|
3245
|
-
private loadedVoices;
|
|
3246
|
-
/** Pending message handlers */
|
|
3247
|
-
private pendingResolvers;
|
|
3248
|
-
constructor(config?: KokoroTTSConfig);
|
|
3249
|
-
get isLoaded(): boolean;
|
|
3250
|
-
get sampleRate(): number;
|
|
3251
|
-
load(): Promise<KokoroTTSModelInfo>;
|
|
3252
|
-
synthesize(text: string, options?: SynthesizeOptions): Promise<KokoroTTSResult>;
|
|
3253
|
-
stream(text: string, options?: SynthesizeOptions & TTSStreamOptions): AsyncGenerator<KokoroStreamChunk & TTSChunk>;
|
|
3254
|
-
preloadVoice(voiceName: string): Promise<void>;
|
|
3255
|
-
listVoices(): string[];
|
|
3256
|
-
dispose(): Promise<void>;
|
|
3257
|
-
static isSupported(): boolean;
|
|
3258
|
-
private ensureVoice;
|
|
3259
|
-
private createWorker;
|
|
3260
|
-
private handleWorkerMessage;
|
|
3261
|
-
private sendMessage;
|
|
2510
|
+
offError(callback: SpeechErrorCallback): void;
|
|
3262
2511
|
/**
|
|
3263
|
-
*
|
|
3264
|
-
*
|
|
2512
|
+
* Start listening for speech
|
|
2513
|
+
*
|
|
2514
|
+
* On iOS Safari, this will trigger the microphone permission prompt
|
|
2515
|
+
* if not already granted.
|
|
3265
2516
|
*/
|
|
3266
|
-
|
|
2517
|
+
start(): Promise<void>;
|
|
2518
|
+
/**
|
|
2519
|
+
* Stop listening and return the final transcript
|
|
2520
|
+
*/
|
|
2521
|
+
stop(): Promise<SpeechRecognitionResult>;
|
|
2522
|
+
/**
|
|
2523
|
+
* Abort recognition without waiting for final result
|
|
2524
|
+
*/
|
|
2525
|
+
abort(): void;
|
|
2526
|
+
/**
|
|
2527
|
+
* NOT SUPPORTED: Transcribe audio buffer
|
|
2528
|
+
*
|
|
2529
|
+
* Safari Speech API does not support transcribing pre-recorded audio.
|
|
2530
|
+
* It only works with live microphone input.
|
|
2531
|
+
*
|
|
2532
|
+
* For batch transcription on iOS, use server-side Whisper or a cloud ASR service.
|
|
2533
|
+
*
|
|
2534
|
+
* @throws Error always - this method is not supported
|
|
2535
|
+
*/
|
|
2536
|
+
transcribe(_audio: Float32Array): Promise<SpeechRecognitionResult>;
|
|
2537
|
+
/**
|
|
2538
|
+
* Dispose of recognition resources
|
|
2539
|
+
*/
|
|
2540
|
+
dispose(): void;
|
|
2541
|
+
/**
|
|
2542
|
+
* Set up event handlers for the recognition instance
|
|
2543
|
+
*/
|
|
2544
|
+
private setupEventHandlers;
|
|
2545
|
+
/**
|
|
2546
|
+
* Emit result to all registered callbacks
|
|
2547
|
+
*/
|
|
2548
|
+
private emitResult;
|
|
3267
2549
|
/**
|
|
3268
|
-
*
|
|
2550
|
+
* Emit error to all registered callbacks
|
|
3269
2551
|
*/
|
|
3270
|
-
private
|
|
2552
|
+
private emitError;
|
|
3271
2553
|
}
|
|
3272
2554
|
|
|
3273
2555
|
/**
|
|
3274
|
-
* Factory function for Kokoro TTS
|
|
2556
|
+
* Factory function for Kokoro TTS via UnifiedInferenceWorker
|
|
3275
2557
|
*
|
|
3276
|
-
*
|
|
3277
|
-
*
|
|
3278
|
-
*
|
|
2558
|
+
* When called without a `unifiedWorker`, a dedicated worker is created
|
|
2559
|
+
* automatically on the first `load()` call. Pass a shared worker when using
|
|
2560
|
+
* VoiceOrchestrator or multiple models to avoid extra WASM instances.
|
|
3279
2561
|
*
|
|
3280
2562
|
* @category Inference
|
|
3281
2563
|
*
|
|
3282
|
-
* @example
|
|
2564
|
+
* @example Standalone (auto-creates worker)
|
|
3283
2565
|
* ```typescript
|
|
3284
2566
|
* import { createKokoroTTS } from '@omote/core';
|
|
3285
2567
|
*
|
|
@@ -3291,14 +2573,9 @@ declare class KokoroTTSWorker implements TTSBackend {
|
|
|
3291
2573
|
* }
|
|
3292
2574
|
* ```
|
|
3293
2575
|
*
|
|
3294
|
-
* @example
|
|
2576
|
+
* @example With shared worker
|
|
3295
2577
|
* ```typescript
|
|
3296
|
-
* const tts = createKokoroTTS({ defaultVoice: 'af_heart',
|
|
3297
|
-
* ```
|
|
3298
|
-
*
|
|
3299
|
-
* @example Force main thread
|
|
3300
|
-
* ```typescript
|
|
3301
|
-
* const tts = createKokoroTTS({ defaultVoice: 'af_heart', useWorker: false });
|
|
2578
|
+
* const tts = createKokoroTTS({ defaultVoice: 'af_heart', unifiedWorker: worker });
|
|
3302
2579
|
* ```
|
|
3303
2580
|
*/
|
|
3304
2581
|
|
|
@@ -3308,10 +2585,12 @@ declare class KokoroTTSWorker implements TTSBackend {
|
|
|
3308
2585
|
interface CreateKokoroTTSConfig extends KokoroTTSConfig, InferenceFactoryConfig {
|
|
3309
2586
|
}
|
|
3310
2587
|
/**
|
|
3311
|
-
* Create a Kokoro TTS instance
|
|
2588
|
+
* Create a Kokoro TTS instance via the unified worker.
|
|
2589
|
+
*
|
|
2590
|
+
* If no `unifiedWorker` is provided, a dedicated worker is created on load().
|
|
3312
2591
|
*
|
|
3313
2592
|
* @param config - Factory configuration
|
|
3314
|
-
* @returns A TTSBackend instance
|
|
2593
|
+
* @returns A TTSBackend instance
|
|
3315
2594
|
*/
|
|
3316
2595
|
declare function createKokoroTTS(config?: CreateKokoroTTSConfig): TTSBackend;
|
|
3317
2596
|
|
|
@@ -3360,7 +2639,7 @@ declare function listVoices(): string[];
|
|
|
3360
2639
|
* ElevenLabs TTS Backend — Cloud text-to-speech via ElevenLabs REST API.
|
|
3361
2640
|
*
|
|
3362
2641
|
* Implements the TTSBackend interface so it can be used anywhere Kokoro TTS is used
|
|
3363
|
-
* (TTSPlayback, TTSSpeaker,
|
|
2642
|
+
* (TTSPlayback, TTSSpeaker, VoiceOrchestrator, PlaybackPipeline, etc.)
|
|
3364
2643
|
*
|
|
3365
2644
|
* Zero external dependencies — uses fetch() directly.
|
|
3366
2645
|
*
|
|
@@ -3438,141 +2717,6 @@ declare class ElevenLabsTTSBackend implements TTSBackend {
|
|
|
3438
2717
|
private getHttpErrorMessage;
|
|
3439
2718
|
}
|
|
3440
2719
|
|
|
3441
|
-
/**
|
|
3442
|
-
* AWS Polly TTS Backend — Cloud text-to-speech via consumer-provided AWS SDK call.
|
|
3443
|
-
*
|
|
3444
|
-
* Implements the TTSBackend interface. Keeps @omote/core free of AWS SDK dependencies
|
|
3445
|
-
* by delegating the actual Polly API call to a consumer-provided function.
|
|
3446
|
-
*
|
|
3447
|
-
* @category Inference
|
|
3448
|
-
*
|
|
3449
|
-
* @example Basic usage with AWS SDK v3
|
|
3450
|
-
* ```typescript
|
|
3451
|
-
* import { PollyTTSBackend } from '@omote/core';
|
|
3452
|
-
* import { PollyClient, SynthesizeSpeechCommand } from '@aws-sdk/client-polly';
|
|
3453
|
-
*
|
|
3454
|
-
* const polly = new PollyClient({ region: 'us-east-1' });
|
|
3455
|
-
*
|
|
3456
|
-
* const tts = new PollyTTSBackend({
|
|
3457
|
-
* synthesizeFn: async (text, voice, sampleRate) => {
|
|
3458
|
-
* const cmd = new SynthesizeSpeechCommand({
|
|
3459
|
-
* Text: text,
|
|
3460
|
-
* VoiceId: voice,
|
|
3461
|
-
* Engine: 'neural',
|
|
3462
|
-
* OutputFormat: 'pcm',
|
|
3463
|
-
* SampleRate: String(sampleRate),
|
|
3464
|
-
* });
|
|
3465
|
-
* const result = await polly.send(cmd);
|
|
3466
|
-
* const stream = result.AudioStream;
|
|
3467
|
-
* // Convert stream to ArrayBuffer (Node or browser)
|
|
3468
|
-
* const chunks: Uint8Array[] = [];
|
|
3469
|
-
* for await (const chunk of stream as AsyncIterable<Uint8Array>) {
|
|
3470
|
-
* chunks.push(chunk);
|
|
3471
|
-
* }
|
|
3472
|
-
* const totalLength = chunks.reduce((sum, c) => sum + c.length, 0);
|
|
3473
|
-
* const merged = new Uint8Array(totalLength);
|
|
3474
|
-
* let offset = 0;
|
|
3475
|
-
* for (const chunk of chunks) {
|
|
3476
|
-
* merged.set(chunk, offset);
|
|
3477
|
-
* offset += chunk.length;
|
|
3478
|
-
* }
|
|
3479
|
-
* return {
|
|
3480
|
-
* audio: merged.buffer,
|
|
3481
|
-
* contentType: result.ContentType ?? 'audio/pcm',
|
|
3482
|
-
* };
|
|
3483
|
-
* },
|
|
3484
|
-
* });
|
|
3485
|
-
*
|
|
3486
|
-
* await tts.load();
|
|
3487
|
-
* for await (const chunk of tts.stream("Hello world!")) {
|
|
3488
|
-
* playbackPipeline.feedBuffer(chunk.audio);
|
|
3489
|
-
* }
|
|
3490
|
-
* ```
|
|
3491
|
-
*/
|
|
3492
|
-
|
|
3493
|
-
/**
|
|
3494
|
-
* Result from the consumer-provided synthesize function.
|
|
3495
|
-
*/
|
|
3496
|
-
interface PollySynthesizeResult {
|
|
3497
|
-
/** Raw PCM audio bytes (Int16 LE) */
|
|
3498
|
-
audio: ArrayBuffer;
|
|
3499
|
-
/** Content type from Polly response (e.g., 'audio/pcm') */
|
|
3500
|
-
contentType: string;
|
|
3501
|
-
}
|
|
3502
|
-
/**
|
|
3503
|
-
* Configuration for PollyTTSBackend.
|
|
3504
|
-
*
|
|
3505
|
-
* The `synthesizeFn` callback lets consumers use their own AWS SDK setup
|
|
3506
|
-
* (credentials, region, SDK version) without @omote/core depending on `@aws-sdk/client-polly`.
|
|
3507
|
-
*/
|
|
3508
|
-
interface PollyConfig {
|
|
3509
|
-
/**
|
|
3510
|
-
* Consumer-provided function that calls AWS Polly.
|
|
3511
|
-
* Must return PCM audio (Int16 LE) at the requested sample rate.
|
|
3512
|
-
*
|
|
3513
|
-
* @param text - Text to synthesize
|
|
3514
|
-
* @param voice - Polly voice ID (e.g., 'Joanna')
|
|
3515
|
-
* @param sampleRate - Requested output sample rate (e.g., 16000)
|
|
3516
|
-
* @returns PCM audio buffer and content type
|
|
3517
|
-
*/
|
|
3518
|
-
synthesizeFn: (text: string, voice: string, sampleRate: number) => Promise<PollySynthesizeResult>;
|
|
3519
|
-
/** Polly voice ID (default: 'Joanna') */
|
|
3520
|
-
voice?: string;
|
|
3521
|
-
/** Output sample rate in Hz (default: 16000) */
|
|
3522
|
-
sampleRate?: number;
|
|
3523
|
-
/** Polly engine type (default: 'neural') */
|
|
3524
|
-
engine?: 'neural' | 'standard' | 'generative' | 'long-form';
|
|
3525
|
-
}
|
|
3526
|
-
declare class PollyTTSBackend implements TTSBackend {
|
|
3527
|
-
private readonly synthesizeFn;
|
|
3528
|
-
private readonly voice;
|
|
3529
|
-
private readonly _sampleRate;
|
|
3530
|
-
private readonly engine;
|
|
3531
|
-
private _isLoaded;
|
|
3532
|
-
constructor(config: PollyConfig);
|
|
3533
|
-
get sampleRate(): number;
|
|
3534
|
-
get isLoaded(): boolean;
|
|
3535
|
-
/**
|
|
3536
|
-
* No-op for cloud TTS (no model to load).
|
|
3537
|
-
* Marks backend as ready.
|
|
3538
|
-
*/
|
|
3539
|
-
load(): Promise<void>;
|
|
3540
|
-
/**
|
|
3541
|
-
* Synthesize audio via consumer's Polly function.
|
|
3542
|
-
*
|
|
3543
|
-
* Polly's SynthesizeSpeech is request/response (not streaming for PCM),
|
|
3544
|
-
* so this yields a single chunk per call. For long text, consider splitting
|
|
3545
|
-
* into sentences on the consumer side.
|
|
3546
|
-
*/
|
|
3547
|
-
stream(text: string, options?: TTSStreamOptions): AsyncGenerator<TTSChunk>;
|
|
3548
|
-
dispose(): Promise<void>;
|
|
3549
|
-
}
|
|
3550
|
-
|
|
3551
|
-
/**
|
|
3552
|
-
* ORT CDN configuration
|
|
3553
|
-
*
|
|
3554
|
-
* Allows consumers to override the CDN base URL used for loading
|
|
3555
|
-
* ONNX Runtime WASM/WebGPU binaries. By default, ORT loads from
|
|
3556
|
-
* its bundled CDN path. Use {@link configureOrtCdn} to point at
|
|
3557
|
-
* a self-hosted or enterprise CDN.
|
|
3558
|
-
*
|
|
3559
|
-
* @category Inference
|
|
3560
|
-
*/
|
|
3561
|
-
/**
|
|
3562
|
-
* Override the CDN base URL for ONNX Runtime WASM/WebGPU binaries.
|
|
3563
|
-
*
|
|
3564
|
-
* Must be an HTTPS URL or a relative path (starts with `/` or `./`).
|
|
3565
|
-
* Call this once at app startup, before loading any models.
|
|
3566
|
-
*
|
|
3567
|
-
* @param cdnPath - HTTPS URL or relative path to ORT binaries directory
|
|
3568
|
-
* @throws If cdnPath is not HTTPS or a relative path
|
|
3569
|
-
*/
|
|
3570
|
-
declare function configureOrtCdn(cdnPath: string): void;
|
|
3571
|
-
/**
|
|
3572
|
-
* Get the current ORT CDN base URL override, or null if using defaults.
|
|
3573
|
-
*/
|
|
3574
|
-
declare function getOrtCdnBase(): string | null;
|
|
3575
|
-
|
|
3576
2720
|
/**
|
|
3577
2721
|
* Emotion - Helper for creating emotion vectors for avatar animation
|
|
3578
2722
|
*
|
|
@@ -4118,7 +3262,7 @@ declare const MetricNames: {
|
|
|
4118
3262
|
readonly CACHE_QUOTA_WARNING: "omote.cache.quota_warning";
|
|
4119
3263
|
/** Counter: Cache eviction (LRU) */
|
|
4120
3264
|
readonly CACHE_EVICTION: "omote.cache.eviction";
|
|
4121
|
-
/** Histogram:
|
|
3265
|
+
/** Histogram: Voice turn latency (speech end → transcript ready, excludes playback) */
|
|
4122
3266
|
readonly VOICE_TURN_LATENCY: "omote.voice.turn.latency";
|
|
4123
3267
|
/** Histogram: ASR transcription latency in ms */
|
|
4124
3268
|
readonly VOICE_TRANSCRIPTION_LATENCY: "omote.voice.transcription.latency";
|
|
@@ -4966,7 +4110,7 @@ declare class ProceduralLifeLayer {
|
|
|
4966
4110
|
*/
|
|
4967
4111
|
update(delta: number, input?: LifeLayerInput): LifeLayerOutput;
|
|
4968
4112
|
/**
|
|
4969
|
-
* Write life layer output directly to a Float32Array[52] in
|
|
4113
|
+
* Write life layer output directly to a Float32Array[52] in ARKIT_BLENDSHAPES order.
|
|
4970
4114
|
*
|
|
4971
4115
|
* Includes micro-jitter (0.4% amplitude simplex noise on all channels) to
|
|
4972
4116
|
* break uncanny stillness on undriven channels.
|
|
@@ -5301,7 +4445,7 @@ declare class FaceCompositor {
|
|
|
5301
4445
|
/**
|
|
5302
4446
|
* Compose a single output frame from the 5-stage signal chain.
|
|
5303
4447
|
*
|
|
5304
|
-
* @param base - A2E raw output (Float32Array[52],
|
|
4448
|
+
* @param base - A2E raw output (Float32Array[52], ARKIT_BLENDSHAPES order)
|
|
5305
4449
|
* @param input - Per-frame input (deltaTime, emotion, life layer params)
|
|
5306
4450
|
* @param target - Optional pre-allocated output buffer (avoids per-frame allocation).
|
|
5307
4451
|
* When omitted, an internal buffer is used (valid until next compose() call).
|
|
@@ -5583,216 +4727,6 @@ declare class MicLipSync extends EventEmitter<MicLipSyncEvents> {
|
|
|
5583
4727
|
private setState;
|
|
5584
4728
|
}
|
|
5585
4729
|
|
|
5586
|
-
/**
|
|
5587
|
-
* VoicePipeline - Full conversational agent loop
|
|
5588
|
-
*
|
|
5589
|
-
* Composes: MicrophoneCapture → SileroVAD → SenseVoice ASR → PlaybackPipeline (A2E)
|
|
5590
|
-
*
|
|
5591
|
-
* State machine: idle → loading → ready → listening → thinking → speaking → listening → ...
|
|
5592
|
-
*
|
|
5593
|
-
* The consumer provides an `onResponse` callback that receives transcribed text
|
|
5594
|
-
* and streams audio back for playback + lip sync. VoicePipeline is backend-agnostic.
|
|
5595
|
-
*
|
|
5596
|
-
* @category Orchestration
|
|
5597
|
-
*/
|
|
5598
|
-
|
|
5599
|
-
/** Shared config options for all VoicePipeline modes */
|
|
5600
|
-
interface VoicePipelineBaseConfig {
|
|
5601
|
-
/** Pre-built backends — skip internal factory creation. Takes precedence over `models`. */
|
|
5602
|
-
backends?: {
|
|
5603
|
-
asr: SenseVoiceBackend;
|
|
5604
|
-
lam: A2EBackend;
|
|
5605
|
-
vad: SileroVADBackend;
|
|
5606
|
-
tts?: TTSBackend;
|
|
5607
|
-
};
|
|
5608
|
-
/** External unified worker (reuse across pipelines). Takes precedence over internal creation. */
|
|
5609
|
-
unifiedWorker?: UnifiedInferenceWorker;
|
|
5610
|
-
/** URLs and options for model loading. Required if `backends` not provided. */
|
|
5611
|
-
models?: {
|
|
5612
|
-
senseVoice: {
|
|
5613
|
-
modelUrl: string;
|
|
5614
|
-
tokensUrl?: string;
|
|
5615
|
-
language?: string;
|
|
5616
|
-
};
|
|
5617
|
-
lam: {
|
|
5618
|
-
modelUrl: string;
|
|
5619
|
-
externalDataUrl?: string | false;
|
|
5620
|
-
backend?: 'auto' | 'webgpu' | 'wasm';
|
|
5621
|
-
};
|
|
5622
|
-
vad: {
|
|
5623
|
-
modelUrl: string;
|
|
5624
|
-
threshold?: number;
|
|
5625
|
-
preSpeechBufferChunks?: number;
|
|
5626
|
-
};
|
|
5627
|
-
};
|
|
5628
|
-
/** Per-character expression weight scaling */
|
|
5629
|
-
profile?: ExpressionProfile;
|
|
5630
|
-
/** Identity/style index for A2E model (default: 0) */
|
|
5631
|
-
identityIndex?: number;
|
|
5632
|
-
/** Base silence timeout in ms (default: 500) */
|
|
5633
|
-
silenceTimeoutMs?: number;
|
|
5634
|
-
/** Extended silence timeout for long utterances (default: 700) */
|
|
5635
|
-
silenceTimeoutExtendedMs?: number;
|
|
5636
|
-
/** Enable adaptive timeout based on speech duration (default: true) */
|
|
5637
|
-
adaptiveTimeout?: boolean;
|
|
5638
|
-
/** Minimum audio duration in seconds (default: 0.3) */
|
|
5639
|
-
minAudioDurationSec?: number;
|
|
5640
|
-
/** Minimum audio energy (default: 0.02) */
|
|
5641
|
-
minAudioEnergy?: number;
|
|
5642
|
-
/** Enable audio normalization for quiet audio (default: true) */
|
|
5643
|
-
normalizeAudio?: boolean;
|
|
5644
|
-
/** Progressive transcription interval — desktop (default: 500ms) */
|
|
5645
|
-
progressiveIntervalMs?: number;
|
|
5646
|
-
/** Progressive transcription interval — iOS (default: 800ms) */
|
|
5647
|
-
progressiveIntervalIosMs?: number;
|
|
5648
|
-
/** Coverage threshold to use progressive result (default: 0.8) */
|
|
5649
|
-
progressiveCoverageThreshold?: number;
|
|
5650
|
-
/** Minimum samples before progressive transcription starts (default: 8000) */
|
|
5651
|
-
progressiveMinSamples?: number;
|
|
5652
|
-
/** Timeout for individual SenseVoice.transcribe() calls (default: 10000ms) */
|
|
5653
|
-
transcriptionTimeoutMs?: number;
|
|
5654
|
-
/** Enable barge-in detection (default: true) */
|
|
5655
|
-
interruptionEnabled?: boolean;
|
|
5656
|
-
/** Minimum speech duration for interruption (default: 200ms) */
|
|
5657
|
-
interruptionMinSpeechMs?: number;
|
|
5658
|
-
/** Audio playback delay (default: auto-detected) */
|
|
5659
|
-
audioDelayMs?: number;
|
|
5660
|
-
/** Coalescer target duration (default: 200ms) */
|
|
5661
|
-
chunkTargetMs?: number;
|
|
5662
|
-
/** Enable neutral transition on playback complete (default: true) */
|
|
5663
|
-
neutralTransitionEnabled?: boolean;
|
|
5664
|
-
/** Duration of neutral fade-out (default: 250ms) */
|
|
5665
|
-
neutralTransitionMs?: number;
|
|
5666
|
-
}
|
|
5667
|
-
/** Cloud TTS mode: consumer handles response + audio streaming */
|
|
5668
|
-
interface VoicePipelineCloudConfig extends VoicePipelineBaseConfig {
|
|
5669
|
-
mode: 'cloud';
|
|
5670
|
-
/** Consumer's response handler (streams audio back) */
|
|
5671
|
-
onResponse: ResponseHandler;
|
|
5672
|
-
}
|
|
5673
|
-
/** Local TTS mode: SDK handles synthesis internally via TTSBackend */
|
|
5674
|
-
interface VoicePipelineLocalConfig extends VoicePipelineBaseConfig {
|
|
5675
|
-
mode: 'local';
|
|
5676
|
-
/**
|
|
5677
|
-
* TTS backend (e.g., KokoroTTSInference). Provide either `tts` or `ttsConfig`.
|
|
5678
|
-
*
|
|
5679
|
-
* When `tts` is provided, VoicePipeline uses it as-is. On iOS, this means
|
|
5680
|
-
* inference runs on the main thread (may cause UI freezes).
|
|
5681
|
-
*
|
|
5682
|
-
* Prefer `ttsConfig` for automatic unified worker integration on iOS.
|
|
5683
|
-
*/
|
|
5684
|
-
tts?: TTSBackend;
|
|
5685
|
-
/**
|
|
5686
|
-
* Kokoro TTS configuration. When provided, VoicePipeline creates the TTS
|
|
5687
|
-
* internally and passes the unified worker on iOS for off-main-thread inference.
|
|
5688
|
-
*
|
|
5689
|
-
* Takes precedence over `tts` if both are provided.
|
|
5690
|
-
*/
|
|
5691
|
-
ttsConfig?: {
|
|
5692
|
-
defaultVoice?: string;
|
|
5693
|
-
speed?: number;
|
|
5694
|
-
modelUrl?: string;
|
|
5695
|
-
voiceBaseUrl?: string;
|
|
5696
|
-
};
|
|
5697
|
-
/** Optional text transform (e.g., LLM call). Receives transcript, returns response text. */
|
|
5698
|
-
onTranscript?: (text: string) => string | Promise<string>;
|
|
5699
|
-
}
|
|
5700
|
-
type VoicePipelineConfig = VoicePipelineCloudConfig | VoicePipelineLocalConfig;
|
|
5701
|
-
interface VoicePipelineEvents {
|
|
5702
|
-
'state': VoicePipelineState;
|
|
5703
|
-
'loading:progress': LoadingProgress;
|
|
5704
|
-
'transcript': TranscriptResult;
|
|
5705
|
-
'frame': FullFaceFrame;
|
|
5706
|
-
'frame:raw': Float32Array;
|
|
5707
|
-
'speech:start': void;
|
|
5708
|
-
'speech:end': {
|
|
5709
|
-
durationMs: number;
|
|
5710
|
-
};
|
|
5711
|
-
'playback:start': {
|
|
5712
|
-
time: number;
|
|
5713
|
-
};
|
|
5714
|
-
'playback:complete': void;
|
|
5715
|
-
'interruption': void;
|
|
5716
|
-
'audio:level': {
|
|
5717
|
-
rms: number;
|
|
5718
|
-
peak: number;
|
|
5719
|
-
};
|
|
5720
|
-
'error': Error;
|
|
5721
|
-
[key: string]: unknown;
|
|
5722
|
-
}
|
|
5723
|
-
declare class VoicePipeline extends EventEmitter<VoicePipelineEvents> {
|
|
5724
|
-
private readonly config;
|
|
5725
|
-
private readonly isLocalMode;
|
|
5726
|
-
private _state;
|
|
5727
|
-
private stopped;
|
|
5728
|
-
private epoch;
|
|
5729
|
-
private _sessionId;
|
|
5730
|
-
private asr;
|
|
5731
|
-
private lam;
|
|
5732
|
-
private vad;
|
|
5733
|
-
private unifiedWorker;
|
|
5734
|
-
private playback;
|
|
5735
|
-
private interruption;
|
|
5736
|
-
private omoteEvents;
|
|
5737
|
-
private mic;
|
|
5738
|
-
private static readonly MAX_AUDIO_BUFFER_SAMPLES;
|
|
5739
|
-
private audioBuffer;
|
|
5740
|
-
private audioBufferSamples;
|
|
5741
|
-
private speechStartTime;
|
|
5742
|
-
private silenceTimer;
|
|
5743
|
-
private isSpeaking;
|
|
5744
|
-
private progressiveTimer;
|
|
5745
|
-
private progressivePromise;
|
|
5746
|
-
private lastProgressiveResult;
|
|
5747
|
-
private lastProgressiveSamples;
|
|
5748
|
-
private asrErrorCount;
|
|
5749
|
-
private progressiveErrorCount;
|
|
5750
|
-
private responseAbortController;
|
|
5751
|
-
private _unsubChunk;
|
|
5752
|
-
private _unsubLevel;
|
|
5753
|
-
private _currentFrame;
|
|
5754
|
-
/** Current pipeline state */
|
|
5755
|
-
get state(): VoicePipelineState;
|
|
5756
|
-
/** Latest blendshape frame */
|
|
5757
|
-
get currentFrame(): Float32Array | null;
|
|
5758
|
-
/** Whether user is currently speaking */
|
|
5759
|
-
get isSpeechActive(): boolean;
|
|
5760
|
-
/** Session ID (generated on start(), null before) */
|
|
5761
|
-
get sessionId(): string | null;
|
|
5762
|
-
constructor(config: VoicePipelineConfig);
|
|
5763
|
-
loadModels(): Promise<void>;
|
|
5764
|
-
/**
|
|
5765
|
-
* Load from pre-built backends (dependency injection path).
|
|
5766
|
-
* Loads any backends that aren't loaded yet.
|
|
5767
|
-
*/
|
|
5768
|
-
private loadFromBackends;
|
|
5769
|
-
/**
|
|
5770
|
-
* Load from factories (original path). Loads SenseVoice, LAM, and VAD in parallel.
|
|
5771
|
-
*/
|
|
5772
|
-
private loadFromFactories;
|
|
5773
|
-
start(): Promise<void>;
|
|
5774
|
-
stop(): void;
|
|
5775
|
-
setProfile(profile: ExpressionProfile): void;
|
|
5776
|
-
dispose(): Promise<void>;
|
|
5777
|
-
private processAudioChunk;
|
|
5778
|
-
private getSilenceTimeout;
|
|
5779
|
-
private onSilenceDetected;
|
|
5780
|
-
private processEndOfSpeech;
|
|
5781
|
-
private callResponseHandler;
|
|
5782
|
-
/** Cloud mode: delegate to consumer's onResponse handler */
|
|
5783
|
-
private handleCloudResponse;
|
|
5784
|
-
/** Local mode: synthesize text with TTSBackend, stream to PlaybackPipeline */
|
|
5785
|
-
private handleLocalResponse;
|
|
5786
|
-
private handleInterruption;
|
|
5787
|
-
private startProgressiveTranscription;
|
|
5788
|
-
private stopProgressiveTranscription;
|
|
5789
|
-
private transcribeWithTimeout;
|
|
5790
|
-
private normalizeAudio;
|
|
5791
|
-
private setState;
|
|
5792
|
-
private emitProgress;
|
|
5793
|
-
private clearSilenceTimer;
|
|
5794
|
-
}
|
|
5795
|
-
|
|
5796
4730
|
/**
|
|
5797
4731
|
* VoiceOrchestrator — Shared voice wiring for OmoteAvatar adapters.
|
|
5798
4732
|
*
|
|
@@ -5810,6 +4744,11 @@ interface VoiceOrchestratorBaseConfig {
|
|
|
5810
4744
|
listener?: SpeechListenerConfig;
|
|
5811
4745
|
interruptionEnabled?: boolean;
|
|
5812
4746
|
profile?: ExpressionProfile;
|
|
4747
|
+
onStateChange?: (state: ConversationalState) => void;
|
|
4748
|
+
onLoadingProgress?: (progress: LoadingProgress) => void;
|
|
4749
|
+
onError?: (error: Error) => void;
|
|
4750
|
+
onTranscriptEvent?: (result: TranscriptResult) => void;
|
|
4751
|
+
onInterruption?: () => void;
|
|
5813
4752
|
}
|
|
5814
4753
|
interface VoiceOrchestratorLocalConfig extends VoiceOrchestratorBaseConfig {
|
|
5815
4754
|
mode?: 'local';
|
|
@@ -5823,12 +4762,23 @@ interface VoiceOrchestratorCloudConfig extends VoiceOrchestratorBaseConfig {
|
|
|
5823
4762
|
lam?: {
|
|
5824
4763
|
modelUrl?: string;
|
|
5825
4764
|
externalDataUrl?: string | false;
|
|
4765
|
+
unifiedWorker?: UnifiedInferenceWorker;
|
|
5826
4766
|
};
|
|
4767
|
+
identityIndex?: number;
|
|
4768
|
+
neutralTransitionEnabled?: boolean;
|
|
5827
4769
|
}
|
|
5828
4770
|
type VoiceOrchestratorConfig = VoiceOrchestratorLocalConfig | VoiceOrchestratorCloudConfig;
|
|
5829
4771
|
interface VoiceOrchestratorEvents {
|
|
5830
4772
|
'state': ConversationalState;
|
|
5831
4773
|
'transcript': TranscriptResult;
|
|
4774
|
+
'interruption': void;
|
|
4775
|
+
'loading:progress': LoadingProgress;
|
|
4776
|
+
'error': Error;
|
|
4777
|
+
'audio:level': {
|
|
4778
|
+
rms: number;
|
|
4779
|
+
peak: number;
|
|
4780
|
+
};
|
|
4781
|
+
'playback:complete': void;
|
|
5832
4782
|
[key: string]: unknown;
|
|
5833
4783
|
}
|
|
5834
4784
|
declare class VoiceOrchestrator extends EventEmitter<VoiceOrchestratorEvents> {
|
|
@@ -5837,6 +4787,8 @@ declare class VoiceOrchestrator extends EventEmitter<VoiceOrchestratorEvents> {
|
|
|
5837
4787
|
private ttsSpeaker;
|
|
5838
4788
|
private playbackPipeline;
|
|
5839
4789
|
private ownedLam;
|
|
4790
|
+
private ownedWorker;
|
|
4791
|
+
private usesSharedWorker;
|
|
5840
4792
|
private transcriptUnsub;
|
|
5841
4793
|
private audioChunkUnsub;
|
|
5842
4794
|
private connectEpoch;
|
|
@@ -5860,10 +4812,14 @@ declare class VoiceOrchestrator extends EventEmitter<VoiceOrchestratorEvents> {
|
|
|
5860
4812
|
speak(text: string, options?: {
|
|
5861
4813
|
signal?: AbortSignal;
|
|
5862
4814
|
voice?: string;
|
|
4815
|
+
speed?: number;
|
|
4816
|
+
language?: string;
|
|
5863
4817
|
}): Promise<void>;
|
|
5864
4818
|
streamText(options?: {
|
|
5865
4819
|
signal?: AbortSignal;
|
|
5866
4820
|
voice?: string;
|
|
4821
|
+
speed?: number;
|
|
4822
|
+
language?: string;
|
|
5867
4823
|
}): Promise<{
|
|
5868
4824
|
push: (token: string) => void;
|
|
5869
4825
|
end: () => Promise<void>;
|
|
@@ -5875,4 +4831,4 @@ declare class VoiceOrchestrator extends EventEmitter<VoiceOrchestratorEvents> {
|
|
|
5875
4831
|
private setState;
|
|
5876
4832
|
}
|
|
5877
4833
|
|
|
5878
|
-
export { type A2EBackend,
|
|
4834
|
+
export { type A2EBackend, type A2EModelInfo, A2EProcessor, type A2EProcessorConfig, type A2EResult, A2EUnifiedAdapter, ALL_AUS, ARKIT_BLENDSHAPES, type AUActivation, AU_TO_ARKIT, type ActiveSpan, type AnimationClip, type AnimationController, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationSource, type AnimationSourceOptions, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, BLENDSHAPE_TO_GROUP, type BackendPreference, type BlendWeight, type BlendshapeGroup, BlendshapeSmoother, type BlendshapeSmootherConfig, type BoneFilterConfig, type CacheConfig, type CacheSpanAttributes, CharacterController, type CharacterControllerConfig, type CharacterProfile, type CharacterUpdateInput, type CharacterUpdateOutput, ConsoleExporter, type ConversationalState, type CreateA2EConfig, type CreateKokoroTTSConfig, type CreateSenseVoiceConfig, type CreateTTSPlayerConfig, DEFAULT_ANIMATION_CONFIG, DEFAULT_BONE_FILTER, DEFAULT_MODEL_URLS, EMOTION_NAMES, EMOTION_TO_AU, EMOTION_VECTOR_SIZE, EXPLICIT_EMOTION_COUNT, type ElevenLabsConfig, ElevenLabsTTSBackend, type EmotionAnimationMap, EmotionController, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, EmotionResolver, type EmotionWeights, EmphasisDetector, type ErrorType, ErrorTypes, EventEmitter, type ExpressionProfile, FaceCompositor, type FaceCompositorConfig, type FaceCompositorInput, type FaceCompositorOutput, type FetchWithCacheOptions, type FrameSource, type FullFaceFrame, HF_CDN_URLS, INFERENCE_LATENCY_BUCKETS, type InferenceFactoryConfig, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, KOKORO_VOICES, type KokoroStreamChunk, type KokoroTTSConfig, type KokoroTTSModelInfo, type KokoroTTSResult, KokoroTTSUnifiedAdapter, type KokoroVoiceName, LAM_BLENDSHAPES, type LifeLayerConfig, type LifeLayerInput, type LifeLayerOutput, type LoadingProgress, MIXAMO_PREFIX, MODEL_LOAD_TIME_BUCKETS, type MetricData, MetricNames, MicLipSync, type MicLipSyncConfig, type MicLipSyncEvents, type MicLipSyncFrame, type MicLipSyncState, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, type ModelUrlKey, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, PRESERVE_POSITION_BONES, PlaybackPipeline, type PlaybackPipelineConfig, type PlaybackPipelineEvents, type PlaybackState, ProceduralLifeLayer, type Quat, type QuotaInfo, type ResolvedEmotion, type ResponseHandler, RingBuffer, type RuntimeBackend, type SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SenseVoiceBackend, type SenseVoiceConfig, type SenseVoiceLanguage, type SenseVoiceModelInfo, type SenseVoiceResult, SenseVoiceUnifiedAdapter, type SenseVoiceWorkerConfig, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADUnifiedAdapter, type SpanAttributes, type SpanData, type SpeechErrorCallback, SpeechListener, type SpeechListenerConfig, type SpeechListenerEvents, type SpeechListenerState, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, type SynthesizeOptions, type TTSBackend, type TTSChunk, TTSPlayback, type TTSPlaybackConfig, type TTSPlaybackEvents, TTSPlayer, TTSSpeaker, type TTSSpeakerConfig, type TTSStreamOptions, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TrackDescriptor, type TranscriptResult, type Transition, UnifiedInferenceWorker, type VADBackend, type VADModelInfo, type VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, type Vec3, VoiceOrchestrator, type VoiceOrchestratorCloudConfig, type VoiceOrchestratorConfig, type VoiceOrchestratorEvents, type VoiceOrchestratorLocalConfig, type VoicePipelineState, type WorkerHealthState, analyzeTextEmotion, applyProfile, blendEmotions, calculatePeak, calculateRMS, configureCacheLimit, configureModelUrls, configureTelemetry, createA2E, createEmotionVector, createKokoroTTS, createSenseVoice, createSileroVAD, createTTSPlayer, fetchWithCache, float32ToPcm16, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getModelCache, getOptimalWasmThreads, getRecommendedBackend, getTelemetry, hasWebGPUApi, int16ToFloat32, isAndroid, isIOS, isIOSSafari, isMobile, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpBlendshapes, lerpEmotion, listVoices as listKokoroVoices, parseEmotionTags, pcm16ToFloat32, preloadModels, resampleLinear, resetModelUrls, resolveBackend, resolveEmotion, shouldEnableWasmProxy, shouldKeepTrack, shouldUseNativeASR, shouldUseServerA2E, stripMixamoPrefix, ttsToPlaybackFormat, validateTTSInput };
|