@omote/core 0.9.6 → 0.10.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/{chunk-Y3DTP5P3.mjs → chunk-VSYYT4HO.mjs} +1 -1
- package/dist/{chunk-X5OTUOE6.mjs.map → chunk-VSYYT4HO.mjs.map} +1 -1
- package/dist/index.d.mts +268 -1305
- package/dist/index.d.ts +268 -1305
- package/dist/index.js +6417 -11038
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +6416 -11037
- package/dist/index.mjs.map +1 -1
- package/dist/logging/index.js.map +1 -1
- package/dist/logging/index.mjs +1 -1
- package/package.json +1 -2
- package/dist/Logger-BeUI6jG7.d.mts +0 -145
- package/dist/Logger-BeUI6jG7.d.ts +0 -145
- package/dist/Logger-DSoGAYJu.d.mts +0 -141
- package/dist/Logger-DSoGAYJu.d.ts +0 -141
- package/dist/chunk-3NDJA3I4.mjs +0 -853
- package/dist/chunk-3NDJA3I4.mjs.map +0 -1
- package/dist/chunk-CYBTTLG7.mjs +0 -927
- package/dist/chunk-CYBTTLG7.mjs.map +0 -1
- package/dist/chunk-ESU52TDS.mjs +0 -287
- package/dist/chunk-ESU52TDS.mjs.map +0 -1
- package/dist/chunk-MXKJOF4I.mjs +0 -38
- package/dist/chunk-MXKJOF4I.mjs.map +0 -1
- package/dist/chunk-X5OTUOE6.mjs +0 -927
- package/dist/chunk-XK22BRG4.mjs +0 -38
- package/dist/chunk-XK22BRG4.mjs.map +0 -1
- package/dist/chunk-Y3DTP5P3.mjs.map +0 -1
package/dist/index.d.mts
CHANGED
|
@@ -470,7 +470,7 @@ declare function shouldUseServerA2E(): boolean;
|
|
|
470
470
|
/**
|
|
471
471
|
* Common interface for audio-to-expression (A2E) inference backends
|
|
472
472
|
*
|
|
473
|
-
* Implemented by
|
|
473
|
+
* Implemented by A2EUnifiedAdapter, allowing PlaybackPipeline
|
|
474
474
|
* and A2EProcessor to work with either implementation transparently.
|
|
475
475
|
*
|
|
476
476
|
* @category Inference
|
|
@@ -488,11 +488,11 @@ interface A2EModelInfo {
|
|
|
488
488
|
/**
|
|
489
489
|
* Result from A2E inference
|
|
490
490
|
*
|
|
491
|
-
* All implementations must return blendshapes in
|
|
491
|
+
* All implementations must return blendshapes in ARKIT_BLENDSHAPES order (alphabetical).
|
|
492
492
|
* Models with different native orderings must remap internally before returning.
|
|
493
493
|
*/
|
|
494
494
|
interface A2EResult {
|
|
495
|
-
/** Blendshape weights [frames, 52] in
|
|
495
|
+
/** Blendshape weights [frames, 52] in ARKIT_BLENDSHAPES order - 30fps */
|
|
496
496
|
blendshapes: Float32Array[];
|
|
497
497
|
/** Number of blendshape frames */
|
|
498
498
|
numFrames: number;
|
|
@@ -507,10 +507,8 @@ interface A2EResult {
|
|
|
507
507
|
* pipeline — A2E is the interface abstraction, LAM is the model.
|
|
508
508
|
*
|
|
509
509
|
* Implemented by:
|
|
510
|
-
* - {@link
|
|
511
|
-
* - A2EUnifiedAdapter (shared unified worker)
|
|
510
|
+
* - {@link A2EUnifiedAdapter} (shared unified worker)
|
|
512
511
|
*
|
|
513
|
-
* @see {@link A2EInference} for direct usage
|
|
514
512
|
* @see {@link createA2E} for the recommended factory API
|
|
515
513
|
*/
|
|
516
514
|
interface A2EBackend {
|
|
@@ -531,7 +529,7 @@ interface A2EBackend {
|
|
|
531
529
|
* Run inference on raw audio
|
|
532
530
|
* @param audioSamples - Float32Array of raw audio at 16kHz
|
|
533
531
|
* @param identityIndex - Optional identity index (ignored by CPU model)
|
|
534
|
-
* @returns A2E result with blendshapes in
|
|
532
|
+
* @returns A2E result with blendshapes in ARKIT_BLENDSHAPES order
|
|
535
533
|
*/
|
|
536
534
|
infer(audioSamples: Float32Array, identityIndex?: number): Promise<A2EResult>;
|
|
537
535
|
/**
|
|
@@ -544,7 +542,7 @@ interface A2EBackend {
|
|
|
544
542
|
* ExpressionProfile - Per-character weight scaling for A2E blendshape output
|
|
545
543
|
*
|
|
546
544
|
* Maps blendshape groups (eyes, brows, jaw, mouth, cheeks, nose, tongue)
|
|
547
|
-
* to weight scalers. Used by PlaybackPipeline, MicLipSync, and
|
|
545
|
+
* to weight scalers. Used by PlaybackPipeline, MicLipSync, and VoiceOrchestrator.
|
|
548
546
|
*
|
|
549
547
|
* @category Audio
|
|
550
548
|
*/
|
|
@@ -575,7 +573,7 @@ interface ExpressionProfile {
|
|
|
575
573
|
overrides?: Partial<Record<string, number>>;
|
|
576
574
|
}
|
|
577
575
|
/**
|
|
578
|
-
* Map each
|
|
576
|
+
* Map each ARKIT_BLENDSHAPES entry to its BlendshapeGroup.
|
|
579
577
|
* Built once at module load from prefix matching.
|
|
580
578
|
*/
|
|
581
579
|
declare const BLENDSHAPE_TO_GROUP: Map<string, BlendshapeGroup>;
|
|
@@ -678,6 +676,13 @@ declare class PlaybackPipeline extends EventEmitter<PlaybackPipelineEvents> {
|
|
|
678
676
|
private neutralTransitionFrame;
|
|
679
677
|
private neutralTransitionStart;
|
|
680
678
|
private neutralAnimationId;
|
|
679
|
+
private static readonly RAMP_IN_HALFLIFE;
|
|
680
|
+
private static readonly RAMP_IN_DURATION_MS;
|
|
681
|
+
private rampInSmoother;
|
|
682
|
+
private rampInActive;
|
|
683
|
+
private rampInLastTime;
|
|
684
|
+
private rampInStartTime;
|
|
685
|
+
private readonly _rampInBuffer;
|
|
681
686
|
private _currentFrame;
|
|
682
687
|
private _currentRawFrame;
|
|
683
688
|
private _emotion;
|
|
@@ -691,6 +696,8 @@ declare class PlaybackPipeline extends EventEmitter<PlaybackPipelineEvents> {
|
|
|
691
696
|
constructor(config: PlaybackPipelineConfig);
|
|
692
697
|
/** Initialize AudioContext (lazy, call after user gesture) */
|
|
693
698
|
initialize(): Promise<void>;
|
|
699
|
+
/** Eagerly create AudioContext. Call from user gesture for iOS. */
|
|
700
|
+
warmup(): Promise<void>;
|
|
694
701
|
/** Update ExpressionProfile at runtime */
|
|
695
702
|
setProfile(profile: ExpressionProfile): void;
|
|
696
703
|
/** Set the emotion label to include in emitted frames */
|
|
@@ -737,7 +744,7 @@ declare class PlaybackPipeline extends EventEmitter<PlaybackPipelineEvents> {
|
|
|
737
744
|
* TTSBackend — Streaming text-to-speech backend interface.
|
|
738
745
|
*
|
|
739
746
|
* Any TTS engine (Kokoro, ElevenLabs, etc.) can implement this contract
|
|
740
|
-
* to integrate with TTSPlayback and
|
|
747
|
+
* to integrate with TTSPlayback and VoiceOrchestrator.
|
|
741
748
|
*
|
|
742
749
|
* @category Inference
|
|
743
750
|
*/
|
|
@@ -781,6 +788,10 @@ interface TTSStreamOptions {
|
|
|
781
788
|
voice?: string;
|
|
782
789
|
/** Speed multiplier override per-call */
|
|
783
790
|
speed?: number;
|
|
791
|
+
/** Language override per-call (e.g. 'en-us', 'ja'). Default: derived from voice name. */
|
|
792
|
+
language?: string;
|
|
793
|
+
/** When true, emit the entire text as a single chunk (no sentence splitting). */
|
|
794
|
+
singleShot?: boolean;
|
|
784
795
|
}
|
|
785
796
|
/**
|
|
786
797
|
* A single chunk of TTS audio output
|
|
@@ -856,7 +867,11 @@ declare class TTSPlayback extends EventEmitter<TTSPlaybackEvents> {
|
|
|
856
867
|
speak(text: string, options?: {
|
|
857
868
|
signal?: AbortSignal;
|
|
858
869
|
voice?: string;
|
|
870
|
+
speed?: number;
|
|
871
|
+
language?: string;
|
|
859
872
|
}): Promise<void>;
|
|
873
|
+
/** Eagerly create AudioContext. Call from user gesture for iOS. */
|
|
874
|
+
warmup(): Promise<void>;
|
|
860
875
|
/** Dispose of all resources. */
|
|
861
876
|
dispose(): Promise<void>;
|
|
862
877
|
private speakWithPrefetch;
|
|
@@ -893,34 +908,9 @@ declare class TTSPlayback extends EventEmitter<TTSPlaybackEvents> {
|
|
|
893
908
|
declare function isWebGPUAvailable(): Promise<boolean>;
|
|
894
909
|
|
|
895
910
|
/**
|
|
896
|
-
* SenseVoice
|
|
897
|
-
*
|
|
898
|
-
* Non-autoregressive CTC-based ASR that is 5x faster than Whisper-Small.
|
|
899
|
-
* Runs entirely in browser via WebGPU or WASM. No transformers.js dependency.
|
|
900
|
-
*
|
|
901
|
-
* Uses the sherpa-onnx SenseVoice export (model.int8.onnx, 239MB int8 quantized).
|
|
902
|
-
* Also provides emotion detection, language identification, and audio event detection
|
|
903
|
-
* from the same forward pass.
|
|
911
|
+
* SenseVoice type definitions
|
|
904
912
|
*
|
|
905
913
|
* @category Inference
|
|
906
|
-
*
|
|
907
|
-
* @example Basic usage
|
|
908
|
-
* ```typescript
|
|
909
|
-
* import { SenseVoiceInference } from '@omote/core';
|
|
910
|
-
*
|
|
911
|
-
* const asr = new SenseVoiceInference({
|
|
912
|
-
* modelUrl: '/models/sensevoice/model.int8.onnx',
|
|
913
|
-
* tokensUrl: '/models/sensevoice/tokens.txt',
|
|
914
|
-
* });
|
|
915
|
-
* await asr.load();
|
|
916
|
-
*
|
|
917
|
-
* const { text, emotion, language } = await asr.transcribe(audioSamples);
|
|
918
|
-
* console.log(text); // "Hello world"
|
|
919
|
-
* console.log(emotion); // "NEUTRAL"
|
|
920
|
-
* console.log(language); // "en"
|
|
921
|
-
* ```
|
|
922
|
-
*
|
|
923
|
-
* @module inference/SenseVoiceInference
|
|
924
914
|
*/
|
|
925
915
|
|
|
926
916
|
type SenseVoiceLanguage = 'auto' | 'zh' | 'en' | 'ja' | 'ko' | 'yue';
|
|
@@ -957,76 +947,49 @@ interface SenseVoiceModelInfo {
|
|
|
957
947
|
outputNames: string[];
|
|
958
948
|
vocabSize: number;
|
|
959
949
|
}
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
|
|
963
|
-
|
|
964
|
-
|
|
965
|
-
|
|
966
|
-
|
|
967
|
-
|
|
968
|
-
|
|
969
|
-
|
|
970
|
-
|
|
971
|
-
|
|
972
|
-
|
|
973
|
-
|
|
974
|
-
|
|
975
|
-
|
|
976
|
-
|
|
977
|
-
|
|
978
|
-
|
|
950
|
+
/**
|
|
951
|
+
* Configuration for SenseVoice Worker (used by SenseVoiceUnifiedAdapter)
|
|
952
|
+
*/
|
|
953
|
+
interface SenseVoiceWorkerConfig {
|
|
954
|
+
/** Path or URL to model.int8.onnx (239MB) */
|
|
955
|
+
modelUrl: string;
|
|
956
|
+
/** Path or URL to tokens.txt vocabulary file (default: sibling of modelUrl) */
|
|
957
|
+
tokensUrl?: string;
|
|
958
|
+
/** Language hint (default: 'auto' for auto-detection) */
|
|
959
|
+
language?: SenseVoiceLanguage;
|
|
960
|
+
/** Text normalization: 'with_itn' applies inverse text normalization (default: 'with_itn') */
|
|
961
|
+
textNorm?: 'with_itn' | 'without_itn';
|
|
962
|
+
}
|
|
963
|
+
/**
|
|
964
|
+
* Common interface for SenseVoice implementations
|
|
965
|
+
*/
|
|
966
|
+
interface SenseVoiceBackend {
|
|
967
|
+
/** Whether the model is loaded and ready for inference */
|
|
968
|
+
readonly isLoaded: boolean;
|
|
969
|
+
/** Current backend type ('wasm' for worker, full backend for main thread, null if not loaded) */
|
|
970
|
+
readonly backend: 'wasm' | 'webgpu' | null;
|
|
971
|
+
/**
|
|
972
|
+
* Load the ONNX model
|
|
973
|
+
* @param onProgress - Optional progress callback (fires once at 100% for worker)
|
|
974
|
+
* @returns Model loading information
|
|
975
|
+
*/
|
|
979
976
|
load(onProgress?: (loaded: number, total: number) => void): Promise<SenseVoiceModelInfo>;
|
|
980
977
|
/**
|
|
981
978
|
* Transcribe audio samples to text
|
|
982
|
-
*
|
|
983
|
-
* @
|
|
984
|
-
* @returns Transcription result with text, emotion, language, and event
|
|
979
|
+
* @param audioSamples - Float32Array of audio samples at 16kHz
|
|
980
|
+
* @returns Transcription result
|
|
985
981
|
*/
|
|
986
982
|
transcribe(audioSamples: Float32Array): Promise<SenseVoiceResult>;
|
|
987
|
-
|
|
983
|
+
/**
|
|
984
|
+
* Dispose of the model and free resources
|
|
985
|
+
*/
|
|
988
986
|
dispose(): Promise<void>;
|
|
989
987
|
}
|
|
990
988
|
|
|
991
989
|
/**
|
|
992
|
-
* Silero VAD
|
|
993
|
-
*
|
|
994
|
-
* Neural network-based VAD running in browser via ONNX Runtime Web.
|
|
995
|
-
* Much more accurate than RMS-based energy detection.
|
|
996
|
-
*
|
|
997
|
-
* Uses lazy loading to conditionally load WebGPU or WASM-only bundle:
|
|
998
|
-
* - iOS: Loads WASM-only bundle (WebGPU crashes due to Safari bugs)
|
|
999
|
-
* - Android/Desktop: Loads WebGPU bundle (with WASM fallback)
|
|
990
|
+
* Silero VAD type definitions
|
|
1000
991
|
*
|
|
1001
992
|
* @category Inference
|
|
1002
|
-
*
|
|
1003
|
-
* @example Basic usage
|
|
1004
|
-
* ```typescript
|
|
1005
|
-
* import { SileroVADInference } from '@omote/core';
|
|
1006
|
-
*
|
|
1007
|
-
* const vad = new SileroVADInference({
|
|
1008
|
-
* modelUrl: '/models/silero-vad.onnx'
|
|
1009
|
-
* });
|
|
1010
|
-
* await vad.load();
|
|
1011
|
-
*
|
|
1012
|
-
* // Process 32ms chunks (512 samples at 16kHz)
|
|
1013
|
-
* const probability = await vad.process(audioChunk);
|
|
1014
|
-
* if (probability > 0.5) {
|
|
1015
|
-
* console.log('Speech detected!');
|
|
1016
|
-
* }
|
|
1017
|
-
* ```
|
|
1018
|
-
*
|
|
1019
|
-
* @example Streaming with state management
|
|
1020
|
-
* ```typescript
|
|
1021
|
-
* // State is automatically maintained between process() calls
|
|
1022
|
-
* // Call reset() when starting a new audio stream
|
|
1023
|
-
* vad.reset();
|
|
1024
|
-
*
|
|
1025
|
-
* for (const chunk of audioChunks) {
|
|
1026
|
-
* const prob = await vad.process(chunk);
|
|
1027
|
-
* // prob is speech probability [0, 1]
|
|
1028
|
-
* }
|
|
1029
|
-
* ```
|
|
1030
993
|
*/
|
|
1031
994
|
|
|
1032
995
|
type VADBackend = BackendPreference;
|
|
@@ -1096,117 +1059,6 @@ interface SpeechSegment {
|
|
|
1096
1059
|
/** Average probability during segment */
|
|
1097
1060
|
avgProbability: number;
|
|
1098
1061
|
}
|
|
1099
|
-
/**
|
|
1100
|
-
* Silero VAD - Neural network voice activity detection
|
|
1101
|
-
*
|
|
1102
|
-
* Based on snakers4/silero-vad ONNX model.
|
|
1103
|
-
* Processes 32ms chunks (512 samples at 16kHz) with LSTM state.
|
|
1104
|
-
*
|
|
1105
|
-
* @see https://github.com/snakers4/silero-vad
|
|
1106
|
-
*/
|
|
1107
|
-
declare class SileroVADInference {
|
|
1108
|
-
private session;
|
|
1109
|
-
private ort;
|
|
1110
|
-
private config;
|
|
1111
|
-
private _backend;
|
|
1112
|
-
private isLoading;
|
|
1113
|
-
private state;
|
|
1114
|
-
private context;
|
|
1115
|
-
private readonly chunkSize;
|
|
1116
|
-
private readonly contextSize;
|
|
1117
|
-
private inferenceQueue;
|
|
1118
|
-
private preSpeechBuffer;
|
|
1119
|
-
private wasSpeaking;
|
|
1120
|
-
private srTensor;
|
|
1121
|
-
constructor(config: SileroVADConfig);
|
|
1122
|
-
get backend(): RuntimeBackend | null;
|
|
1123
|
-
get isLoaded(): boolean;
|
|
1124
|
-
get sampleRate(): number;
|
|
1125
|
-
get threshold(): number;
|
|
1126
|
-
/**
|
|
1127
|
-
* Get required chunk size in samples
|
|
1128
|
-
*/
|
|
1129
|
-
getChunkSize(): number;
|
|
1130
|
-
/**
|
|
1131
|
-
* Get chunk duration in milliseconds
|
|
1132
|
-
*/
|
|
1133
|
-
getChunkDurationMs(): number;
|
|
1134
|
-
/**
|
|
1135
|
-
* Check if WebGPU is available and working
|
|
1136
|
-
* (iOS returns false even if navigator.gpu exists due to ONNX Runtime bugs)
|
|
1137
|
-
*/
|
|
1138
|
-
static isWebGPUAvailable: typeof isWebGPUAvailable;
|
|
1139
|
-
/**
|
|
1140
|
-
* Load the ONNX model
|
|
1141
|
-
*/
|
|
1142
|
-
load(): Promise<VADModelInfo>;
|
|
1143
|
-
/**
|
|
1144
|
-
* Reset state for new audio stream
|
|
1145
|
-
*/
|
|
1146
|
-
reset(): void;
|
|
1147
|
-
/**
|
|
1148
|
-
* Process a single audio chunk
|
|
1149
|
-
*
|
|
1150
|
-
* @param audioChunk - Float32Array of exactly chunkSize samples (512 for 16kHz, 256 for 8kHz)
|
|
1151
|
-
* @returns VAD result with speech probability
|
|
1152
|
-
*/
|
|
1153
|
-
process(audioChunk: Float32Array): Promise<VADResult>;
|
|
1154
|
-
/**
|
|
1155
|
-
* Process audio and detect speech segments
|
|
1156
|
-
*
|
|
1157
|
-
* @param audio - Complete audio buffer
|
|
1158
|
-
* @param options - Detection options
|
|
1159
|
-
* @returns Array of speech segments
|
|
1160
|
-
*/
|
|
1161
|
-
detectSpeech(audio: Float32Array, options?: {
|
|
1162
|
-
/** Minimum speech duration in ms (default: 250) */
|
|
1163
|
-
minSpeechDurationMs?: number;
|
|
1164
|
-
/** Minimum silence duration to end segment in ms (default: 300) */
|
|
1165
|
-
minSilenceDurationMs?: number;
|
|
1166
|
-
/** Padding to add before/after speech in ms (default: 30) */
|
|
1167
|
-
speechPadMs?: number;
|
|
1168
|
-
}): Promise<SpeechSegment[]>;
|
|
1169
|
-
/**
|
|
1170
|
-
* Queue inference to serialize ONNX session calls
|
|
1171
|
-
*/
|
|
1172
|
-
private queueInference;
|
|
1173
|
-
/**
|
|
1174
|
-
* Dispose of the model and free resources
|
|
1175
|
-
*/
|
|
1176
|
-
dispose(): Promise<void>;
|
|
1177
|
-
}
|
|
1178
|
-
|
|
1179
|
-
/**
|
|
1180
|
-
* Silero VAD Web Worker implementation
|
|
1181
|
-
*
|
|
1182
|
-
* Runs Silero VAD inference in a dedicated Web Worker to prevent main thread blocking.
|
|
1183
|
-
* Uses inline worker script (Blob URL pattern) to avoid separate file deployment.
|
|
1184
|
-
*
|
|
1185
|
-
* Key design decisions:
|
|
1186
|
-
* - WASM backend only (WebGPU doesn't work in Workers)
|
|
1187
|
-
* - LSTM state serialized as Float32Array (Tensors can't cross worker boundary)
|
|
1188
|
-
* - Audio copied (not transferred) to retain main thread access for pre-speech buffer
|
|
1189
|
-
* - ONNX Runtime loaded from CDN in worker (no bundler complications)
|
|
1190
|
-
*
|
|
1191
|
-
* @category Inference
|
|
1192
|
-
*
|
|
1193
|
-
* @example Basic usage
|
|
1194
|
-
* ```typescript
|
|
1195
|
-
* import { SileroVADWorker } from '@omote/core';
|
|
1196
|
-
*
|
|
1197
|
-
* const vad = new SileroVADWorker({
|
|
1198
|
-
* modelUrl: '/models/silero-vad.onnx'
|
|
1199
|
-
* });
|
|
1200
|
-
* await vad.load();
|
|
1201
|
-
*
|
|
1202
|
-
* // Process 32ms chunks (512 samples at 16kHz)
|
|
1203
|
-
* const result = await vad.process(audioChunk);
|
|
1204
|
-
* if (result.isSpeech) {
|
|
1205
|
-
* console.log('Speech detected!', result.probability);
|
|
1206
|
-
* }
|
|
1207
|
-
* ```
|
|
1208
|
-
*/
|
|
1209
|
-
|
|
1210
1062
|
/**
|
|
1211
1063
|
* Configuration for Silero VAD Worker
|
|
1212
1064
|
*/
|
|
@@ -1219,13 +1071,6 @@ interface VADWorkerConfig {
|
|
|
1219
1071
|
threshold?: number;
|
|
1220
1072
|
/**
|
|
1221
1073
|
* Number of audio chunks to keep in pre-speech buffer.
|
|
1222
|
-
* When VAD triggers, these chunks are prepended to the speech buffer
|
|
1223
|
-
* to capture the beginning of speech that occurred before detection.
|
|
1224
|
-
*
|
|
1225
|
-
* At 512 samples/chunk and 16kHz:
|
|
1226
|
-
* - 10 chunks = 320ms of pre-speech audio
|
|
1227
|
-
* - 15 chunks = 480ms of pre-speech audio
|
|
1228
|
-
*
|
|
1229
1074
|
* Default: 10 chunks (320ms)
|
|
1230
1075
|
*/
|
|
1231
1076
|
preSpeechBufferChunks?: number;
|
|
@@ -1241,85 +1086,45 @@ interface VADWorkerModelInfo {
|
|
|
1241
1086
|
sampleRate: number;
|
|
1242
1087
|
chunkSize: number;
|
|
1243
1088
|
}
|
|
1244
|
-
|
|
1245
1089
|
/**
|
|
1246
|
-
*
|
|
1247
|
-
*
|
|
1248
|
-
* Runs Silero VAD inference off the main thread to prevent UI blocking.
|
|
1249
|
-
* Feature parity with SileroVADInference but runs in dedicated worker.
|
|
1250
|
-
*
|
|
1251
|
-
* @see SileroVADInference for main-thread version
|
|
1090
|
+
* Common interface for Silero VAD implementations
|
|
1252
1091
|
*/
|
|
1253
|
-
|
|
1254
|
-
|
|
1255
|
-
|
|
1256
|
-
|
|
1257
|
-
|
|
1258
|
-
|
|
1259
|
-
|
|
1260
|
-
|
|
1261
|
-
|
|
1262
|
-
private readonly contextSize;
|
|
1263
|
-
private inferenceQueue;
|
|
1264
|
-
private preSpeechBuffer;
|
|
1265
|
-
private wasSpeaking;
|
|
1266
|
-
private pendingResolvers;
|
|
1267
|
-
private messageId;
|
|
1268
|
-
constructor(config: VADWorkerConfig);
|
|
1269
|
-
get isLoaded(): boolean;
|
|
1270
|
-
/**
|
|
1271
|
-
* Backend type (always 'wasm' for Worker, WebGPU not supported in Workers)
|
|
1272
|
-
*/
|
|
1273
|
-
get backend(): 'wasm' | null;
|
|
1274
|
-
get sampleRate(): number;
|
|
1275
|
-
get threshold(): number;
|
|
1276
|
-
/**
|
|
1277
|
-
* Get required chunk size in samples
|
|
1278
|
-
*/
|
|
1279
|
-
getChunkSize(): number;
|
|
1280
|
-
/**
|
|
1281
|
-
* Get chunk duration in milliseconds
|
|
1282
|
-
*/
|
|
1283
|
-
getChunkDurationMs(): number;
|
|
1284
|
-
/**
|
|
1285
|
-
* Create the worker from inline script
|
|
1286
|
-
*/
|
|
1287
|
-
private createWorker;
|
|
1288
|
-
/**
|
|
1289
|
-
* Handle messages from worker
|
|
1290
|
-
*/
|
|
1291
|
-
private handleWorkerMessage;
|
|
1292
|
-
/**
|
|
1293
|
-
* Send message to worker and wait for response
|
|
1294
|
-
*/
|
|
1295
|
-
private sendMessage;
|
|
1296
|
-
/**
|
|
1297
|
-
* Load the ONNX model in the worker
|
|
1298
|
-
*/
|
|
1299
|
-
load(): Promise<VADWorkerModelInfo>;
|
|
1092
|
+
interface SileroVADBackend {
|
|
1093
|
+
/** Current backend type (webgpu, wasm, or null if not loaded) */
|
|
1094
|
+
readonly backend: RuntimeBackend | null;
|
|
1095
|
+
/** Whether the model is loaded and ready for inference */
|
|
1096
|
+
readonly isLoaded: boolean;
|
|
1097
|
+
/** Audio sample rate (8000 or 16000 Hz) */
|
|
1098
|
+
readonly sampleRate: number;
|
|
1099
|
+
/** Speech detection threshold (0-1) */
|
|
1100
|
+
readonly threshold: number;
|
|
1300
1101
|
/**
|
|
1301
|
-
*
|
|
1102
|
+
* Load the ONNX model
|
|
1103
|
+
* @returns Model loading information
|
|
1302
1104
|
*/
|
|
1303
|
-
|
|
1105
|
+
load(): Promise<VADModelInfo | VADWorkerModelInfo>;
|
|
1304
1106
|
/**
|
|
1305
1107
|
* Process a single audio chunk
|
|
1306
|
-
*
|
|
1307
|
-
* @param audioChunk - Float32Array of exactly chunkSize samples (512 for 16kHz, 256 for 8kHz)
|
|
1108
|
+
* @param audioChunk - Float32Array of exactly chunkSize samples
|
|
1308
1109
|
* @returns VAD result with speech probability
|
|
1309
1110
|
*/
|
|
1310
1111
|
process(audioChunk: Float32Array): Promise<VADResult>;
|
|
1311
1112
|
/**
|
|
1312
|
-
*
|
|
1113
|
+
* Reset state for new audio stream
|
|
1313
1114
|
*/
|
|
1314
|
-
|
|
1115
|
+
reset(): void | Promise<void>;
|
|
1315
1116
|
/**
|
|
1316
|
-
* Dispose of the
|
|
1117
|
+
* Dispose of the model and free resources
|
|
1317
1118
|
*/
|
|
1318
1119
|
dispose(): Promise<void>;
|
|
1319
1120
|
/**
|
|
1320
|
-
*
|
|
1121
|
+
* Get required chunk size in samples
|
|
1321
1122
|
*/
|
|
1322
|
-
|
|
1123
|
+
getChunkSize(): number;
|
|
1124
|
+
/**
|
|
1125
|
+
* Get chunk duration in milliseconds
|
|
1126
|
+
*/
|
|
1127
|
+
getChunkDurationMs(): number;
|
|
1323
1128
|
}
|
|
1324
1129
|
|
|
1325
1130
|
/**
|
|
@@ -1447,43 +1252,33 @@ declare class UnifiedInferenceWorker {
|
|
|
1447
1252
|
|
|
1448
1253
|
/** Base config shared across all inference factory functions */
|
|
1449
1254
|
interface InferenceFactoryConfig {
|
|
1450
|
-
/**
|
|
1451
|
-
* Worker mode:
|
|
1452
|
-
* - 'auto' (default): Use Worker if supported, else main thread
|
|
1453
|
-
* - true: Force Worker (throws if unsupported)
|
|
1454
|
-
* - false: Force main thread
|
|
1455
|
-
*/
|
|
1456
|
-
useWorker?: boolean | 'auto';
|
|
1457
1255
|
/**
|
|
1458
1256
|
* Unified inference worker instance.
|
|
1459
|
-
*
|
|
1257
|
+
* Routes inference through the shared worker,
|
|
1460
1258
|
* keeping all inference off the main thread.
|
|
1461
|
-
* Takes precedence over useWorker setting.
|
|
1462
1259
|
*/
|
|
1463
1260
|
unifiedWorker?: UnifiedInferenceWorker;
|
|
1464
1261
|
}
|
|
1465
1262
|
|
|
1466
1263
|
/**
|
|
1467
|
-
* Factory function for A2E inference
|
|
1264
|
+
* Factory function for A2E inference via UnifiedInferenceWorker
|
|
1468
1265
|
*
|
|
1469
1266
|
* Creates an A2EBackend instance with zero-config defaults (HuggingFace CDN).
|
|
1470
|
-
*
|
|
1267
|
+
* Routes inference through the shared unified worker.
|
|
1471
1268
|
*
|
|
1472
1269
|
* @category Inference
|
|
1473
1270
|
*
|
|
1474
|
-
* @example
|
|
1271
|
+
* @example
|
|
1475
1272
|
* ```typescript
|
|
1476
|
-
* import { createA2E } from '@omote/core';
|
|
1273
|
+
* import { createA2E, UnifiedInferenceWorker } from '@omote/core';
|
|
1274
|
+
*
|
|
1275
|
+
* const worker = new UnifiedInferenceWorker();
|
|
1276
|
+
* await worker.init();
|
|
1477
1277
|
*
|
|
1478
|
-
* const a2e = createA2E(
|
|
1278
|
+
* const a2e = createA2E({ unifiedWorker: worker });
|
|
1479
1279
|
* await a2e.load();
|
|
1480
1280
|
* const { blendshapes } = await a2e.infer(audioSamples);
|
|
1481
1281
|
* ```
|
|
1482
|
-
*
|
|
1483
|
-
* @example Custom model URL
|
|
1484
|
-
* ```typescript
|
|
1485
|
-
* const a2e = createA2E({ modelUrl: '/models/lam.onnx' });
|
|
1486
|
-
* ```
|
|
1487
1282
|
*/
|
|
1488
1283
|
|
|
1489
1284
|
/**
|
|
@@ -1499,13 +1294,13 @@ interface CreateA2EConfig extends InferenceFactoryConfig {
|
|
|
1499
1294
|
* Set to `false` to skip external data loading (single-file models only).
|
|
1500
1295
|
*/
|
|
1501
1296
|
externalDataUrl?: string | false;
|
|
1502
|
-
/** Backend preference (default: 'auto') */
|
|
1503
|
-
backend?: BackendPreference;
|
|
1504
1297
|
/** Number of identity classes (default: 12) */
|
|
1505
1298
|
numIdentityClasses?: number;
|
|
1506
1299
|
}
|
|
1507
1300
|
/**
|
|
1508
|
-
* Create an A2E instance
|
|
1301
|
+
* Create an A2E instance via the unified worker.
|
|
1302
|
+
*
|
|
1303
|
+
* If no `unifiedWorker` is provided, a dedicated worker is created on load().
|
|
1509
1304
|
*
|
|
1510
1305
|
* @param config - Factory configuration
|
|
1511
1306
|
* @returns An A2EBackend instance
|
|
@@ -1521,7 +1316,7 @@ declare function createA2E(config?: CreateA2EConfig): A2EBackend;
|
|
|
1521
1316
|
/**
|
|
1522
1317
|
* Generic frame source -- any object that emits 'frame' events with blendshapes.
|
|
1523
1318
|
*
|
|
1524
|
-
* Implemented by PlaybackPipeline, MicLipSync,
|
|
1319
|
+
* Implemented by PlaybackPipeline, MicLipSync, and any custom source.
|
|
1525
1320
|
* Used by OmoteAvatar (all renderer adapters) to receive animation frames.
|
|
1526
1321
|
*/
|
|
1527
1322
|
interface FrameSource {
|
|
@@ -1550,7 +1345,7 @@ interface TranscriptResult {
|
|
|
1550
1345
|
inferenceTimeMs?: number;
|
|
1551
1346
|
}
|
|
1552
1347
|
/**
|
|
1553
|
-
* Consumer's response handler.
|
|
1348
|
+
* Consumer's response handler. VoiceOrchestrator calls this with transcribed text.
|
|
1554
1349
|
* Consumer must stream audio back for playback + lip sync.
|
|
1555
1350
|
*/
|
|
1556
1351
|
interface ResponseHandler {
|
|
@@ -1581,6 +1376,8 @@ interface ResponseHandler {
|
|
|
1581
1376
|
*/
|
|
1582
1377
|
|
|
1583
1378
|
interface TTSSpeakerConfig {
|
|
1379
|
+
/** Skip LAM download — audio playback only, no lip sync. Default: false. */
|
|
1380
|
+
audioOnly?: boolean;
|
|
1584
1381
|
/** Per-character expression weight scaling */
|
|
1585
1382
|
profile?: ExpressionProfile;
|
|
1586
1383
|
/** Identity/style index for A2E model (default: 0) */
|
|
@@ -1593,8 +1390,8 @@ interface TTSSpeakerConfig {
|
|
|
1593
1390
|
neutralTransitionMs?: number;
|
|
1594
1391
|
/** Pre-built A2E backend (skip internal createA2E). */
|
|
1595
1392
|
lam?: A2EBackend;
|
|
1596
|
-
/** LAM model config (only when lam not provided) */
|
|
1597
|
-
models?: CreateA2EConfig
|
|
1393
|
+
/** LAM model config (only when lam not provided). unifiedWorker is supplied by TTSSpeaker. */
|
|
1394
|
+
models?: Omit<CreateA2EConfig, 'unifiedWorker'>;
|
|
1598
1395
|
/** Shared unified worker (recommended for iOS) */
|
|
1599
1396
|
unifiedWorker?: UnifiedInferenceWorker;
|
|
1600
1397
|
}
|
|
@@ -1603,6 +1400,7 @@ declare class TTSSpeaker {
|
|
|
1603
1400
|
private tts;
|
|
1604
1401
|
private ownedLam;
|
|
1605
1402
|
private ownedWorker;
|
|
1403
|
+
private usesSharedWorker;
|
|
1606
1404
|
private currentAbort;
|
|
1607
1405
|
private _isSpeaking;
|
|
1608
1406
|
private _audioOnly;
|
|
@@ -1616,11 +1414,8 @@ declare class TTSSpeaker {
|
|
|
1616
1414
|
/**
|
|
1617
1415
|
* Connect a TTS backend.
|
|
1618
1416
|
*
|
|
1619
|
-
*
|
|
1620
|
-
*
|
|
1621
|
-
*
|
|
1622
|
-
* When config is omitted or has none of those, audio-only mode is used:
|
|
1623
|
-
* TTS → AudioScheduler (speakers only, no blendshapes, no LAM download).
|
|
1417
|
+
* By default, the full lip sync pipeline is created (auto-downloads LAM).
|
|
1418
|
+
* Pass `audioOnly: true` for audio-only mode (no blendshapes, no LAM download).
|
|
1624
1419
|
*
|
|
1625
1420
|
* @param tts - TTS backend to use for speech synthesis
|
|
1626
1421
|
* @param config - Optional configuration for A2E, expression profile, etc.
|
|
@@ -1636,6 +1431,8 @@ declare class TTSSpeaker {
|
|
|
1636
1431
|
speak(text: string, options?: {
|
|
1637
1432
|
signal?: AbortSignal;
|
|
1638
1433
|
voice?: string;
|
|
1434
|
+
speed?: number;
|
|
1435
|
+
language?: string;
|
|
1639
1436
|
}): Promise<void>;
|
|
1640
1437
|
/** Audio-only speak: TTS → resample → AudioScheduler (no blendshapes). */
|
|
1641
1438
|
private speakAudioOnly;
|
|
@@ -1655,13 +1452,20 @@ declare class TTSSpeaker {
|
|
|
1655
1452
|
streamText(options: {
|
|
1656
1453
|
signal?: AbortSignal;
|
|
1657
1454
|
voice?: string;
|
|
1455
|
+
speed?: number;
|
|
1456
|
+
language?: string;
|
|
1658
1457
|
}): Promise<{
|
|
1659
1458
|
push: (token: string) => void;
|
|
1660
1459
|
end: () => Promise<void>;
|
|
1661
1460
|
}>;
|
|
1662
1461
|
/** streamText in audio-only mode: TTS → AudioScheduler (no blendshapes). */
|
|
1663
1462
|
private streamTextAudioOnly;
|
|
1664
|
-
/**
|
|
1463
|
+
/**
|
|
1464
|
+
* Warm up AudioContext for iOS/Safari autoplay policy.
|
|
1465
|
+
* Call from a user gesture handler (click/tap) before speak().
|
|
1466
|
+
*/
|
|
1467
|
+
warmup(): Promise<void>;
|
|
1468
|
+
/** Abort current speak if any. Triggers neutral transition on PlaybackPipeline. */
|
|
1665
1469
|
stop(): void;
|
|
1666
1470
|
/** Clean teardown of all owned resources. */
|
|
1667
1471
|
dispose(): Promise<void>;
|
|
@@ -1697,11 +1501,13 @@ interface CreateTTSPlayerConfig {
|
|
|
1697
1501
|
modelUrl?: string;
|
|
1698
1502
|
/** Voice data base URL override */
|
|
1699
1503
|
voiceBaseUrl?: string;
|
|
1504
|
+
/** Shared unified worker (created automatically if not provided) */
|
|
1505
|
+
unifiedWorker?: UnifiedInferenceWorker;
|
|
1700
1506
|
}
|
|
1701
1507
|
/**
|
|
1702
1508
|
* Zero-config TTS player. Speak text through speakers without an avatar.
|
|
1703
1509
|
*
|
|
1704
|
-
* Uses Kokoro TTS (82M q8, ~92MB) with automatic worker
|
|
1510
|
+
* Uses Kokoro TTS (82M q8, ~92MB) with automatic worker creation.
|
|
1705
1511
|
* No LAM model is downloaded — audio plays directly through AudioScheduler.
|
|
1706
1512
|
*/
|
|
1707
1513
|
declare function createTTSPlayer(config?: CreateTTSPlayerConfig): TTSPlayer;
|
|
@@ -1710,254 +1516,27 @@ declare function createTTSPlayer(config?: CreateTTSPlayerConfig): TTSPlayer;
|
|
|
1710
1516
|
*/
|
|
1711
1517
|
declare class TTSPlayer extends TTSSpeaker {
|
|
1712
1518
|
private backend;
|
|
1713
|
-
|
|
1519
|
+
private ttsWorker;
|
|
1520
|
+
private ttsPlayerUsesSharedWorker;
|
|
1521
|
+
private ttsConfig;
|
|
1522
|
+
constructor(config?: CreateTTSPlayerConfig);
|
|
1714
1523
|
/** Load TTS model and connect in audio-only mode. */
|
|
1715
1524
|
load(): Promise<void>;
|
|
1716
1525
|
/** Whether the TTS model is loaded and ready. */
|
|
1717
1526
|
get isLoaded(): boolean;
|
|
1527
|
+
dispose(): Promise<void>;
|
|
1718
1528
|
}
|
|
1719
1529
|
|
|
1720
1530
|
/**
|
|
1721
|
-
*
|
|
1722
|
-
*
|
|
1723
|
-
* Provides a unified API that automatically selects the optimal implementation:
|
|
1724
|
-
* - Worker supported: Uses SenseVoiceWorker (off-main-thread inference)
|
|
1725
|
-
* - Worker unsupported: Uses SenseVoiceInference (main thread)
|
|
1726
|
-
*
|
|
1727
|
-
* @category Inference
|
|
1728
|
-
*
|
|
1729
|
-
* @example Auto-detect (recommended)
|
|
1730
|
-
* ```typescript
|
|
1731
|
-
* import { createSenseVoice } from '@omote/core';
|
|
1531
|
+
* SpeechListener — Standalone listening primitive.
|
|
1732
1532
|
*
|
|
1733
|
-
*
|
|
1734
|
-
*
|
|
1735
|
-
* });
|
|
1736
|
-
* await asr.load();
|
|
1737
|
-
* const { text, emotion } = await asr.transcribe(audioSamples);
|
|
1738
|
-
* ```
|
|
1533
|
+
* Composes: MicrophoneCapture → SileroVAD → SenseVoice ASR → transcript events.
|
|
1534
|
+
* Used independently or alongside TTSSpeaker and VoiceOrchestrator.
|
|
1739
1535
|
*
|
|
1740
|
-
*
|
|
1741
|
-
*
|
|
1742
|
-
* const asr = createSenseVoice({
|
|
1743
|
-
* modelUrl: '/models/sensevoice/model.int8.onnx',
|
|
1744
|
-
* useWorker: true,
|
|
1745
|
-
* });
|
|
1746
|
-
* ```
|
|
1536
|
+
* Does NOT handle TTS, LAM, or response routing — those belong to TTSSpeaker
|
|
1537
|
+
* and VoiceOrchestrator respectively.
|
|
1747
1538
|
*
|
|
1748
|
-
* @
|
|
1749
|
-
* ```typescript
|
|
1750
|
-
* const asr = createSenseVoice({
|
|
1751
|
-
* modelUrl: '/models/sensevoice/model.int8.onnx',
|
|
1752
|
-
* useWorker: false,
|
|
1753
|
-
* });
|
|
1754
|
-
* ```
|
|
1755
|
-
*/
|
|
1756
|
-
|
|
1757
|
-
/**
|
|
1758
|
-
* Common interface for both SenseVoiceInference and SenseVoiceWorker
|
|
1759
|
-
*/
|
|
1760
|
-
interface SenseVoiceBackend {
|
|
1761
|
-
/** Whether the model is loaded and ready for inference */
|
|
1762
|
-
readonly isLoaded: boolean;
|
|
1763
|
-
/** Current backend type ('wasm' for worker, full backend for main thread, null if not loaded) */
|
|
1764
|
-
readonly backend: 'wasm' | 'webgpu' | null;
|
|
1765
|
-
/**
|
|
1766
|
-
* Load the ONNX model
|
|
1767
|
-
* @param onProgress - Optional progress callback (fires once at 100% for worker)
|
|
1768
|
-
* @returns Model loading information
|
|
1769
|
-
*/
|
|
1770
|
-
load(onProgress?: (loaded: number, total: number) => void): Promise<SenseVoiceModelInfo>;
|
|
1771
|
-
/**
|
|
1772
|
-
* Transcribe audio samples to text
|
|
1773
|
-
* @param audioSamples - Float32Array of audio samples at 16kHz
|
|
1774
|
-
* @returns Transcription result
|
|
1775
|
-
*/
|
|
1776
|
-
transcribe(audioSamples: Float32Array): Promise<SenseVoiceResult>;
|
|
1777
|
-
/**
|
|
1778
|
-
* Dispose of the model and free resources
|
|
1779
|
-
*/
|
|
1780
|
-
dispose(): Promise<void>;
|
|
1781
|
-
}
|
|
1782
|
-
/**
|
|
1783
|
-
* Configuration for the SenseVoice factory
|
|
1784
|
-
*/
|
|
1785
|
-
interface CreateSenseVoiceConfig extends InferenceFactoryConfig {
|
|
1786
|
-
/** Path or URL to model.int8.onnx (239MB). Default: HuggingFace CDN */
|
|
1787
|
-
modelUrl?: string;
|
|
1788
|
-
/** Path or URL to tokens.txt vocabulary file (default: sibling of modelUrl) */
|
|
1789
|
-
tokensUrl?: string;
|
|
1790
|
-
/** Language hint (default: 'auto') */
|
|
1791
|
-
language?: SenseVoiceLanguage;
|
|
1792
|
-
/** Text normalization (default: 'with_itn') */
|
|
1793
|
-
textNorm?: 'with_itn' | 'without_itn';
|
|
1794
|
-
}
|
|
1795
|
-
/**
|
|
1796
|
-
* Create a SenseVoice ASR instance with automatic implementation selection
|
|
1797
|
-
*
|
|
1798
|
-
* @param config - Factory configuration
|
|
1799
|
-
* @returns A SenseVoiceBackend instance (either Worker or main thread)
|
|
1800
|
-
*/
|
|
1801
|
-
declare function createSenseVoice(config?: CreateSenseVoiceConfig): SenseVoiceBackend;
|
|
1802
|
-
|
|
1803
|
-
/**
|
|
1804
|
-
* Factory function for Silero VAD with automatic Worker vs main thread selection
|
|
1805
|
-
*
|
|
1806
|
-
* Provides a unified API that automatically selects the optimal implementation:
|
|
1807
|
-
* - Desktop browsers: Uses SileroVADWorker (off-main-thread inference)
|
|
1808
|
-
* - Mobile devices: Uses SileroVADInference (main thread, avoids memory overhead)
|
|
1809
|
-
* - Fallback: Gracefully falls back to main thread if Worker fails
|
|
1810
|
-
*
|
|
1811
|
-
* @category Inference
|
|
1812
|
-
*
|
|
1813
|
-
* @example Basic usage (auto-detect)
|
|
1814
|
-
* ```typescript
|
|
1815
|
-
* import { createSileroVAD } from '@omote/core';
|
|
1816
|
-
*
|
|
1817
|
-
* const vad = createSileroVAD({
|
|
1818
|
-
* modelUrl: '/models/silero-vad.onnx',
|
|
1819
|
-
* threshold: 0.5,
|
|
1820
|
-
* });
|
|
1821
|
-
*
|
|
1822
|
-
* await vad.load();
|
|
1823
|
-
* const result = await vad.process(audioChunk);
|
|
1824
|
-
* if (result.isSpeech) {
|
|
1825
|
-
* console.log('Speech detected!', result.probability);
|
|
1826
|
-
* }
|
|
1827
|
-
* ```
|
|
1828
|
-
*
|
|
1829
|
-
* @example Force worker usage
|
|
1830
|
-
* ```typescript
|
|
1831
|
-
* const vad = createSileroVAD({
|
|
1832
|
-
* modelUrl: '/models/silero-vad.onnx',
|
|
1833
|
-
* useWorker: true, // Force Worker even on mobile
|
|
1834
|
-
* });
|
|
1835
|
-
* ```
|
|
1836
|
-
*
|
|
1837
|
-
* @example Force main thread
|
|
1838
|
-
* ```typescript
|
|
1839
|
-
* const vad = createSileroVAD({
|
|
1840
|
-
* modelUrl: '/models/silero-vad.onnx',
|
|
1841
|
-
* useWorker: false, // Force main thread
|
|
1842
|
-
* });
|
|
1843
|
-
* ```
|
|
1844
|
-
*/
|
|
1845
|
-
|
|
1846
|
-
/**
|
|
1847
|
-
* Common interface for both SileroVADInference and SileroVADWorker
|
|
1848
|
-
*
|
|
1849
|
-
* This interface defines the shared API that both implementations provide,
|
|
1850
|
-
* allowing consumers to use either interchangeably.
|
|
1851
|
-
*/
|
|
1852
|
-
interface SileroVADBackend {
|
|
1853
|
-
/** Current backend type (webgpu, wasm, or null if not loaded) */
|
|
1854
|
-
readonly backend: RuntimeBackend | null;
|
|
1855
|
-
/** Whether the model is loaded and ready for inference */
|
|
1856
|
-
readonly isLoaded: boolean;
|
|
1857
|
-
/** Audio sample rate (8000 or 16000 Hz) */
|
|
1858
|
-
readonly sampleRate: number;
|
|
1859
|
-
/** Speech detection threshold (0-1) */
|
|
1860
|
-
readonly threshold: number;
|
|
1861
|
-
/**
|
|
1862
|
-
* Load the ONNX model
|
|
1863
|
-
* @returns Model loading information
|
|
1864
|
-
*/
|
|
1865
|
-
load(): Promise<VADModelInfo | VADWorkerModelInfo>;
|
|
1866
|
-
/**
|
|
1867
|
-
* Process a single audio chunk
|
|
1868
|
-
* @param audioChunk - Float32Array of exactly chunkSize samples
|
|
1869
|
-
* @returns VAD result with speech probability
|
|
1870
|
-
*/
|
|
1871
|
-
process(audioChunk: Float32Array): Promise<VADResult>;
|
|
1872
|
-
/**
|
|
1873
|
-
* Reset state for new audio stream
|
|
1874
|
-
*/
|
|
1875
|
-
reset(): void | Promise<void>;
|
|
1876
|
-
/**
|
|
1877
|
-
* Dispose of the model and free resources
|
|
1878
|
-
*/
|
|
1879
|
-
dispose(): Promise<void>;
|
|
1880
|
-
/**
|
|
1881
|
-
* Get required chunk size in samples
|
|
1882
|
-
*/
|
|
1883
|
-
getChunkSize(): number;
|
|
1884
|
-
/**
|
|
1885
|
-
* Get chunk duration in milliseconds
|
|
1886
|
-
*/
|
|
1887
|
-
getChunkDurationMs(): number;
|
|
1888
|
-
}
|
|
1889
|
-
/**
|
|
1890
|
-
* Configuration for the Silero VAD factory
|
|
1891
|
-
*
|
|
1892
|
-
* Extends SileroVADConfig with worker-specific options.
|
|
1893
|
-
*/
|
|
1894
|
-
interface SileroVADFactoryConfig extends Omit<SileroVADConfig, 'modelUrl'>, InferenceFactoryConfig {
|
|
1895
|
-
/** Path or URL to the ONNX model. Default: HuggingFace CDN */
|
|
1896
|
-
modelUrl?: string;
|
|
1897
|
-
/**
|
|
1898
|
-
* Fallback to main thread on worker errors.
|
|
1899
|
-
*
|
|
1900
|
-
* When true (default), if the Worker fails to load or encounters an error,
|
|
1901
|
-
* the factory will automatically create a main thread instance instead.
|
|
1902
|
-
*
|
|
1903
|
-
* When false, worker errors will propagate as exceptions.
|
|
1904
|
-
*
|
|
1905
|
-
* Default: true
|
|
1906
|
-
*/
|
|
1907
|
-
fallbackOnError?: boolean;
|
|
1908
|
-
}
|
|
1909
|
-
/**
|
|
1910
|
-
* Check if the current environment supports VAD Web Workers
|
|
1911
|
-
*
|
|
1912
|
-
* Requirements:
|
|
1913
|
-
* - Worker constructor must exist
|
|
1914
|
-
* - Blob URL support (for inline worker script)
|
|
1915
|
-
*
|
|
1916
|
-
* @returns true if VAD Worker is supported
|
|
1917
|
-
*/
|
|
1918
|
-
declare function supportsVADWorker(): boolean;
|
|
1919
|
-
/**
|
|
1920
|
-
* Create a Silero VAD instance with automatic implementation selection
|
|
1921
|
-
*
|
|
1922
|
-
* This factory function automatically selects between:
|
|
1923
|
-
* - **SileroVADWorker**: Off-main-thread inference (better for desktop)
|
|
1924
|
-
* - **SileroVADInference**: Main thread inference (better for mobile)
|
|
1925
|
-
*
|
|
1926
|
-
* The selection is based on:
|
|
1927
|
-
* 1. Explicit `useWorker` config (if provided)
|
|
1928
|
-
* 2. Platform detection (mobile vs desktop)
|
|
1929
|
-
* 3. Worker API availability
|
|
1930
|
-
*
|
|
1931
|
-
* Both implementations share the same interface (SileroVADBackend),
|
|
1932
|
-
* so consumers can use either interchangeably.
|
|
1933
|
-
*
|
|
1934
|
-
* @param config - Factory configuration
|
|
1935
|
-
* @returns A SileroVAD instance (either Worker or main thread)
|
|
1936
|
-
*
|
|
1937
|
-
* @example
|
|
1938
|
-
* ```typescript
|
|
1939
|
-
* // Auto-detect (recommended)
|
|
1940
|
-
* const vad = createSileroVAD({ modelUrl: '/models/silero-vad.onnx' });
|
|
1941
|
-
*
|
|
1942
|
-
* // Force Worker
|
|
1943
|
-
* const vadWorker = createSileroVAD({ modelUrl: '/models/silero-vad.onnx', useWorker: true });
|
|
1944
|
-
*
|
|
1945
|
-
* // Force main thread
|
|
1946
|
-
* const vadMain = createSileroVAD({ modelUrl: '/models/silero-vad.onnx', useWorker: false });
|
|
1947
|
-
* ```
|
|
1948
|
-
*/
|
|
1949
|
-
declare function createSileroVAD(config?: SileroVADFactoryConfig): SileroVADBackend;
|
|
1950
|
-
|
|
1951
|
-
/**
|
|
1952
|
-
* SpeechListener — Standalone listening primitive.
|
|
1953
|
-
*
|
|
1954
|
-
* Composes: MicrophoneCapture → SileroVAD → SenseVoice ASR → transcript events.
|
|
1955
|
-
* Extracted from VoicePipeline's listening half so it can be used independently.
|
|
1956
|
-
*
|
|
1957
|
-
* Does NOT handle TTS, LAM, or response routing — those belong to TTSSpeaker
|
|
1958
|
-
* and VoicePipeline respectively.
|
|
1959
|
-
*
|
|
1960
|
-
* @category Audio
|
|
1539
|
+
* @category Audio
|
|
1961
1540
|
*/
|
|
1962
1541
|
|
|
1963
1542
|
interface SpeechListenerConfig {
|
|
@@ -1974,6 +1553,7 @@ interface SpeechListenerConfig {
|
|
|
1974
1553
|
modelUrl: string;
|
|
1975
1554
|
tokensUrl?: string;
|
|
1976
1555
|
language?: string;
|
|
1556
|
+
textNorm?: 'with_itn' | 'without_itn';
|
|
1977
1557
|
};
|
|
1978
1558
|
vad: {
|
|
1979
1559
|
modelUrl: string;
|
|
@@ -2028,6 +1608,7 @@ declare class SpeechListener extends EventEmitter<SpeechListenerEvents> {
|
|
|
2028
1608
|
private asr;
|
|
2029
1609
|
private vad;
|
|
2030
1610
|
private ownedWorker;
|
|
1611
|
+
private usesSharedWorker;
|
|
2031
1612
|
private mic;
|
|
2032
1613
|
private omoteEvents;
|
|
2033
1614
|
private _unsubChunk;
|
|
@@ -2157,114 +1738,48 @@ declare class InterruptionHandler extends EventEmitter<InterruptionEvents> {
|
|
|
2157
1738
|
}
|
|
2158
1739
|
|
|
2159
1740
|
/**
|
|
2160
|
-
* SenseVoice ASR
|
|
2161
|
-
*
|
|
2162
|
-
* Runs SenseVoice speech recognition in a dedicated Web Worker to prevent
|
|
2163
|
-
* main thread blocking. Uses inline worker script (Blob URL pattern) to
|
|
2164
|
-
* avoid separate file deployment.
|
|
2165
|
-
*
|
|
2166
|
-
* Key design decisions:
|
|
2167
|
-
* - WASM backend only (WebGPU doesn't work in Workers)
|
|
2168
|
-
* - All preprocessing (fbank, LFR, CMVN) and CTC decoding inlined in worker
|
|
2169
|
-
* - Audio copied (not transferred) to retain main thread access
|
|
2170
|
-
* - ONNX Runtime loaded from CDN in worker (no bundler complications)
|
|
2171
|
-
* - iOS: model URL passed as string to ORT (avoids 239MB JS heap allocation)
|
|
1741
|
+
* Factory function for SenseVoice ASR via UnifiedInferenceWorker
|
|
2172
1742
|
*
|
|
2173
1743
|
* @category Inference
|
|
2174
1744
|
*
|
|
2175
|
-
* @example
|
|
1745
|
+
* @example
|
|
2176
1746
|
* ```typescript
|
|
2177
|
-
* import {
|
|
1747
|
+
* import { createSenseVoice, UnifiedInferenceWorker } from '@omote/core';
|
|
2178
1748
|
*
|
|
2179
|
-
* const
|
|
1749
|
+
* const worker = new UnifiedInferenceWorker();
|
|
1750
|
+
* await worker.init();
|
|
1751
|
+
*
|
|
1752
|
+
* const asr = createSenseVoice({
|
|
2180
1753
|
* modelUrl: '/models/sensevoice/model.int8.onnx',
|
|
2181
|
-
*
|
|
1754
|
+
* unifiedWorker: worker,
|
|
2182
1755
|
* });
|
|
2183
1756
|
* await asr.load();
|
|
2184
|
-
*
|
|
2185
|
-
* const { text, emotion, language } = await asr.transcribe(audioSamples);
|
|
2186
|
-
* console.log(text); // "Hello world"
|
|
2187
|
-
* console.log(emotion); // "NEUTRAL"
|
|
2188
|
-
* console.log(language); // "en"
|
|
1757
|
+
* const { text, emotion } = await asr.transcribe(audioSamples);
|
|
2189
1758
|
* ```
|
|
2190
1759
|
*/
|
|
2191
1760
|
|
|
2192
1761
|
/**
|
|
2193
|
-
* Configuration for SenseVoice
|
|
1762
|
+
* Configuration for the SenseVoice factory
|
|
2194
1763
|
*/
|
|
2195
|
-
interface
|
|
2196
|
-
/** Path or URL to model.int8.onnx (239MB) */
|
|
2197
|
-
modelUrl
|
|
1764
|
+
interface CreateSenseVoiceConfig extends InferenceFactoryConfig {
|
|
1765
|
+
/** Path or URL to model.int8.onnx (239MB). Default: HuggingFace CDN */
|
|
1766
|
+
modelUrl?: string;
|
|
2198
1767
|
/** Path or URL to tokens.txt vocabulary file (default: sibling of modelUrl) */
|
|
2199
1768
|
tokensUrl?: string;
|
|
2200
|
-
/** Language hint (default: 'auto'
|
|
1769
|
+
/** Language hint (default: 'auto') */
|
|
2201
1770
|
language?: SenseVoiceLanguage;
|
|
2202
|
-
/** Text normalization
|
|
1771
|
+
/** Text normalization (default: 'with_itn') */
|
|
2203
1772
|
textNorm?: 'with_itn' | 'without_itn';
|
|
2204
1773
|
}
|
|
2205
1774
|
/**
|
|
2206
|
-
* SenseVoice ASR
|
|
1775
|
+
* Create a SenseVoice ASR instance via the unified worker.
|
|
2207
1776
|
*
|
|
2208
|
-
*
|
|
2209
|
-
* All preprocessing (fbank, LFR, CMVN) and CTC decoding run in the worker.
|
|
1777
|
+
* If no `unifiedWorker` is provided, a dedicated worker is created on load().
|
|
2210
1778
|
*
|
|
2211
|
-
* @
|
|
1779
|
+
* @param config - Factory configuration
|
|
1780
|
+
* @returns A SenseVoiceBackend instance
|
|
2212
1781
|
*/
|
|
2213
|
-
declare
|
|
2214
|
-
private worker;
|
|
2215
|
-
private config;
|
|
2216
|
-
private isLoading;
|
|
2217
|
-
private _isLoaded;
|
|
2218
|
-
private inferenceQueue;
|
|
2219
|
-
private poisoned;
|
|
2220
|
-
private pendingResolvers;
|
|
2221
|
-
private languageId;
|
|
2222
|
-
private textNormId;
|
|
2223
|
-
constructor(config: SenseVoiceWorkerConfig);
|
|
2224
|
-
get isLoaded(): boolean;
|
|
2225
|
-
/**
|
|
2226
|
-
* Backend type (always 'wasm' for Worker, WebGPU not supported in Workers)
|
|
2227
|
-
*/
|
|
2228
|
-
get backend(): 'wasm' | null;
|
|
2229
|
-
/**
|
|
2230
|
-
* Create the worker from inline script
|
|
2231
|
-
*/
|
|
2232
|
-
private createWorker;
|
|
2233
|
-
/**
|
|
2234
|
-
* Handle messages from worker
|
|
2235
|
-
*/
|
|
2236
|
-
private handleWorkerMessage;
|
|
2237
|
-
/**
|
|
2238
|
-
* Send message to worker and wait for response
|
|
2239
|
-
*/
|
|
2240
|
-
private sendMessage;
|
|
2241
|
-
/**
|
|
2242
|
-
* Load the ONNX model in the worker
|
|
2243
|
-
*
|
|
2244
|
-
* @param onProgress - Optional progress callback. Fires once at 100% when load completes
|
|
2245
|
-
* (the worker downloads and loads the model internally, so granular progress is not available).
|
|
2246
|
-
*/
|
|
2247
|
-
load(onProgress?: (loaded: number, total: number) => void): Promise<SenseVoiceModelInfo>;
|
|
2248
|
-
/**
|
|
2249
|
-
* Transcribe audio samples to text
|
|
2250
|
-
*
|
|
2251
|
-
* @param audioSamples Float32Array of audio samples at 16kHz, [-1, 1] range
|
|
2252
|
-
* @returns Transcription result with text, emotion, language, and event
|
|
2253
|
-
*/
|
|
2254
|
-
transcribe(audioSamples: Float32Array): Promise<SenseVoiceResult>;
|
|
2255
|
-
/**
|
|
2256
|
-
* Queue inference to serialize worker calls
|
|
2257
|
-
*/
|
|
2258
|
-
private queueInference;
|
|
2259
|
-
/**
|
|
2260
|
-
* Dispose of the worker and free resources
|
|
2261
|
-
*/
|
|
2262
|
-
dispose(): Promise<void>;
|
|
2263
|
-
/**
|
|
2264
|
-
* Check if Web Workers are supported
|
|
2265
|
-
*/
|
|
2266
|
-
static isSupported(): boolean;
|
|
2267
|
-
}
|
|
1782
|
+
declare function createSenseVoice(config?: CreateSenseVoiceConfig): SenseVoiceBackend;
|
|
2268
1783
|
|
|
2269
1784
|
/**
|
|
2270
1785
|
* Shared blendshape constants and utilities for lip sync inference
|
|
@@ -2298,100 +1813,6 @@ declare const LAM_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browI
|
|
|
2298
1813
|
*/
|
|
2299
1814
|
declare function lerpBlendshapes(current: Float32Array | number[], target: Float32Array | number[], factor?: number): number[];
|
|
2300
1815
|
|
|
2301
|
-
/**
|
|
2302
|
-
* A2E inference engine for Audio-to-Expression (LAM model)
|
|
2303
|
-
*
|
|
2304
|
-
* Runs entirely in the browser using WebGPU or WASM.
|
|
2305
|
-
* Takes raw 16kHz audio and outputs 52 ARKit blendshapes for lip sync.
|
|
2306
|
-
* Uses the LAM (Large Animation Model) — see {@link A2EBackend} for the interface.
|
|
2307
|
-
*
|
|
2308
|
-
* @see {@link createA2E} for the recommended zero-config factory
|
|
2309
|
-
* @see {@link A2EBackend} for the common interface
|
|
2310
|
-
* @category Inference
|
|
2311
|
-
*
|
|
2312
|
-
* @example Basic usage
|
|
2313
|
-
* ```typescript
|
|
2314
|
-
* import { A2EInference } from '@omote/core';
|
|
2315
|
-
*
|
|
2316
|
-
* const a2e = new A2EInference({ modelUrl: '/models/lam.onnx' });
|
|
2317
|
-
* await a2e.load();
|
|
2318
|
-
*
|
|
2319
|
-
* // Process 1 second of audio (16kHz = 16000 samples)
|
|
2320
|
-
* const result = await a2e.infer(audioSamples);
|
|
2321
|
-
* console.log('Blendshapes:', result.blendshapes); // [30, 52] for 30fps
|
|
2322
|
-
* ```
|
|
2323
|
-
*/
|
|
2324
|
-
|
|
2325
|
-
interface A2EInferenceConfig {
|
|
2326
|
-
/** Path or URL to the ONNX model */
|
|
2327
|
-
modelUrl: string;
|
|
2328
|
-
/**
|
|
2329
|
-
* Path or URL to external model data file (.onnx.data weights).
|
|
2330
|
-
* Default: `${modelUrl}.data` (e.g., /models/model.onnx.data)
|
|
2331
|
-
*
|
|
2332
|
-
* Set to `false` to skip external data loading (single-file models only).
|
|
2333
|
-
*/
|
|
2334
|
-
externalDataUrl?: string | false;
|
|
2335
|
-
/** Preferred backend (auto will try WebGPU first, fallback to WASM) */
|
|
2336
|
-
backend?: BackendPreference;
|
|
2337
|
-
/** Number of identity classes (default: 12 for streaming model) */
|
|
2338
|
-
numIdentityClasses?: number;
|
|
2339
|
-
/**
|
|
2340
|
-
* Number of audio samples per inference chunk (default: 16000).
|
|
2341
|
-
* Model supports variable chunk sizes. Smaller chunks = lower latency,
|
|
2342
|
-
* more inference overhead. 8000 (500ms) is recommended for real-time lip sync.
|
|
2343
|
-
*/
|
|
2344
|
-
chunkSize?: number;
|
|
2345
|
-
}
|
|
2346
|
-
|
|
2347
|
-
declare class A2EInference implements A2EBackend {
|
|
2348
|
-
readonly modelId: "a2e";
|
|
2349
|
-
private session;
|
|
2350
|
-
private ort;
|
|
2351
|
-
private config;
|
|
2352
|
-
private _backend;
|
|
2353
|
-
private isLoading;
|
|
2354
|
-
private numIdentityClasses;
|
|
2355
|
-
readonly chunkSize: number;
|
|
2356
|
-
private inferenceQueue;
|
|
2357
|
-
private poisoned;
|
|
2358
|
-
private static readonly INFERENCE_TIMEOUT_MS;
|
|
2359
|
-
constructor(config: A2EInferenceConfig);
|
|
2360
|
-
/**
|
|
2361
|
-
* Check if WebGPU is available and working
|
|
2362
|
-
* (iOS returns false even if navigator.gpu exists due to ONNX Runtime bugs)
|
|
2363
|
-
*/
|
|
2364
|
-
static isWebGPUAvailable: typeof isWebGPUAvailable;
|
|
2365
|
-
get backend(): 'webgpu' | 'wasm' | null;
|
|
2366
|
-
get isLoaded(): boolean;
|
|
2367
|
-
/** True if inference timed out and the session is permanently unusable */
|
|
2368
|
-
get isSessionPoisoned(): boolean;
|
|
2369
|
-
/**
|
|
2370
|
-
* Load the ONNX model
|
|
2371
|
-
*/
|
|
2372
|
-
load(): Promise<A2EModelInfo>;
|
|
2373
|
-
/**
|
|
2374
|
-
* Run inference on raw audio
|
|
2375
|
-
* @param audioSamples - Float32Array of raw audio at 16kHz
|
|
2376
|
-
* @param identityIndex - Optional identity index (0-11, default 0 = neutral)
|
|
2377
|
-
*
|
|
2378
|
-
* Audio will be zero-padded or truncated to chunkSize samples.
|
|
2379
|
-
*/
|
|
2380
|
-
infer(audioSamples: Float32Array, identityIndex?: number): Promise<A2EResult>;
|
|
2381
|
-
/**
|
|
2382
|
-
* Queue inference to serialize ONNX session calls
|
|
2383
|
-
*/
|
|
2384
|
-
private queueInference;
|
|
2385
|
-
/**
|
|
2386
|
-
* Get blendshape value by name for a specific frame
|
|
2387
|
-
*/
|
|
2388
|
-
getBlendshape(blendshapes: Float32Array, name: typeof LAM_BLENDSHAPES[number]): number;
|
|
2389
|
-
/**
|
|
2390
|
-
* Dispose of the model and free resources
|
|
2391
|
-
*/
|
|
2392
|
-
dispose(): Promise<void>;
|
|
2393
|
-
}
|
|
2394
|
-
|
|
2395
1816
|
/**
|
|
2396
1817
|
* Default and user-configurable model URLs for all ONNX models
|
|
2397
1818
|
*
|
|
@@ -2427,7 +1848,7 @@ type ModelUrlKey = 'lam' | 'senseVoice' | 'sileroVad' | 'kokoroTTS' | 'kokoroVoi
|
|
|
2427
1848
|
* Resolved model URLs — user overrides take priority, HuggingFace CDN is fallback.
|
|
2428
1849
|
*
|
|
2429
1850
|
* All SDK factories (`createA2E`, `createSenseVoice`, `createSileroVAD`) and
|
|
2430
|
-
* orchestrators (`
|
|
1851
|
+
* orchestrators (`VoiceOrchestrator`) read from this object. Call
|
|
2431
1852
|
* {@link configureModelUrls} before constructing any pipelines to point
|
|
2432
1853
|
* models at your own CDN.
|
|
2433
1854
|
*/
|
|
@@ -2697,6 +2118,44 @@ declare class BlendshapeSmoother {
|
|
|
2697
2118
|
reset(): void;
|
|
2698
2119
|
}
|
|
2699
2120
|
|
|
2121
|
+
/**
|
|
2122
|
+
* Factory function for Silero VAD via UnifiedInferenceWorker
|
|
2123
|
+
*
|
|
2124
|
+
* @category Inference
|
|
2125
|
+
*
|
|
2126
|
+
* @example
|
|
2127
|
+
* ```typescript
|
|
2128
|
+
* import { createSileroVAD, UnifiedInferenceWorker } from '@omote/core';
|
|
2129
|
+
*
|
|
2130
|
+
* const worker = new UnifiedInferenceWorker();
|
|
2131
|
+
* await worker.init();
|
|
2132
|
+
*
|
|
2133
|
+
* const vad = createSileroVAD({
|
|
2134
|
+
* modelUrl: '/models/silero-vad.onnx',
|
|
2135
|
+
* unifiedWorker: worker,
|
|
2136
|
+
* });
|
|
2137
|
+
* await vad.load();
|
|
2138
|
+
* const result = await vad.process(audioChunk);
|
|
2139
|
+
* ```
|
|
2140
|
+
*/
|
|
2141
|
+
|
|
2142
|
+
/**
|
|
2143
|
+
* Configuration for the Silero VAD factory
|
|
2144
|
+
*/
|
|
2145
|
+
interface SileroVADFactoryConfig extends Omit<SileroVADConfig, 'modelUrl'>, InferenceFactoryConfig {
|
|
2146
|
+
/** Path or URL to the ONNX model. Default: HuggingFace CDN */
|
|
2147
|
+
modelUrl?: string;
|
|
2148
|
+
}
|
|
2149
|
+
/**
|
|
2150
|
+
* Create a Silero VAD instance via the unified worker.
|
|
2151
|
+
*
|
|
2152
|
+
* If no `unifiedWorker` is provided, a dedicated worker is created on load().
|
|
2153
|
+
*
|
|
2154
|
+
* @param config - Factory configuration
|
|
2155
|
+
* @returns A SileroVADBackend instance
|
|
2156
|
+
*/
|
|
2157
|
+
declare function createSileroVAD(config?: SileroVADFactoryConfig): SileroVADBackend;
|
|
2158
|
+
|
|
2700
2159
|
/**
|
|
2701
2160
|
* SenseVoice adapter backed by UnifiedInferenceWorker
|
|
2702
2161
|
*
|
|
@@ -2755,34 +2214,9 @@ declare class A2EUnifiedAdapter implements A2EBackend {
|
|
|
2755
2214
|
}
|
|
2756
2215
|
|
|
2757
2216
|
/**
|
|
2758
|
-
* Kokoro TTS
|
|
2759
|
-
*
|
|
2760
|
-
* Pure ONNX pipeline for browser-based text-to-speech. No transformers.js dependency.
|
|
2761
|
-
* Uses eSpeak-NG WASM for phonemization and Kokoro-82M (q8, 92MB) for synthesis.
|
|
2762
|
-
*
|
|
2763
|
-
* Pipeline: Text → Normalize → Phonemize (eSpeak WASM) → Tokenize → Voice Style → ONNX → Audio
|
|
2217
|
+
* Kokoro TTS type definitions
|
|
2764
2218
|
*
|
|
2765
2219
|
* @category Inference
|
|
2766
|
-
*
|
|
2767
|
-
* @example Basic usage
|
|
2768
|
-
* ```typescript
|
|
2769
|
-
* import { KokoroTTSInference } from '@omote/core';
|
|
2770
|
-
*
|
|
2771
|
-
* const tts = new KokoroTTSInference({ defaultVoice: 'af_heart' });
|
|
2772
|
-
* await tts.load();
|
|
2773
|
-
*
|
|
2774
|
-
* const { audio, duration } = await tts.synthesize("Hello world");
|
|
2775
|
-
* // audio: Float32Array @ 24kHz
|
|
2776
|
-
* ```
|
|
2777
|
-
*
|
|
2778
|
-
* @example Streaming (sentence-by-sentence)
|
|
2779
|
-
* ```typescript
|
|
2780
|
-
* for await (const chunk of tts.stream("First sentence. Second sentence.")) {
|
|
2781
|
-
* playbackPipeline.feedBuffer(chunk.audio);
|
|
2782
|
-
* }
|
|
2783
|
-
* ```
|
|
2784
|
-
*
|
|
2785
|
-
* @module inference/KokoroTTSInference
|
|
2786
2220
|
*/
|
|
2787
2221
|
|
|
2788
2222
|
interface KokoroTTSConfig {
|
|
@@ -2796,6 +2230,8 @@ interface KokoroTTSConfig {
|
|
|
2796
2230
|
backend?: BackendPreference;
|
|
2797
2231
|
/** Speech speed multiplier (default: 1.0) */
|
|
2798
2232
|
speed?: number;
|
|
2233
|
+
/** Eagerly load phonemizer + default voice during load() instead of first speak(). Default: true. */
|
|
2234
|
+
eagerLoad?: boolean;
|
|
2799
2235
|
}
|
|
2800
2236
|
interface KokoroTTSResult {
|
|
2801
2237
|
/** Audio samples at 24kHz */
|
|
@@ -2834,67 +2270,6 @@ interface SynthesizeOptions {
|
|
|
2834
2270
|
* Returns trimmed text on success, throws on invalid input.
|
|
2835
2271
|
*/
|
|
2836
2272
|
declare function validateTTSInput(text: unknown, voiceName: string, speed: number, availableVoices?: string[]): string;
|
|
2837
|
-
declare class KokoroTTSInference implements TTSBackend {
|
|
2838
|
-
private readonly config;
|
|
2839
|
-
private readonly modelUrl;
|
|
2840
|
-
private readonly voiceBaseUrl;
|
|
2841
|
-
private ort;
|
|
2842
|
-
private session;
|
|
2843
|
-
private _backend;
|
|
2844
|
-
private isLoading;
|
|
2845
|
-
private poisoned;
|
|
2846
|
-
private inferenceQueue;
|
|
2847
|
-
private phonemizerReady;
|
|
2848
|
-
private defaultVoiceLoaded;
|
|
2849
|
-
/** Cached voice data (voice name → Float32Array) */
|
|
2850
|
-
private loadedVoices;
|
|
2851
|
-
constructor(config?: KokoroTTSConfig);
|
|
2852
|
-
get isLoaded(): boolean;
|
|
2853
|
-
get sampleRate(): number;
|
|
2854
|
-
/**
|
|
2855
|
-
* Load the ONNX model, phonemizer WASM, and default voice.
|
|
2856
|
-
* Safe to call multiple times (no-ops after first successful load).
|
|
2857
|
-
*/
|
|
2858
|
-
load(): Promise<KokoroTTSModelInfo>;
|
|
2859
|
-
/**
|
|
2860
|
-
* Lazily initialize phonemizer and default voice on first use.
|
|
2861
|
-
* Moves 100-200ms of main-thread blocking out of load() into first synthesis.
|
|
2862
|
-
*/
|
|
2863
|
-
private ensureReady;
|
|
2864
|
-
/**
|
|
2865
|
-
* Synthesize speech from text (one-shot, full audio output).
|
|
2866
|
-
*
|
|
2867
|
-
* @param text - Input text to synthesize
|
|
2868
|
-
* @param options - Voice and speed overrides
|
|
2869
|
-
* @returns Audio Float32Array at 24kHz with duration
|
|
2870
|
-
*/
|
|
2871
|
-
synthesize(text: string, options?: SynthesizeOptions): Promise<KokoroTTSResult>;
|
|
2872
|
-
/**
|
|
2873
|
-
* Stream synthesis sentence-by-sentence (async generator).
|
|
2874
|
-
* Splits text on sentence boundaries and yields audio for each.
|
|
2875
|
-
*
|
|
2876
|
-
* Compatible with both `SynthesizeOptions` (legacy) and `TTSStreamOptions` (TTSBackend).
|
|
2877
|
-
*
|
|
2878
|
-
* @param text - Input text (can be multiple sentences)
|
|
2879
|
-
* @param options - Voice, speed, and abort signal overrides
|
|
2880
|
-
*/
|
|
2881
|
-
stream(text: string, options?: SynthesizeOptions & TTSStreamOptions): AsyncGenerator<KokoroStreamChunk & TTSChunk>;
|
|
2882
|
-
/**
|
|
2883
|
-
* Preload a voice (fetches and caches the .bin file).
|
|
2884
|
-
*/
|
|
2885
|
-
preloadVoice(voiceName: string): Promise<void>;
|
|
2886
|
-
/**
|
|
2887
|
-
* List available voice names.
|
|
2888
|
-
*/
|
|
2889
|
-
listVoices(): string[];
|
|
2890
|
-
/**
|
|
2891
|
-
* Release the ONNX session and clear cached voices.
|
|
2892
|
-
*/
|
|
2893
|
-
dispose(): Promise<void>;
|
|
2894
|
-
private ensureVoice;
|
|
2895
|
-
private queueInference;
|
|
2896
|
-
private runInference;
|
|
2897
|
-
}
|
|
2898
2273
|
|
|
2899
2274
|
/**
|
|
2900
2275
|
* Kokoro TTS adapter backed by UnifiedInferenceWorker
|
|
@@ -2910,6 +2285,7 @@ declare class KokoroTTSUnifiedAdapter implements TTSBackend {
|
|
|
2910
2285
|
private readonly modelUrl;
|
|
2911
2286
|
private readonly voiceBaseUrl;
|
|
2912
2287
|
private _isLoaded;
|
|
2288
|
+
private _backend;
|
|
2913
2289
|
private loadedGeneration;
|
|
2914
2290
|
/** Per-adapter inference queue — ensures sequential state updates. */
|
|
2915
2291
|
private inferenceQueue;
|
|
@@ -3131,148 +2507,61 @@ declare class SafariSpeechRecognition {
|
|
|
3131
2507
|
/**
|
|
3132
2508
|
* Remove an error callback
|
|
3133
2509
|
*/
|
|
3134
|
-
offError(callback: SpeechErrorCallback): void;
|
|
3135
|
-
/**
|
|
3136
|
-
* Start listening for speech
|
|
3137
|
-
*
|
|
3138
|
-
* On iOS Safari, this will trigger the microphone permission prompt
|
|
3139
|
-
* if not already granted.
|
|
3140
|
-
*/
|
|
3141
|
-
start(): Promise<void>;
|
|
3142
|
-
/**
|
|
3143
|
-
* Stop listening and return the final transcript
|
|
3144
|
-
*/
|
|
3145
|
-
stop(): Promise<SpeechRecognitionResult>;
|
|
3146
|
-
/**
|
|
3147
|
-
* Abort recognition without waiting for final result
|
|
3148
|
-
*/
|
|
3149
|
-
abort(): void;
|
|
3150
|
-
/**
|
|
3151
|
-
* NOT SUPPORTED: Transcribe audio buffer
|
|
3152
|
-
*
|
|
3153
|
-
* Safari Speech API does not support transcribing pre-recorded audio.
|
|
3154
|
-
* It only works with live microphone input.
|
|
3155
|
-
*
|
|
3156
|
-
* For batch transcription on iOS, use server-side Whisper or a cloud ASR service.
|
|
3157
|
-
*
|
|
3158
|
-
* @throws Error always - this method is not supported
|
|
3159
|
-
*/
|
|
3160
|
-
transcribe(_audio: Float32Array): Promise<SpeechRecognitionResult>;
|
|
3161
|
-
/**
|
|
3162
|
-
* Dispose of recognition resources
|
|
3163
|
-
*/
|
|
3164
|
-
dispose(): void;
|
|
3165
|
-
/**
|
|
3166
|
-
* Set up event handlers for the recognition instance
|
|
3167
|
-
*/
|
|
3168
|
-
private setupEventHandlers;
|
|
3169
|
-
/**
|
|
3170
|
-
* Emit result to all registered callbacks
|
|
3171
|
-
*/
|
|
3172
|
-
private emitResult;
|
|
3173
|
-
/**
|
|
3174
|
-
* Emit error to all registered callbacks
|
|
3175
|
-
*/
|
|
3176
|
-
private emitError;
|
|
3177
|
-
}
|
|
3178
|
-
|
|
3179
|
-
/**
|
|
3180
|
-
* Kokoro TTS Web Worker implementation
|
|
3181
|
-
*
|
|
3182
|
-
* Moves the heavy ONNX `session.run()` to a dedicated Web Worker to prevent
|
|
3183
|
-
* main thread blocking (~1-2s per sentence on WASM). Phonemizer, tokenizer,
|
|
3184
|
-
* and voice logic stay on the main thread (fast, <10ms combined).
|
|
3185
|
-
*
|
|
3186
|
-
* Architecture:
|
|
3187
|
-
* ```
|
|
3188
|
-
* Main Thread (KokoroTTSWorker): Worker (WORKER_SCRIPT):
|
|
3189
|
-
* stream(text) →
|
|
3190
|
-
* splitSentences(text)
|
|
3191
|
-
* for each sentence:
|
|
3192
|
-
* phonemize(sentence) → phonemes
|
|
3193
|
-
* tokenize(phonemes) → tokens
|
|
3194
|
-
* ensureVoice() → style
|
|
3195
|
-
* postMessage(tokens, style, speed) ──→ session.run(feeds)
|
|
3196
|
-
* await result ←── postMessage(audio)
|
|
3197
|
-
* yield {audio, text, phonemes, duration}
|
|
3198
|
-
* ```
|
|
3199
|
-
*
|
|
3200
|
-
* @category Inference
|
|
3201
|
-
*
|
|
3202
|
-
* @example Basic usage
|
|
3203
|
-
* ```typescript
|
|
3204
|
-
* import { KokoroTTSWorker } from '@omote/core';
|
|
3205
|
-
*
|
|
3206
|
-
* const tts = new KokoroTTSWorker({ defaultVoice: 'af_heart' });
|
|
3207
|
-
* await tts.load();
|
|
3208
|
-
*
|
|
3209
|
-
* for await (const chunk of tts.stream("Hello world!")) {
|
|
3210
|
-
* playbackPipeline.feedBuffer(chunk.audio);
|
|
3211
|
-
* }
|
|
3212
|
-
* ```
|
|
3213
|
-
*
|
|
3214
|
-
* @module inference/KokoroTTSWorker
|
|
3215
|
-
*/
|
|
3216
|
-
|
|
3217
|
-
/**
|
|
3218
|
-
* Kokoro TTS Worker — off-main-thread ONNX inference for non-blocking TTS.
|
|
3219
|
-
*
|
|
3220
|
-
* Phonemizer/tokenizer/voice logic run on the main thread (fast, <10ms).
|
|
3221
|
-
* Only the heavy ONNX `session.run()` is delegated to the worker.
|
|
3222
|
-
*
|
|
3223
|
-
* Implements the same TTSBackend interface as KokoroTTSInference.
|
|
3224
|
-
*
|
|
3225
|
-
* @see KokoroTTSInference for main-thread version
|
|
3226
|
-
*/
|
|
3227
|
-
declare class KokoroTTSWorker implements TTSBackend {
|
|
3228
|
-
private readonly config;
|
|
3229
|
-
private readonly modelUrl;
|
|
3230
|
-
private readonly voiceBaseUrl;
|
|
3231
|
-
private worker;
|
|
3232
|
-
private _isLoaded;
|
|
3233
|
-
private isLoading;
|
|
3234
|
-
private poisoned;
|
|
3235
|
-
/** Serializes all worker calls (stream sentence chunks + synthesize) */
|
|
3236
|
-
private inferenceQueue;
|
|
3237
|
-
/** Cached voice data (voice name → Float32Array) */
|
|
3238
|
-
private loadedVoices;
|
|
3239
|
-
/** Pending message handlers */
|
|
3240
|
-
private pendingResolvers;
|
|
3241
|
-
constructor(config?: KokoroTTSConfig);
|
|
3242
|
-
get isLoaded(): boolean;
|
|
3243
|
-
get sampleRate(): number;
|
|
3244
|
-
load(): Promise<KokoroTTSModelInfo>;
|
|
3245
|
-
synthesize(text: string, options?: SynthesizeOptions): Promise<KokoroTTSResult>;
|
|
3246
|
-
stream(text: string, options?: SynthesizeOptions & TTSStreamOptions): AsyncGenerator<KokoroStreamChunk & TTSChunk>;
|
|
3247
|
-
preloadVoice(voiceName: string): Promise<void>;
|
|
3248
|
-
listVoices(): string[];
|
|
3249
|
-
dispose(): Promise<void>;
|
|
3250
|
-
static isSupported(): boolean;
|
|
3251
|
-
private ensureVoice;
|
|
3252
|
-
private createWorker;
|
|
3253
|
-
private handleWorkerMessage;
|
|
3254
|
-
private sendMessage;
|
|
2510
|
+
offError(callback: SpeechErrorCallback): void;
|
|
3255
2511
|
/**
|
|
3256
|
-
*
|
|
3257
|
-
*
|
|
2512
|
+
* Start listening for speech
|
|
2513
|
+
*
|
|
2514
|
+
* On iOS Safari, this will trigger the microphone permission prompt
|
|
2515
|
+
* if not already granted.
|
|
3258
2516
|
*/
|
|
3259
|
-
|
|
2517
|
+
start(): Promise<void>;
|
|
2518
|
+
/**
|
|
2519
|
+
* Stop listening and return the final transcript
|
|
2520
|
+
*/
|
|
2521
|
+
stop(): Promise<SpeechRecognitionResult>;
|
|
2522
|
+
/**
|
|
2523
|
+
* Abort recognition without waiting for final result
|
|
2524
|
+
*/
|
|
2525
|
+
abort(): void;
|
|
2526
|
+
/**
|
|
2527
|
+
* NOT SUPPORTED: Transcribe audio buffer
|
|
2528
|
+
*
|
|
2529
|
+
* Safari Speech API does not support transcribing pre-recorded audio.
|
|
2530
|
+
* It only works with live microphone input.
|
|
2531
|
+
*
|
|
2532
|
+
* For batch transcription on iOS, use server-side Whisper or a cloud ASR service.
|
|
2533
|
+
*
|
|
2534
|
+
* @throws Error always - this method is not supported
|
|
2535
|
+
*/
|
|
2536
|
+
transcribe(_audio: Float32Array): Promise<SpeechRecognitionResult>;
|
|
2537
|
+
/**
|
|
2538
|
+
* Dispose of recognition resources
|
|
2539
|
+
*/
|
|
2540
|
+
dispose(): void;
|
|
2541
|
+
/**
|
|
2542
|
+
* Set up event handlers for the recognition instance
|
|
2543
|
+
*/
|
|
2544
|
+
private setupEventHandlers;
|
|
2545
|
+
/**
|
|
2546
|
+
* Emit result to all registered callbacks
|
|
2547
|
+
*/
|
|
2548
|
+
private emitResult;
|
|
3260
2549
|
/**
|
|
3261
|
-
*
|
|
2550
|
+
* Emit error to all registered callbacks
|
|
3262
2551
|
*/
|
|
3263
|
-
private
|
|
2552
|
+
private emitError;
|
|
3264
2553
|
}
|
|
3265
2554
|
|
|
3266
2555
|
/**
|
|
3267
|
-
* Factory function for Kokoro TTS
|
|
2556
|
+
* Factory function for Kokoro TTS via UnifiedInferenceWorker
|
|
3268
2557
|
*
|
|
3269
|
-
*
|
|
3270
|
-
*
|
|
3271
|
-
*
|
|
2558
|
+
* When called without a `unifiedWorker`, a dedicated worker is created
|
|
2559
|
+
* automatically on the first `load()` call. Pass a shared worker when using
|
|
2560
|
+
* VoiceOrchestrator or multiple models to avoid extra WASM instances.
|
|
3272
2561
|
*
|
|
3273
2562
|
* @category Inference
|
|
3274
2563
|
*
|
|
3275
|
-
* @example
|
|
2564
|
+
* @example Standalone (auto-creates worker)
|
|
3276
2565
|
* ```typescript
|
|
3277
2566
|
* import { createKokoroTTS } from '@omote/core';
|
|
3278
2567
|
*
|
|
@@ -3284,14 +2573,9 @@ declare class KokoroTTSWorker implements TTSBackend {
|
|
|
3284
2573
|
* }
|
|
3285
2574
|
* ```
|
|
3286
2575
|
*
|
|
3287
|
-
* @example
|
|
2576
|
+
* @example With shared worker
|
|
3288
2577
|
* ```typescript
|
|
3289
|
-
* const tts = createKokoroTTS({ defaultVoice: 'af_heart',
|
|
3290
|
-
* ```
|
|
3291
|
-
*
|
|
3292
|
-
* @example Force main thread
|
|
3293
|
-
* ```typescript
|
|
3294
|
-
* const tts = createKokoroTTS({ defaultVoice: 'af_heart', useWorker: false });
|
|
2578
|
+
* const tts = createKokoroTTS({ defaultVoice: 'af_heart', unifiedWorker: worker });
|
|
3295
2579
|
* ```
|
|
3296
2580
|
*/
|
|
3297
2581
|
|
|
@@ -3301,10 +2585,12 @@ declare class KokoroTTSWorker implements TTSBackend {
|
|
|
3301
2585
|
interface CreateKokoroTTSConfig extends KokoroTTSConfig, InferenceFactoryConfig {
|
|
3302
2586
|
}
|
|
3303
2587
|
/**
|
|
3304
|
-
* Create a Kokoro TTS instance
|
|
2588
|
+
* Create a Kokoro TTS instance via the unified worker.
|
|
2589
|
+
*
|
|
2590
|
+
* If no `unifiedWorker` is provided, a dedicated worker is created on load().
|
|
3305
2591
|
*
|
|
3306
2592
|
* @param config - Factory configuration
|
|
3307
|
-
* @returns A TTSBackend instance
|
|
2593
|
+
* @returns A TTSBackend instance
|
|
3308
2594
|
*/
|
|
3309
2595
|
declare function createKokoroTTS(config?: CreateKokoroTTSConfig): TTSBackend;
|
|
3310
2596
|
|
|
@@ -3353,7 +2639,7 @@ declare function listVoices(): string[];
|
|
|
3353
2639
|
* ElevenLabs TTS Backend — Cloud text-to-speech via ElevenLabs REST API.
|
|
3354
2640
|
*
|
|
3355
2641
|
* Implements the TTSBackend interface so it can be used anywhere Kokoro TTS is used
|
|
3356
|
-
* (TTSPlayback, TTSSpeaker,
|
|
2642
|
+
* (TTSPlayback, TTSSpeaker, VoiceOrchestrator, PlaybackPipeline, etc.)
|
|
3357
2643
|
*
|
|
3358
2644
|
* Zero external dependencies — uses fetch() directly.
|
|
3359
2645
|
*
|
|
@@ -3431,141 +2717,6 @@ declare class ElevenLabsTTSBackend implements TTSBackend {
|
|
|
3431
2717
|
private getHttpErrorMessage;
|
|
3432
2718
|
}
|
|
3433
2719
|
|
|
3434
|
-
/**
|
|
3435
|
-
* AWS Polly TTS Backend — Cloud text-to-speech via consumer-provided AWS SDK call.
|
|
3436
|
-
*
|
|
3437
|
-
* Implements the TTSBackend interface. Keeps @omote/core free of AWS SDK dependencies
|
|
3438
|
-
* by delegating the actual Polly API call to a consumer-provided function.
|
|
3439
|
-
*
|
|
3440
|
-
* @category Inference
|
|
3441
|
-
*
|
|
3442
|
-
* @example Basic usage with AWS SDK v3
|
|
3443
|
-
* ```typescript
|
|
3444
|
-
* import { PollyTTSBackend } from '@omote/core';
|
|
3445
|
-
* import { PollyClient, SynthesizeSpeechCommand } from '@aws-sdk/client-polly';
|
|
3446
|
-
*
|
|
3447
|
-
* const polly = new PollyClient({ region: 'us-east-1' });
|
|
3448
|
-
*
|
|
3449
|
-
* const tts = new PollyTTSBackend({
|
|
3450
|
-
* synthesizeFn: async (text, voice, sampleRate) => {
|
|
3451
|
-
* const cmd = new SynthesizeSpeechCommand({
|
|
3452
|
-
* Text: text,
|
|
3453
|
-
* VoiceId: voice,
|
|
3454
|
-
* Engine: 'neural',
|
|
3455
|
-
* OutputFormat: 'pcm',
|
|
3456
|
-
* SampleRate: String(sampleRate),
|
|
3457
|
-
* });
|
|
3458
|
-
* const result = await polly.send(cmd);
|
|
3459
|
-
* const stream = result.AudioStream;
|
|
3460
|
-
* // Convert stream to ArrayBuffer (Node or browser)
|
|
3461
|
-
* const chunks: Uint8Array[] = [];
|
|
3462
|
-
* for await (const chunk of stream as AsyncIterable<Uint8Array>) {
|
|
3463
|
-
* chunks.push(chunk);
|
|
3464
|
-
* }
|
|
3465
|
-
* const totalLength = chunks.reduce((sum, c) => sum + c.length, 0);
|
|
3466
|
-
* const merged = new Uint8Array(totalLength);
|
|
3467
|
-
* let offset = 0;
|
|
3468
|
-
* for (const chunk of chunks) {
|
|
3469
|
-
* merged.set(chunk, offset);
|
|
3470
|
-
* offset += chunk.length;
|
|
3471
|
-
* }
|
|
3472
|
-
* return {
|
|
3473
|
-
* audio: merged.buffer,
|
|
3474
|
-
* contentType: result.ContentType ?? 'audio/pcm',
|
|
3475
|
-
* };
|
|
3476
|
-
* },
|
|
3477
|
-
* });
|
|
3478
|
-
*
|
|
3479
|
-
* await tts.load();
|
|
3480
|
-
* for await (const chunk of tts.stream("Hello world!")) {
|
|
3481
|
-
* playbackPipeline.feedBuffer(chunk.audio);
|
|
3482
|
-
* }
|
|
3483
|
-
* ```
|
|
3484
|
-
*/
|
|
3485
|
-
|
|
3486
|
-
/**
|
|
3487
|
-
* Result from the consumer-provided synthesize function.
|
|
3488
|
-
*/
|
|
3489
|
-
interface PollySynthesizeResult {
|
|
3490
|
-
/** Raw PCM audio bytes (Int16 LE) */
|
|
3491
|
-
audio: ArrayBuffer;
|
|
3492
|
-
/** Content type from Polly response (e.g., 'audio/pcm') */
|
|
3493
|
-
contentType: string;
|
|
3494
|
-
}
|
|
3495
|
-
/**
|
|
3496
|
-
* Configuration for PollyTTSBackend.
|
|
3497
|
-
*
|
|
3498
|
-
* The `synthesizeFn` callback lets consumers use their own AWS SDK setup
|
|
3499
|
-
* (credentials, region, SDK version) without @omote/core depending on `@aws-sdk/client-polly`.
|
|
3500
|
-
*/
|
|
3501
|
-
interface PollyConfig {
|
|
3502
|
-
/**
|
|
3503
|
-
* Consumer-provided function that calls AWS Polly.
|
|
3504
|
-
* Must return PCM audio (Int16 LE) at the requested sample rate.
|
|
3505
|
-
*
|
|
3506
|
-
* @param text - Text to synthesize
|
|
3507
|
-
* @param voice - Polly voice ID (e.g., 'Joanna')
|
|
3508
|
-
* @param sampleRate - Requested output sample rate (e.g., 16000)
|
|
3509
|
-
* @returns PCM audio buffer and content type
|
|
3510
|
-
*/
|
|
3511
|
-
synthesizeFn: (text: string, voice: string, sampleRate: number) => Promise<PollySynthesizeResult>;
|
|
3512
|
-
/** Polly voice ID (default: 'Joanna') */
|
|
3513
|
-
voice?: string;
|
|
3514
|
-
/** Output sample rate in Hz (default: 16000) */
|
|
3515
|
-
sampleRate?: number;
|
|
3516
|
-
/** Polly engine type (default: 'neural') */
|
|
3517
|
-
engine?: 'neural' | 'standard' | 'generative' | 'long-form';
|
|
3518
|
-
}
|
|
3519
|
-
declare class PollyTTSBackend implements TTSBackend {
|
|
3520
|
-
private readonly synthesizeFn;
|
|
3521
|
-
private readonly voice;
|
|
3522
|
-
private readonly _sampleRate;
|
|
3523
|
-
private readonly engine;
|
|
3524
|
-
private _isLoaded;
|
|
3525
|
-
constructor(config: PollyConfig);
|
|
3526
|
-
get sampleRate(): number;
|
|
3527
|
-
get isLoaded(): boolean;
|
|
3528
|
-
/**
|
|
3529
|
-
* No-op for cloud TTS (no model to load).
|
|
3530
|
-
* Marks backend as ready.
|
|
3531
|
-
*/
|
|
3532
|
-
load(): Promise<void>;
|
|
3533
|
-
/**
|
|
3534
|
-
* Synthesize audio via consumer's Polly function.
|
|
3535
|
-
*
|
|
3536
|
-
* Polly's SynthesizeSpeech is request/response (not streaming for PCM),
|
|
3537
|
-
* so this yields a single chunk per call. For long text, consider splitting
|
|
3538
|
-
* into sentences on the consumer side.
|
|
3539
|
-
*/
|
|
3540
|
-
stream(text: string, options?: TTSStreamOptions): AsyncGenerator<TTSChunk>;
|
|
3541
|
-
dispose(): Promise<void>;
|
|
3542
|
-
}
|
|
3543
|
-
|
|
3544
|
-
/**
|
|
3545
|
-
* ORT CDN configuration
|
|
3546
|
-
*
|
|
3547
|
-
* Allows consumers to override the CDN base URL used for loading
|
|
3548
|
-
* ONNX Runtime WASM/WebGPU binaries. By default, ORT loads from
|
|
3549
|
-
* its bundled CDN path. Use {@link configureOrtCdn} to point at
|
|
3550
|
-
* a self-hosted or enterprise CDN.
|
|
3551
|
-
*
|
|
3552
|
-
* @category Inference
|
|
3553
|
-
*/
|
|
3554
|
-
/**
|
|
3555
|
-
* Override the CDN base URL for ONNX Runtime WASM/WebGPU binaries.
|
|
3556
|
-
*
|
|
3557
|
-
* Must be an HTTPS URL or a relative path (starts with `/` or `./`).
|
|
3558
|
-
* Call this once at app startup, before loading any models.
|
|
3559
|
-
*
|
|
3560
|
-
* @param cdnPath - HTTPS URL or relative path to ORT binaries directory
|
|
3561
|
-
* @throws If cdnPath is not HTTPS or a relative path
|
|
3562
|
-
*/
|
|
3563
|
-
declare function configureOrtCdn(cdnPath: string): void;
|
|
3564
|
-
/**
|
|
3565
|
-
* Get the current ORT CDN base URL override, or null if using defaults.
|
|
3566
|
-
*/
|
|
3567
|
-
declare function getOrtCdnBase(): string | null;
|
|
3568
|
-
|
|
3569
2720
|
/**
|
|
3570
2721
|
* Emotion - Helper for creating emotion vectors for avatar animation
|
|
3571
2722
|
*
|
|
@@ -4111,7 +3262,7 @@ declare const MetricNames: {
|
|
|
4111
3262
|
readonly CACHE_QUOTA_WARNING: "omote.cache.quota_warning";
|
|
4112
3263
|
/** Counter: Cache eviction (LRU) */
|
|
4113
3264
|
readonly CACHE_EVICTION: "omote.cache.eviction";
|
|
4114
|
-
/** Histogram:
|
|
3265
|
+
/** Histogram: Voice turn latency (speech end → transcript ready, excludes playback) */
|
|
4115
3266
|
readonly VOICE_TURN_LATENCY: "omote.voice.turn.latency";
|
|
4116
3267
|
/** Histogram: ASR transcription latency in ms */
|
|
4117
3268
|
readonly VOICE_TRANSCRIPTION_LATENCY: "omote.voice.transcription.latency";
|
|
@@ -4959,7 +4110,7 @@ declare class ProceduralLifeLayer {
|
|
|
4959
4110
|
*/
|
|
4960
4111
|
update(delta: number, input?: LifeLayerInput): LifeLayerOutput;
|
|
4961
4112
|
/**
|
|
4962
|
-
* Write life layer output directly to a Float32Array[52] in
|
|
4113
|
+
* Write life layer output directly to a Float32Array[52] in ARKIT_BLENDSHAPES order.
|
|
4963
4114
|
*
|
|
4964
4115
|
* Includes micro-jitter (0.4% amplitude simplex noise on all channels) to
|
|
4965
4116
|
* break uncanny stillness on undriven channels.
|
|
@@ -5294,7 +4445,7 @@ declare class FaceCompositor {
|
|
|
5294
4445
|
/**
|
|
5295
4446
|
* Compose a single output frame from the 5-stage signal chain.
|
|
5296
4447
|
*
|
|
5297
|
-
* @param base - A2E raw output (Float32Array[52],
|
|
4448
|
+
* @param base - A2E raw output (Float32Array[52], ARKIT_BLENDSHAPES order)
|
|
5298
4449
|
* @param input - Per-frame input (deltaTime, emotion, life layer params)
|
|
5299
4450
|
* @param target - Optional pre-allocated output buffer (avoids per-frame allocation).
|
|
5300
4451
|
* When omitted, an internal buffer is used (valid until next compose() call).
|
|
@@ -5576,216 +4727,6 @@ declare class MicLipSync extends EventEmitter<MicLipSyncEvents> {
|
|
|
5576
4727
|
private setState;
|
|
5577
4728
|
}
|
|
5578
4729
|
|
|
5579
|
-
/**
|
|
5580
|
-
* VoicePipeline - Full conversational agent loop
|
|
5581
|
-
*
|
|
5582
|
-
* Composes: MicrophoneCapture → SileroVAD → SenseVoice ASR → PlaybackPipeline (A2E)
|
|
5583
|
-
*
|
|
5584
|
-
* State machine: idle → loading → ready → listening → thinking → speaking → listening → ...
|
|
5585
|
-
*
|
|
5586
|
-
* The consumer provides an `onResponse` callback that receives transcribed text
|
|
5587
|
-
* and streams audio back for playback + lip sync. VoicePipeline is backend-agnostic.
|
|
5588
|
-
*
|
|
5589
|
-
* @category Orchestration
|
|
5590
|
-
*/
|
|
5591
|
-
|
|
5592
|
-
/** Shared config options for all VoicePipeline modes */
|
|
5593
|
-
interface VoicePipelineBaseConfig {
|
|
5594
|
-
/** Pre-built backends — skip internal factory creation. Takes precedence over `models`. */
|
|
5595
|
-
backends?: {
|
|
5596
|
-
asr: SenseVoiceBackend;
|
|
5597
|
-
lam: A2EBackend;
|
|
5598
|
-
vad: SileroVADBackend;
|
|
5599
|
-
tts?: TTSBackend;
|
|
5600
|
-
};
|
|
5601
|
-
/** External unified worker (reuse across pipelines). Takes precedence over internal creation. */
|
|
5602
|
-
unifiedWorker?: UnifiedInferenceWorker;
|
|
5603
|
-
/** URLs and options for model loading. Required if `backends` not provided. */
|
|
5604
|
-
models?: {
|
|
5605
|
-
senseVoice: {
|
|
5606
|
-
modelUrl: string;
|
|
5607
|
-
tokensUrl?: string;
|
|
5608
|
-
language?: string;
|
|
5609
|
-
};
|
|
5610
|
-
lam: {
|
|
5611
|
-
modelUrl: string;
|
|
5612
|
-
externalDataUrl?: string | false;
|
|
5613
|
-
backend?: 'auto' | 'webgpu' | 'wasm';
|
|
5614
|
-
};
|
|
5615
|
-
vad: {
|
|
5616
|
-
modelUrl: string;
|
|
5617
|
-
threshold?: number;
|
|
5618
|
-
preSpeechBufferChunks?: number;
|
|
5619
|
-
};
|
|
5620
|
-
};
|
|
5621
|
-
/** Per-character expression weight scaling */
|
|
5622
|
-
profile?: ExpressionProfile;
|
|
5623
|
-
/** Identity/style index for A2E model (default: 0) */
|
|
5624
|
-
identityIndex?: number;
|
|
5625
|
-
/** Base silence timeout in ms (default: 500) */
|
|
5626
|
-
silenceTimeoutMs?: number;
|
|
5627
|
-
/** Extended silence timeout for long utterances (default: 700) */
|
|
5628
|
-
silenceTimeoutExtendedMs?: number;
|
|
5629
|
-
/** Enable adaptive timeout based on speech duration (default: true) */
|
|
5630
|
-
adaptiveTimeout?: boolean;
|
|
5631
|
-
/** Minimum audio duration in seconds (default: 0.3) */
|
|
5632
|
-
minAudioDurationSec?: number;
|
|
5633
|
-
/** Minimum audio energy (default: 0.02) */
|
|
5634
|
-
minAudioEnergy?: number;
|
|
5635
|
-
/** Enable audio normalization for quiet audio (default: true) */
|
|
5636
|
-
normalizeAudio?: boolean;
|
|
5637
|
-
/** Progressive transcription interval — desktop (default: 500ms) */
|
|
5638
|
-
progressiveIntervalMs?: number;
|
|
5639
|
-
/** Progressive transcription interval — iOS (default: 800ms) */
|
|
5640
|
-
progressiveIntervalIosMs?: number;
|
|
5641
|
-
/** Coverage threshold to use progressive result (default: 0.8) */
|
|
5642
|
-
progressiveCoverageThreshold?: number;
|
|
5643
|
-
/** Minimum samples before progressive transcription starts (default: 8000) */
|
|
5644
|
-
progressiveMinSamples?: number;
|
|
5645
|
-
/** Timeout for individual SenseVoice.transcribe() calls (default: 10000ms) */
|
|
5646
|
-
transcriptionTimeoutMs?: number;
|
|
5647
|
-
/** Enable barge-in detection (default: true) */
|
|
5648
|
-
interruptionEnabled?: boolean;
|
|
5649
|
-
/** Minimum speech duration for interruption (default: 200ms) */
|
|
5650
|
-
interruptionMinSpeechMs?: number;
|
|
5651
|
-
/** Audio playback delay (default: auto-detected) */
|
|
5652
|
-
audioDelayMs?: number;
|
|
5653
|
-
/** Coalescer target duration (default: 200ms) */
|
|
5654
|
-
chunkTargetMs?: number;
|
|
5655
|
-
/** Enable neutral transition on playback complete (default: true) */
|
|
5656
|
-
neutralTransitionEnabled?: boolean;
|
|
5657
|
-
/** Duration of neutral fade-out (default: 250ms) */
|
|
5658
|
-
neutralTransitionMs?: number;
|
|
5659
|
-
}
|
|
5660
|
-
/** Cloud TTS mode: consumer handles response + audio streaming */
|
|
5661
|
-
interface VoicePipelineCloudConfig extends VoicePipelineBaseConfig {
|
|
5662
|
-
mode: 'cloud';
|
|
5663
|
-
/** Consumer's response handler (streams audio back) */
|
|
5664
|
-
onResponse: ResponseHandler;
|
|
5665
|
-
}
|
|
5666
|
-
/** Local TTS mode: SDK handles synthesis internally via TTSBackend */
|
|
5667
|
-
interface VoicePipelineLocalConfig extends VoicePipelineBaseConfig {
|
|
5668
|
-
mode: 'local';
|
|
5669
|
-
/**
|
|
5670
|
-
* TTS backend (e.g., KokoroTTSInference). Provide either `tts` or `ttsConfig`.
|
|
5671
|
-
*
|
|
5672
|
-
* When `tts` is provided, VoicePipeline uses it as-is. On iOS, this means
|
|
5673
|
-
* inference runs on the main thread (may cause UI freezes).
|
|
5674
|
-
*
|
|
5675
|
-
* Prefer `ttsConfig` for automatic unified worker integration on iOS.
|
|
5676
|
-
*/
|
|
5677
|
-
tts?: TTSBackend;
|
|
5678
|
-
/**
|
|
5679
|
-
* Kokoro TTS configuration. When provided, VoicePipeline creates the TTS
|
|
5680
|
-
* internally and passes the unified worker on iOS for off-main-thread inference.
|
|
5681
|
-
*
|
|
5682
|
-
* Takes precedence over `tts` if both are provided.
|
|
5683
|
-
*/
|
|
5684
|
-
ttsConfig?: {
|
|
5685
|
-
defaultVoice?: string;
|
|
5686
|
-
speed?: number;
|
|
5687
|
-
modelUrl?: string;
|
|
5688
|
-
voiceBaseUrl?: string;
|
|
5689
|
-
};
|
|
5690
|
-
/** Optional text transform (e.g., LLM call). Receives transcript, returns response text. */
|
|
5691
|
-
onTranscript?: (text: string) => string | Promise<string>;
|
|
5692
|
-
}
|
|
5693
|
-
type VoicePipelineConfig = VoicePipelineCloudConfig | VoicePipelineLocalConfig;
|
|
5694
|
-
interface VoicePipelineEvents {
|
|
5695
|
-
'state': VoicePipelineState;
|
|
5696
|
-
'loading:progress': LoadingProgress;
|
|
5697
|
-
'transcript': TranscriptResult;
|
|
5698
|
-
'frame': FullFaceFrame;
|
|
5699
|
-
'frame:raw': Float32Array;
|
|
5700
|
-
'speech:start': void;
|
|
5701
|
-
'speech:end': {
|
|
5702
|
-
durationMs: number;
|
|
5703
|
-
};
|
|
5704
|
-
'playback:start': {
|
|
5705
|
-
time: number;
|
|
5706
|
-
};
|
|
5707
|
-
'playback:complete': void;
|
|
5708
|
-
'interruption': void;
|
|
5709
|
-
'audio:level': {
|
|
5710
|
-
rms: number;
|
|
5711
|
-
peak: number;
|
|
5712
|
-
};
|
|
5713
|
-
'error': Error;
|
|
5714
|
-
[key: string]: unknown;
|
|
5715
|
-
}
|
|
5716
|
-
declare class VoicePipeline extends EventEmitter<VoicePipelineEvents> {
|
|
5717
|
-
private readonly config;
|
|
5718
|
-
private readonly isLocalMode;
|
|
5719
|
-
private _state;
|
|
5720
|
-
private stopped;
|
|
5721
|
-
private epoch;
|
|
5722
|
-
private _sessionId;
|
|
5723
|
-
private asr;
|
|
5724
|
-
private lam;
|
|
5725
|
-
private vad;
|
|
5726
|
-
private unifiedWorker;
|
|
5727
|
-
private playback;
|
|
5728
|
-
private interruption;
|
|
5729
|
-
private omoteEvents;
|
|
5730
|
-
private mic;
|
|
5731
|
-
private static readonly MAX_AUDIO_BUFFER_SAMPLES;
|
|
5732
|
-
private audioBuffer;
|
|
5733
|
-
private audioBufferSamples;
|
|
5734
|
-
private speechStartTime;
|
|
5735
|
-
private silenceTimer;
|
|
5736
|
-
private isSpeaking;
|
|
5737
|
-
private progressiveTimer;
|
|
5738
|
-
private progressivePromise;
|
|
5739
|
-
private lastProgressiveResult;
|
|
5740
|
-
private lastProgressiveSamples;
|
|
5741
|
-
private asrErrorCount;
|
|
5742
|
-
private progressiveErrorCount;
|
|
5743
|
-
private responseAbortController;
|
|
5744
|
-
private _unsubChunk;
|
|
5745
|
-
private _unsubLevel;
|
|
5746
|
-
private _currentFrame;
|
|
5747
|
-
/** Current pipeline state */
|
|
5748
|
-
get state(): VoicePipelineState;
|
|
5749
|
-
/** Latest blendshape frame */
|
|
5750
|
-
get currentFrame(): Float32Array | null;
|
|
5751
|
-
/** Whether user is currently speaking */
|
|
5752
|
-
get isSpeechActive(): boolean;
|
|
5753
|
-
/** Session ID (generated on start(), null before) */
|
|
5754
|
-
get sessionId(): string | null;
|
|
5755
|
-
constructor(config: VoicePipelineConfig);
|
|
5756
|
-
loadModels(): Promise<void>;
|
|
5757
|
-
/**
|
|
5758
|
-
* Load from pre-built backends (dependency injection path).
|
|
5759
|
-
* Loads any backends that aren't loaded yet.
|
|
5760
|
-
*/
|
|
5761
|
-
private loadFromBackends;
|
|
5762
|
-
/**
|
|
5763
|
-
* Load from factories (original path). Loads SenseVoice, LAM, and VAD in parallel.
|
|
5764
|
-
*/
|
|
5765
|
-
private loadFromFactories;
|
|
5766
|
-
start(): Promise<void>;
|
|
5767
|
-
stop(): void;
|
|
5768
|
-
setProfile(profile: ExpressionProfile): void;
|
|
5769
|
-
dispose(): Promise<void>;
|
|
5770
|
-
private processAudioChunk;
|
|
5771
|
-
private getSilenceTimeout;
|
|
5772
|
-
private onSilenceDetected;
|
|
5773
|
-
private processEndOfSpeech;
|
|
5774
|
-
private callResponseHandler;
|
|
5775
|
-
/** Cloud mode: delegate to consumer's onResponse handler */
|
|
5776
|
-
private handleCloudResponse;
|
|
5777
|
-
/** Local mode: synthesize text with TTSBackend, stream to PlaybackPipeline */
|
|
5778
|
-
private handleLocalResponse;
|
|
5779
|
-
private handleInterruption;
|
|
5780
|
-
private startProgressiveTranscription;
|
|
5781
|
-
private stopProgressiveTranscription;
|
|
5782
|
-
private transcribeWithTimeout;
|
|
5783
|
-
private normalizeAudio;
|
|
5784
|
-
private setState;
|
|
5785
|
-
private emitProgress;
|
|
5786
|
-
private clearSilenceTimer;
|
|
5787
|
-
}
|
|
5788
|
-
|
|
5789
4730
|
/**
|
|
5790
4731
|
* VoiceOrchestrator — Shared voice wiring for OmoteAvatar adapters.
|
|
5791
4732
|
*
|
|
@@ -5803,6 +4744,11 @@ interface VoiceOrchestratorBaseConfig {
|
|
|
5803
4744
|
listener?: SpeechListenerConfig;
|
|
5804
4745
|
interruptionEnabled?: boolean;
|
|
5805
4746
|
profile?: ExpressionProfile;
|
|
4747
|
+
onStateChange?: (state: ConversationalState) => void;
|
|
4748
|
+
onLoadingProgress?: (progress: LoadingProgress) => void;
|
|
4749
|
+
onError?: (error: Error) => void;
|
|
4750
|
+
onTranscriptEvent?: (result: TranscriptResult) => void;
|
|
4751
|
+
onInterruption?: () => void;
|
|
5806
4752
|
}
|
|
5807
4753
|
interface VoiceOrchestratorLocalConfig extends VoiceOrchestratorBaseConfig {
|
|
5808
4754
|
mode?: 'local';
|
|
@@ -5816,12 +4762,23 @@ interface VoiceOrchestratorCloudConfig extends VoiceOrchestratorBaseConfig {
|
|
|
5816
4762
|
lam?: {
|
|
5817
4763
|
modelUrl?: string;
|
|
5818
4764
|
externalDataUrl?: string | false;
|
|
4765
|
+
unifiedWorker?: UnifiedInferenceWorker;
|
|
5819
4766
|
};
|
|
4767
|
+
identityIndex?: number;
|
|
4768
|
+
neutralTransitionEnabled?: boolean;
|
|
5820
4769
|
}
|
|
5821
4770
|
type VoiceOrchestratorConfig = VoiceOrchestratorLocalConfig | VoiceOrchestratorCloudConfig;
|
|
5822
4771
|
interface VoiceOrchestratorEvents {
|
|
5823
4772
|
'state': ConversationalState;
|
|
5824
4773
|
'transcript': TranscriptResult;
|
|
4774
|
+
'interruption': void;
|
|
4775
|
+
'loading:progress': LoadingProgress;
|
|
4776
|
+
'error': Error;
|
|
4777
|
+
'audio:level': {
|
|
4778
|
+
rms: number;
|
|
4779
|
+
peak: number;
|
|
4780
|
+
};
|
|
4781
|
+
'playback:complete': void;
|
|
5825
4782
|
[key: string]: unknown;
|
|
5826
4783
|
}
|
|
5827
4784
|
declare class VoiceOrchestrator extends EventEmitter<VoiceOrchestratorEvents> {
|
|
@@ -5830,6 +4787,8 @@ declare class VoiceOrchestrator extends EventEmitter<VoiceOrchestratorEvents> {
|
|
|
5830
4787
|
private ttsSpeaker;
|
|
5831
4788
|
private playbackPipeline;
|
|
5832
4789
|
private ownedLam;
|
|
4790
|
+
private ownedWorker;
|
|
4791
|
+
private usesSharedWorker;
|
|
5833
4792
|
private transcriptUnsub;
|
|
5834
4793
|
private audioChunkUnsub;
|
|
5835
4794
|
private connectEpoch;
|
|
@@ -5853,10 +4812,14 @@ declare class VoiceOrchestrator extends EventEmitter<VoiceOrchestratorEvents> {
|
|
|
5853
4812
|
speak(text: string, options?: {
|
|
5854
4813
|
signal?: AbortSignal;
|
|
5855
4814
|
voice?: string;
|
|
4815
|
+
speed?: number;
|
|
4816
|
+
language?: string;
|
|
5856
4817
|
}): Promise<void>;
|
|
5857
4818
|
streamText(options?: {
|
|
5858
4819
|
signal?: AbortSignal;
|
|
5859
4820
|
voice?: string;
|
|
4821
|
+
speed?: number;
|
|
4822
|
+
language?: string;
|
|
5860
4823
|
}): Promise<{
|
|
5861
4824
|
push: (token: string) => void;
|
|
5862
4825
|
end: () => Promise<void>;
|
|
@@ -5868,4 +4831,4 @@ declare class VoiceOrchestrator extends EventEmitter<VoiceOrchestratorEvents> {
|
|
|
5868
4831
|
private setState;
|
|
5869
4832
|
}
|
|
5870
4833
|
|
|
5871
|
-
export { type A2EBackend,
|
|
4834
|
+
export { type A2EBackend, type A2EModelInfo, A2EProcessor, type A2EProcessorConfig, type A2EResult, A2EUnifiedAdapter, ALL_AUS, ARKIT_BLENDSHAPES, type AUActivation, AU_TO_ARKIT, type ActiveSpan, type AnimationClip, type AnimationController, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationSource, type AnimationSourceOptions, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, BLENDSHAPE_TO_GROUP, type BackendPreference, type BlendWeight, type BlendshapeGroup, BlendshapeSmoother, type BlendshapeSmootherConfig, type BoneFilterConfig, type CacheConfig, type CacheSpanAttributes, CharacterController, type CharacterControllerConfig, type CharacterProfile, type CharacterUpdateInput, type CharacterUpdateOutput, ConsoleExporter, type ConversationalState, type CreateA2EConfig, type CreateKokoroTTSConfig, type CreateSenseVoiceConfig, type CreateTTSPlayerConfig, DEFAULT_ANIMATION_CONFIG, DEFAULT_BONE_FILTER, DEFAULT_MODEL_URLS, EMOTION_NAMES, EMOTION_TO_AU, EMOTION_VECTOR_SIZE, EXPLICIT_EMOTION_COUNT, type ElevenLabsConfig, ElevenLabsTTSBackend, type EmotionAnimationMap, EmotionController, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, EmotionResolver, type EmotionWeights, EmphasisDetector, type ErrorType, ErrorTypes, EventEmitter, type ExpressionProfile, FaceCompositor, type FaceCompositorConfig, type FaceCompositorInput, type FaceCompositorOutput, type FetchWithCacheOptions, type FrameSource, type FullFaceFrame, HF_CDN_URLS, INFERENCE_LATENCY_BUCKETS, type InferenceFactoryConfig, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, KOKORO_VOICES, type KokoroStreamChunk, type KokoroTTSConfig, type KokoroTTSModelInfo, type KokoroTTSResult, KokoroTTSUnifiedAdapter, type KokoroVoiceName, LAM_BLENDSHAPES, type LifeLayerConfig, type LifeLayerInput, type LifeLayerOutput, type LoadingProgress, MIXAMO_PREFIX, MODEL_LOAD_TIME_BUCKETS, type MetricData, MetricNames, MicLipSync, type MicLipSyncConfig, type MicLipSyncEvents, type MicLipSyncFrame, type MicLipSyncState, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, type ModelUrlKey, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, PRESERVE_POSITION_BONES, PlaybackPipeline, type PlaybackPipelineConfig, type PlaybackPipelineEvents, type PlaybackState, ProceduralLifeLayer, type Quat, type QuotaInfo, type ResolvedEmotion, type ResponseHandler, RingBuffer, type RuntimeBackend, type SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SenseVoiceBackend, type SenseVoiceConfig, type SenseVoiceLanguage, type SenseVoiceModelInfo, type SenseVoiceResult, SenseVoiceUnifiedAdapter, type SenseVoiceWorkerConfig, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADUnifiedAdapter, type SpanAttributes, type SpanData, type SpeechErrorCallback, SpeechListener, type SpeechListenerConfig, type SpeechListenerEvents, type SpeechListenerState, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, type SynthesizeOptions, type TTSBackend, type TTSChunk, TTSPlayback, type TTSPlaybackConfig, type TTSPlaybackEvents, TTSPlayer, TTSSpeaker, type TTSSpeakerConfig, type TTSStreamOptions, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TrackDescriptor, type TranscriptResult, type Transition, UnifiedInferenceWorker, type VADBackend, type VADModelInfo, type VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, type Vec3, VoiceOrchestrator, type VoiceOrchestratorCloudConfig, type VoiceOrchestratorConfig, type VoiceOrchestratorEvents, type VoiceOrchestratorLocalConfig, type VoicePipelineState, type WorkerHealthState, analyzeTextEmotion, applyProfile, blendEmotions, calculatePeak, calculateRMS, configureCacheLimit, configureModelUrls, configureTelemetry, createA2E, createEmotionVector, createKokoroTTS, createSenseVoice, createSileroVAD, createTTSPlayer, fetchWithCache, float32ToPcm16, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getModelCache, getOptimalWasmThreads, getRecommendedBackend, getTelemetry, hasWebGPUApi, int16ToFloat32, isAndroid, isIOS, isIOSSafari, isMobile, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpBlendshapes, lerpEmotion, listVoices as listKokoroVoices, parseEmotionTags, pcm16ToFloat32, preloadModels, resampleLinear, resetModelUrls, resolveBackend, resolveEmotion, shouldEnableWasmProxy, shouldKeepTrack, shouldUseNativeASR, shouldUseServerA2E, stripMixamoPrefix, ttsToPlaybackFormat, validateTTSInput };
|