@omote/core 0.9.7 → 0.10.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +77 -35
- package/dist/{chunk-X5OTUOE6.mjs → chunk-3FILA2CD.mjs} +63 -205
- package/dist/chunk-3FILA2CD.mjs.map +1 -0
- package/dist/{chunk-CYBTTLG7.mjs → chunk-5WIOGMJA.mjs} +77 -219
- package/dist/chunk-5WIOGMJA.mjs.map +1 -0
- package/dist/{chunk-3NDJA3I4.mjs → chunk-NWZMIQK4.mjs} +135 -206
- package/dist/chunk-NWZMIQK4.mjs.map +1 -0
- package/dist/{chunk-Y3DTP5P3.mjs → chunk-VSYYT4HO.mjs} +1 -1
- package/dist/{chunk-X5OTUOE6.mjs.map → chunk-VSYYT4HO.mjs.map} +1 -1
- package/dist/chunk-WW4XAUJ3.mjs +208 -0
- package/dist/chunk-WW4XAUJ3.mjs.map +1 -0
- package/dist/index.d.mts +336 -1375
- package/dist/index.d.ts +336 -1375
- package/dist/index.js +6738 -11284
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +6099 -10719
- package/dist/index.mjs.map +1 -1
- package/dist/logging/index.js +5 -0
- package/dist/logging/index.js.map +1 -1
- package/dist/logging/index.mjs +1 -1
- package/dist/otlp-2BML6FIK.mjs +7 -0
- package/dist/otlp-2BML6FIK.mjs.map +1 -0
- package/package.json +1 -2
- package/dist/Logger-BeUI6jG7.d.mts +0 -145
- package/dist/Logger-BeUI6jG7.d.ts +0 -145
- package/dist/Logger-DSoGAYJu.d.mts +0 -141
- package/dist/Logger-DSoGAYJu.d.ts +0 -141
- package/dist/chunk-3NDJA3I4.mjs.map +0 -1
- package/dist/chunk-CYBTTLG7.mjs.map +0 -1
- package/dist/chunk-ESU52TDS.mjs +0 -287
- package/dist/chunk-ESU52TDS.mjs.map +0 -1
- package/dist/chunk-MXKJOF4I.mjs +0 -38
- package/dist/chunk-MXKJOF4I.mjs.map +0 -1
- package/dist/chunk-XK22BRG4.mjs +0 -38
- package/dist/chunk-XK22BRG4.mjs.map +0 -1
- package/dist/chunk-Y3DTP5P3.mjs.map +0 -1
package/dist/index.d.ts
CHANGED
|
@@ -470,7 +470,7 @@ declare function shouldUseServerA2E(): boolean;
|
|
|
470
470
|
/**
|
|
471
471
|
* Common interface for audio-to-expression (A2E) inference backends
|
|
472
472
|
*
|
|
473
|
-
* Implemented by
|
|
473
|
+
* Implemented by A2EUnifiedAdapter, allowing PlaybackPipeline
|
|
474
474
|
* and A2EProcessor to work with either implementation transparently.
|
|
475
475
|
*
|
|
476
476
|
* @category Inference
|
|
@@ -488,11 +488,11 @@ interface A2EModelInfo {
|
|
|
488
488
|
/**
|
|
489
489
|
* Result from A2E inference
|
|
490
490
|
*
|
|
491
|
-
* All implementations must return blendshapes in
|
|
491
|
+
* All implementations must return blendshapes in ARKIT_BLENDSHAPES order (alphabetical).
|
|
492
492
|
* Models with different native orderings must remap internally before returning.
|
|
493
493
|
*/
|
|
494
494
|
interface A2EResult {
|
|
495
|
-
/** Blendshape weights [frames, 52] in
|
|
495
|
+
/** Blendshape weights [frames, 52] in ARKIT_BLENDSHAPES order - 30fps */
|
|
496
496
|
blendshapes: Float32Array[];
|
|
497
497
|
/** Number of blendshape frames */
|
|
498
498
|
numFrames: number;
|
|
@@ -507,10 +507,8 @@ interface A2EResult {
|
|
|
507
507
|
* pipeline — A2E is the interface abstraction, LAM is the model.
|
|
508
508
|
*
|
|
509
509
|
* Implemented by:
|
|
510
|
-
* - {@link
|
|
511
|
-
* - A2EUnifiedAdapter (shared unified worker)
|
|
510
|
+
* - {@link A2EUnifiedAdapter} (shared unified worker)
|
|
512
511
|
*
|
|
513
|
-
* @see {@link A2EInference} for direct usage
|
|
514
512
|
* @see {@link createA2E} for the recommended factory API
|
|
515
513
|
*/
|
|
516
514
|
interface A2EBackend {
|
|
@@ -531,7 +529,7 @@ interface A2EBackend {
|
|
|
531
529
|
* Run inference on raw audio
|
|
532
530
|
* @param audioSamples - Float32Array of raw audio at 16kHz
|
|
533
531
|
* @param identityIndex - Optional identity index (ignored by CPU model)
|
|
534
|
-
* @returns A2E result with blendshapes in
|
|
532
|
+
* @returns A2E result with blendshapes in ARKIT_BLENDSHAPES order
|
|
535
533
|
*/
|
|
536
534
|
infer(audioSamples: Float32Array, identityIndex?: number): Promise<A2EResult>;
|
|
537
535
|
/**
|
|
@@ -544,7 +542,7 @@ interface A2EBackend {
|
|
|
544
542
|
* ExpressionProfile - Per-character weight scaling for A2E blendshape output
|
|
545
543
|
*
|
|
546
544
|
* Maps blendshape groups (eyes, brows, jaw, mouth, cheeks, nose, tongue)
|
|
547
|
-
* to weight scalers. Used by PlaybackPipeline, MicLipSync, and
|
|
545
|
+
* to weight scalers. Used by PlaybackPipeline, MicLipSync, and VoiceOrchestrator.
|
|
548
546
|
*
|
|
549
547
|
* @category Audio
|
|
550
548
|
*/
|
|
@@ -575,7 +573,7 @@ interface ExpressionProfile {
|
|
|
575
573
|
overrides?: Partial<Record<string, number>>;
|
|
576
574
|
}
|
|
577
575
|
/**
|
|
578
|
-
* Map each
|
|
576
|
+
* Map each ARKIT_BLENDSHAPES entry to its BlendshapeGroup.
|
|
579
577
|
* Built once at module load from prefix matching.
|
|
580
578
|
*/
|
|
581
579
|
declare const BLENDSHAPE_TO_GROUP: Map<string, BlendshapeGroup>;
|
|
@@ -698,6 +696,8 @@ declare class PlaybackPipeline extends EventEmitter<PlaybackPipelineEvents> {
|
|
|
698
696
|
constructor(config: PlaybackPipelineConfig);
|
|
699
697
|
/** Initialize AudioContext (lazy, call after user gesture) */
|
|
700
698
|
initialize(): Promise<void>;
|
|
699
|
+
/** Eagerly create AudioContext. Call from user gesture for iOS. */
|
|
700
|
+
warmup(): Promise<void>;
|
|
701
701
|
/** Update ExpressionProfile at runtime */
|
|
702
702
|
setProfile(profile: ExpressionProfile): void;
|
|
703
703
|
/** Set the emotion label to include in emitted frames */
|
|
@@ -744,7 +744,7 @@ declare class PlaybackPipeline extends EventEmitter<PlaybackPipelineEvents> {
|
|
|
744
744
|
* TTSBackend — Streaming text-to-speech backend interface.
|
|
745
745
|
*
|
|
746
746
|
* Any TTS engine (Kokoro, ElevenLabs, etc.) can implement this contract
|
|
747
|
-
* to integrate with TTSPlayback and
|
|
747
|
+
* to integrate with TTSPlayback and VoiceOrchestrator.
|
|
748
748
|
*
|
|
749
749
|
* @category Inference
|
|
750
750
|
*/
|
|
@@ -788,6 +788,10 @@ interface TTSStreamOptions {
|
|
|
788
788
|
voice?: string;
|
|
789
789
|
/** Speed multiplier override per-call */
|
|
790
790
|
speed?: number;
|
|
791
|
+
/** Language override per-call (e.g. 'en-us', 'ja'). Default: derived from voice name. */
|
|
792
|
+
language?: string;
|
|
793
|
+
/** When true, emit the entire text as a single chunk (no sentence splitting). */
|
|
794
|
+
singleShot?: boolean;
|
|
791
795
|
}
|
|
792
796
|
/**
|
|
793
797
|
* A single chunk of TTS audio output
|
|
@@ -863,7 +867,11 @@ declare class TTSPlayback extends EventEmitter<TTSPlaybackEvents> {
|
|
|
863
867
|
speak(text: string, options?: {
|
|
864
868
|
signal?: AbortSignal;
|
|
865
869
|
voice?: string;
|
|
870
|
+
speed?: number;
|
|
871
|
+
language?: string;
|
|
866
872
|
}): Promise<void>;
|
|
873
|
+
/** Eagerly create AudioContext. Call from user gesture for iOS. */
|
|
874
|
+
warmup(): Promise<void>;
|
|
867
875
|
/** Dispose of all resources. */
|
|
868
876
|
dispose(): Promise<void>;
|
|
869
877
|
private speakWithPrefetch;
|
|
@@ -900,34 +908,9 @@ declare class TTSPlayback extends EventEmitter<TTSPlaybackEvents> {
|
|
|
900
908
|
declare function isWebGPUAvailable(): Promise<boolean>;
|
|
901
909
|
|
|
902
910
|
/**
|
|
903
|
-
* SenseVoice
|
|
904
|
-
*
|
|
905
|
-
* Non-autoregressive CTC-based ASR that is 5x faster than Whisper-Small.
|
|
906
|
-
* Runs entirely in browser via WebGPU or WASM. No transformers.js dependency.
|
|
907
|
-
*
|
|
908
|
-
* Uses the sherpa-onnx SenseVoice export (model.int8.onnx, 239MB int8 quantized).
|
|
909
|
-
* Also provides emotion detection, language identification, and audio event detection
|
|
910
|
-
* from the same forward pass.
|
|
911
|
+
* SenseVoice type definitions
|
|
911
912
|
*
|
|
912
913
|
* @category Inference
|
|
913
|
-
*
|
|
914
|
-
* @example Basic usage
|
|
915
|
-
* ```typescript
|
|
916
|
-
* import { SenseVoiceInference } from '@omote/core';
|
|
917
|
-
*
|
|
918
|
-
* const asr = new SenseVoiceInference({
|
|
919
|
-
* modelUrl: '/models/sensevoice/model.int8.onnx',
|
|
920
|
-
* tokensUrl: '/models/sensevoice/tokens.txt',
|
|
921
|
-
* });
|
|
922
|
-
* await asr.load();
|
|
923
|
-
*
|
|
924
|
-
* const { text, emotion, language } = await asr.transcribe(audioSamples);
|
|
925
|
-
* console.log(text); // "Hello world"
|
|
926
|
-
* console.log(emotion); // "NEUTRAL"
|
|
927
|
-
* console.log(language); // "en"
|
|
928
|
-
* ```
|
|
929
|
-
*
|
|
930
|
-
* @module inference/SenseVoiceInference
|
|
931
914
|
*/
|
|
932
915
|
|
|
933
916
|
type SenseVoiceLanguage = 'auto' | 'zh' | 'en' | 'ja' | 'ko' | 'yue';
|
|
@@ -964,76 +947,49 @@ interface SenseVoiceModelInfo {
|
|
|
964
947
|
outputNames: string[];
|
|
965
948
|
vocabSize: number;
|
|
966
949
|
}
|
|
967
|
-
|
|
968
|
-
|
|
969
|
-
|
|
970
|
-
|
|
971
|
-
|
|
972
|
-
|
|
973
|
-
|
|
974
|
-
|
|
975
|
-
|
|
976
|
-
|
|
977
|
-
|
|
978
|
-
|
|
979
|
-
|
|
980
|
-
|
|
981
|
-
|
|
982
|
-
|
|
983
|
-
|
|
984
|
-
|
|
985
|
-
|
|
950
|
+
/**
|
|
951
|
+
* Configuration for SenseVoice Worker (used by SenseVoiceUnifiedAdapter)
|
|
952
|
+
*/
|
|
953
|
+
interface SenseVoiceWorkerConfig {
|
|
954
|
+
/** Path or URL to model.int8.onnx (239MB) */
|
|
955
|
+
modelUrl: string;
|
|
956
|
+
/** Path or URL to tokens.txt vocabulary file (default: sibling of modelUrl) */
|
|
957
|
+
tokensUrl?: string;
|
|
958
|
+
/** Language hint (default: 'auto' for auto-detection) */
|
|
959
|
+
language?: SenseVoiceLanguage;
|
|
960
|
+
/** Text normalization: 'with_itn' applies inverse text normalization (default: 'with_itn') */
|
|
961
|
+
textNorm?: 'with_itn' | 'without_itn';
|
|
962
|
+
}
|
|
963
|
+
/**
|
|
964
|
+
* Common interface for SenseVoice implementations
|
|
965
|
+
*/
|
|
966
|
+
interface SenseVoiceBackend {
|
|
967
|
+
/** Whether the model is loaded and ready for inference */
|
|
968
|
+
readonly isLoaded: boolean;
|
|
969
|
+
/** Current backend type ('wasm' for worker, full backend for main thread, null if not loaded) */
|
|
970
|
+
readonly backend: 'wasm' | 'webgpu' | null;
|
|
971
|
+
/**
|
|
972
|
+
* Load the ONNX model
|
|
973
|
+
* @param onProgress - Optional progress callback (fires once at 100% for worker)
|
|
974
|
+
* @returns Model loading information
|
|
975
|
+
*/
|
|
986
976
|
load(onProgress?: (loaded: number, total: number) => void): Promise<SenseVoiceModelInfo>;
|
|
987
977
|
/**
|
|
988
978
|
* Transcribe audio samples to text
|
|
989
|
-
*
|
|
990
|
-
* @
|
|
991
|
-
* @returns Transcription result with text, emotion, language, and event
|
|
979
|
+
* @param audioSamples - Float32Array of audio samples at 16kHz
|
|
980
|
+
* @returns Transcription result
|
|
992
981
|
*/
|
|
993
982
|
transcribe(audioSamples: Float32Array): Promise<SenseVoiceResult>;
|
|
994
|
-
|
|
983
|
+
/**
|
|
984
|
+
* Dispose of the model and free resources
|
|
985
|
+
*/
|
|
995
986
|
dispose(): Promise<void>;
|
|
996
987
|
}
|
|
997
988
|
|
|
998
989
|
/**
|
|
999
|
-
* Silero VAD
|
|
1000
|
-
*
|
|
1001
|
-
* Neural network-based VAD running in browser via ONNX Runtime Web.
|
|
1002
|
-
* Much more accurate than RMS-based energy detection.
|
|
1003
|
-
*
|
|
1004
|
-
* Uses lazy loading to conditionally load WebGPU or WASM-only bundle:
|
|
1005
|
-
* - iOS: Loads WASM-only bundle (WebGPU crashes due to Safari bugs)
|
|
1006
|
-
* - Android/Desktop: Loads WebGPU bundle (with WASM fallback)
|
|
990
|
+
* Silero VAD type definitions
|
|
1007
991
|
*
|
|
1008
992
|
* @category Inference
|
|
1009
|
-
*
|
|
1010
|
-
* @example Basic usage
|
|
1011
|
-
* ```typescript
|
|
1012
|
-
* import { SileroVADInference } from '@omote/core';
|
|
1013
|
-
*
|
|
1014
|
-
* const vad = new SileroVADInference({
|
|
1015
|
-
* modelUrl: '/models/silero-vad.onnx'
|
|
1016
|
-
* });
|
|
1017
|
-
* await vad.load();
|
|
1018
|
-
*
|
|
1019
|
-
* // Process 32ms chunks (512 samples at 16kHz)
|
|
1020
|
-
* const probability = await vad.process(audioChunk);
|
|
1021
|
-
* if (probability > 0.5) {
|
|
1022
|
-
* console.log('Speech detected!');
|
|
1023
|
-
* }
|
|
1024
|
-
* ```
|
|
1025
|
-
*
|
|
1026
|
-
* @example Streaming with state management
|
|
1027
|
-
* ```typescript
|
|
1028
|
-
* // State is automatically maintained between process() calls
|
|
1029
|
-
* // Call reset() when starting a new audio stream
|
|
1030
|
-
* vad.reset();
|
|
1031
|
-
*
|
|
1032
|
-
* for (const chunk of audioChunks) {
|
|
1033
|
-
* const prob = await vad.process(chunk);
|
|
1034
|
-
* // prob is speech probability [0, 1]
|
|
1035
|
-
* }
|
|
1036
|
-
* ```
|
|
1037
993
|
*/
|
|
1038
994
|
|
|
1039
995
|
type VADBackend = BackendPreference;
|
|
@@ -1103,117 +1059,6 @@ interface SpeechSegment {
|
|
|
1103
1059
|
/** Average probability during segment */
|
|
1104
1060
|
avgProbability: number;
|
|
1105
1061
|
}
|
|
1106
|
-
/**
|
|
1107
|
-
* Silero VAD - Neural network voice activity detection
|
|
1108
|
-
*
|
|
1109
|
-
* Based on snakers4/silero-vad ONNX model.
|
|
1110
|
-
* Processes 32ms chunks (512 samples at 16kHz) with LSTM state.
|
|
1111
|
-
*
|
|
1112
|
-
* @see https://github.com/snakers4/silero-vad
|
|
1113
|
-
*/
|
|
1114
|
-
declare class SileroVADInference {
|
|
1115
|
-
private session;
|
|
1116
|
-
private ort;
|
|
1117
|
-
private config;
|
|
1118
|
-
private _backend;
|
|
1119
|
-
private isLoading;
|
|
1120
|
-
private state;
|
|
1121
|
-
private context;
|
|
1122
|
-
private readonly chunkSize;
|
|
1123
|
-
private readonly contextSize;
|
|
1124
|
-
private inferenceQueue;
|
|
1125
|
-
private preSpeechBuffer;
|
|
1126
|
-
private wasSpeaking;
|
|
1127
|
-
private srTensor;
|
|
1128
|
-
constructor(config: SileroVADConfig);
|
|
1129
|
-
get backend(): RuntimeBackend | null;
|
|
1130
|
-
get isLoaded(): boolean;
|
|
1131
|
-
get sampleRate(): number;
|
|
1132
|
-
get threshold(): number;
|
|
1133
|
-
/**
|
|
1134
|
-
* Get required chunk size in samples
|
|
1135
|
-
*/
|
|
1136
|
-
getChunkSize(): number;
|
|
1137
|
-
/**
|
|
1138
|
-
* Get chunk duration in milliseconds
|
|
1139
|
-
*/
|
|
1140
|
-
getChunkDurationMs(): number;
|
|
1141
|
-
/**
|
|
1142
|
-
* Check if WebGPU is available and working
|
|
1143
|
-
* (iOS returns false even if navigator.gpu exists due to ONNX Runtime bugs)
|
|
1144
|
-
*/
|
|
1145
|
-
static isWebGPUAvailable: typeof isWebGPUAvailable;
|
|
1146
|
-
/**
|
|
1147
|
-
* Load the ONNX model
|
|
1148
|
-
*/
|
|
1149
|
-
load(): Promise<VADModelInfo>;
|
|
1150
|
-
/**
|
|
1151
|
-
* Reset state for new audio stream
|
|
1152
|
-
*/
|
|
1153
|
-
reset(): void;
|
|
1154
|
-
/**
|
|
1155
|
-
* Process a single audio chunk
|
|
1156
|
-
*
|
|
1157
|
-
* @param audioChunk - Float32Array of exactly chunkSize samples (512 for 16kHz, 256 for 8kHz)
|
|
1158
|
-
* @returns VAD result with speech probability
|
|
1159
|
-
*/
|
|
1160
|
-
process(audioChunk: Float32Array): Promise<VADResult>;
|
|
1161
|
-
/**
|
|
1162
|
-
* Process audio and detect speech segments
|
|
1163
|
-
*
|
|
1164
|
-
* @param audio - Complete audio buffer
|
|
1165
|
-
* @param options - Detection options
|
|
1166
|
-
* @returns Array of speech segments
|
|
1167
|
-
*/
|
|
1168
|
-
detectSpeech(audio: Float32Array, options?: {
|
|
1169
|
-
/** Minimum speech duration in ms (default: 250) */
|
|
1170
|
-
minSpeechDurationMs?: number;
|
|
1171
|
-
/** Minimum silence duration to end segment in ms (default: 300) */
|
|
1172
|
-
minSilenceDurationMs?: number;
|
|
1173
|
-
/** Padding to add before/after speech in ms (default: 30) */
|
|
1174
|
-
speechPadMs?: number;
|
|
1175
|
-
}): Promise<SpeechSegment[]>;
|
|
1176
|
-
/**
|
|
1177
|
-
* Queue inference to serialize ONNX session calls
|
|
1178
|
-
*/
|
|
1179
|
-
private queueInference;
|
|
1180
|
-
/**
|
|
1181
|
-
* Dispose of the model and free resources
|
|
1182
|
-
*/
|
|
1183
|
-
dispose(): Promise<void>;
|
|
1184
|
-
}
|
|
1185
|
-
|
|
1186
|
-
/**
|
|
1187
|
-
* Silero VAD Web Worker implementation
|
|
1188
|
-
*
|
|
1189
|
-
* Runs Silero VAD inference in a dedicated Web Worker to prevent main thread blocking.
|
|
1190
|
-
* Uses inline worker script (Blob URL pattern) to avoid separate file deployment.
|
|
1191
|
-
*
|
|
1192
|
-
* Key design decisions:
|
|
1193
|
-
* - WASM backend only (WebGPU doesn't work in Workers)
|
|
1194
|
-
* - LSTM state serialized as Float32Array (Tensors can't cross worker boundary)
|
|
1195
|
-
* - Audio copied (not transferred) to retain main thread access for pre-speech buffer
|
|
1196
|
-
* - ONNX Runtime loaded from CDN in worker (no bundler complications)
|
|
1197
|
-
*
|
|
1198
|
-
* @category Inference
|
|
1199
|
-
*
|
|
1200
|
-
* @example Basic usage
|
|
1201
|
-
* ```typescript
|
|
1202
|
-
* import { SileroVADWorker } from '@omote/core';
|
|
1203
|
-
*
|
|
1204
|
-
* const vad = new SileroVADWorker({
|
|
1205
|
-
* modelUrl: '/models/silero-vad.onnx'
|
|
1206
|
-
* });
|
|
1207
|
-
* await vad.load();
|
|
1208
|
-
*
|
|
1209
|
-
* // Process 32ms chunks (512 samples at 16kHz)
|
|
1210
|
-
* const result = await vad.process(audioChunk);
|
|
1211
|
-
* if (result.isSpeech) {
|
|
1212
|
-
* console.log('Speech detected!', result.probability);
|
|
1213
|
-
* }
|
|
1214
|
-
* ```
|
|
1215
|
-
*/
|
|
1216
|
-
|
|
1217
1062
|
/**
|
|
1218
1063
|
* Configuration for Silero VAD Worker
|
|
1219
1064
|
*/
|
|
@@ -1226,13 +1071,6 @@ interface VADWorkerConfig {
|
|
|
1226
1071
|
threshold?: number;
|
|
1227
1072
|
/**
|
|
1228
1073
|
* Number of audio chunks to keep in pre-speech buffer.
|
|
1229
|
-
* When VAD triggers, these chunks are prepended to the speech buffer
|
|
1230
|
-
* to capture the beginning of speech that occurred before detection.
|
|
1231
|
-
*
|
|
1232
|
-
* At 512 samples/chunk and 16kHz:
|
|
1233
|
-
* - 10 chunks = 320ms of pre-speech audio
|
|
1234
|
-
* - 15 chunks = 480ms of pre-speech audio
|
|
1235
|
-
*
|
|
1236
1074
|
* Default: 10 chunks (320ms)
|
|
1237
1075
|
*/
|
|
1238
1076
|
preSpeechBufferChunks?: number;
|
|
@@ -1248,85 +1086,45 @@ interface VADWorkerModelInfo {
|
|
|
1248
1086
|
sampleRate: number;
|
|
1249
1087
|
chunkSize: number;
|
|
1250
1088
|
}
|
|
1251
|
-
|
|
1252
1089
|
/**
|
|
1253
|
-
*
|
|
1254
|
-
*
|
|
1255
|
-
* Runs Silero VAD inference off the main thread to prevent UI blocking.
|
|
1256
|
-
* Feature parity with SileroVADInference but runs in dedicated worker.
|
|
1257
|
-
*
|
|
1258
|
-
* @see SileroVADInference for main-thread version
|
|
1090
|
+
* Common interface for Silero VAD implementations
|
|
1259
1091
|
*/
|
|
1260
|
-
|
|
1261
|
-
|
|
1262
|
-
|
|
1263
|
-
|
|
1264
|
-
|
|
1265
|
-
|
|
1266
|
-
|
|
1267
|
-
|
|
1268
|
-
|
|
1269
|
-
private readonly contextSize;
|
|
1270
|
-
private inferenceQueue;
|
|
1271
|
-
private preSpeechBuffer;
|
|
1272
|
-
private wasSpeaking;
|
|
1273
|
-
private pendingResolvers;
|
|
1274
|
-
private messageId;
|
|
1275
|
-
constructor(config: VADWorkerConfig);
|
|
1276
|
-
get isLoaded(): boolean;
|
|
1277
|
-
/**
|
|
1278
|
-
* Backend type (always 'wasm' for Worker, WebGPU not supported in Workers)
|
|
1279
|
-
*/
|
|
1280
|
-
get backend(): 'wasm' | null;
|
|
1281
|
-
get sampleRate(): number;
|
|
1282
|
-
get threshold(): number;
|
|
1283
|
-
/**
|
|
1284
|
-
* Get required chunk size in samples
|
|
1285
|
-
*/
|
|
1286
|
-
getChunkSize(): number;
|
|
1287
|
-
/**
|
|
1288
|
-
* Get chunk duration in milliseconds
|
|
1289
|
-
*/
|
|
1290
|
-
getChunkDurationMs(): number;
|
|
1291
|
-
/**
|
|
1292
|
-
* Create the worker from inline script
|
|
1293
|
-
*/
|
|
1294
|
-
private createWorker;
|
|
1295
|
-
/**
|
|
1296
|
-
* Handle messages from worker
|
|
1297
|
-
*/
|
|
1298
|
-
private handleWorkerMessage;
|
|
1299
|
-
/**
|
|
1300
|
-
* Send message to worker and wait for response
|
|
1301
|
-
*/
|
|
1302
|
-
private sendMessage;
|
|
1303
|
-
/**
|
|
1304
|
-
* Load the ONNX model in the worker
|
|
1305
|
-
*/
|
|
1306
|
-
load(): Promise<VADWorkerModelInfo>;
|
|
1092
|
+
interface SileroVADBackend {
|
|
1093
|
+
/** Current backend type (webgpu, wasm, or null if not loaded) */
|
|
1094
|
+
readonly backend: RuntimeBackend | null;
|
|
1095
|
+
/** Whether the model is loaded and ready for inference */
|
|
1096
|
+
readonly isLoaded: boolean;
|
|
1097
|
+
/** Audio sample rate (8000 or 16000 Hz) */
|
|
1098
|
+
readonly sampleRate: number;
|
|
1099
|
+
/** Speech detection threshold (0-1) */
|
|
1100
|
+
readonly threshold: number;
|
|
1307
1101
|
/**
|
|
1308
|
-
*
|
|
1102
|
+
* Load the ONNX model
|
|
1103
|
+
* @returns Model loading information
|
|
1309
1104
|
*/
|
|
1310
|
-
|
|
1105
|
+
load(): Promise<VADModelInfo | VADWorkerModelInfo>;
|
|
1311
1106
|
/**
|
|
1312
1107
|
* Process a single audio chunk
|
|
1313
|
-
*
|
|
1314
|
-
* @param audioChunk - Float32Array of exactly chunkSize samples (512 for 16kHz, 256 for 8kHz)
|
|
1108
|
+
* @param audioChunk - Float32Array of exactly chunkSize samples
|
|
1315
1109
|
* @returns VAD result with speech probability
|
|
1316
1110
|
*/
|
|
1317
1111
|
process(audioChunk: Float32Array): Promise<VADResult>;
|
|
1318
1112
|
/**
|
|
1319
|
-
*
|
|
1113
|
+
* Reset state for new audio stream
|
|
1320
1114
|
*/
|
|
1321
|
-
|
|
1115
|
+
reset(): void | Promise<void>;
|
|
1322
1116
|
/**
|
|
1323
|
-
* Dispose of the
|
|
1117
|
+
* Dispose of the model and free resources
|
|
1324
1118
|
*/
|
|
1325
1119
|
dispose(): Promise<void>;
|
|
1326
1120
|
/**
|
|
1327
|
-
*
|
|
1121
|
+
* Get required chunk size in samples
|
|
1328
1122
|
*/
|
|
1329
|
-
|
|
1123
|
+
getChunkSize(): number;
|
|
1124
|
+
/**
|
|
1125
|
+
* Get chunk duration in milliseconds
|
|
1126
|
+
*/
|
|
1127
|
+
getChunkDurationMs(): number;
|
|
1330
1128
|
}
|
|
1331
1129
|
|
|
1332
1130
|
/**
|
|
@@ -1454,43 +1252,33 @@ declare class UnifiedInferenceWorker {
|
|
|
1454
1252
|
|
|
1455
1253
|
/** Base config shared across all inference factory functions */
|
|
1456
1254
|
interface InferenceFactoryConfig {
|
|
1457
|
-
/**
|
|
1458
|
-
* Worker mode:
|
|
1459
|
-
* - 'auto' (default): Use Worker if supported, else main thread
|
|
1460
|
-
* - true: Force Worker (throws if unsupported)
|
|
1461
|
-
* - false: Force main thread
|
|
1462
|
-
*/
|
|
1463
|
-
useWorker?: boolean | 'auto';
|
|
1464
1255
|
/**
|
|
1465
1256
|
* Unified inference worker instance.
|
|
1466
|
-
*
|
|
1257
|
+
* Routes inference through the shared worker,
|
|
1467
1258
|
* keeping all inference off the main thread.
|
|
1468
|
-
* Takes precedence over useWorker setting.
|
|
1469
1259
|
*/
|
|
1470
1260
|
unifiedWorker?: UnifiedInferenceWorker;
|
|
1471
1261
|
}
|
|
1472
1262
|
|
|
1473
1263
|
/**
|
|
1474
|
-
* Factory function for A2E inference
|
|
1264
|
+
* Factory function for A2E inference via UnifiedInferenceWorker
|
|
1475
1265
|
*
|
|
1476
1266
|
* Creates an A2EBackend instance with zero-config defaults (HuggingFace CDN).
|
|
1477
|
-
*
|
|
1267
|
+
* Routes inference through the shared unified worker.
|
|
1478
1268
|
*
|
|
1479
1269
|
* @category Inference
|
|
1480
1270
|
*
|
|
1481
|
-
* @example
|
|
1271
|
+
* @example
|
|
1482
1272
|
* ```typescript
|
|
1483
|
-
* import { createA2E } from '@omote/core';
|
|
1273
|
+
* import { createA2E, UnifiedInferenceWorker } from '@omote/core';
|
|
1274
|
+
*
|
|
1275
|
+
* const worker = new UnifiedInferenceWorker();
|
|
1276
|
+
* await worker.init();
|
|
1484
1277
|
*
|
|
1485
|
-
* const a2e = createA2E(
|
|
1278
|
+
* const a2e = createA2E({ unifiedWorker: worker });
|
|
1486
1279
|
* await a2e.load();
|
|
1487
1280
|
* const { blendshapes } = await a2e.infer(audioSamples);
|
|
1488
1281
|
* ```
|
|
1489
|
-
*
|
|
1490
|
-
* @example Custom model URL
|
|
1491
|
-
* ```typescript
|
|
1492
|
-
* const a2e = createA2E({ modelUrl: '/models/lam.onnx' });
|
|
1493
|
-
* ```
|
|
1494
1282
|
*/
|
|
1495
1283
|
|
|
1496
1284
|
/**
|
|
@@ -1506,13 +1294,13 @@ interface CreateA2EConfig extends InferenceFactoryConfig {
|
|
|
1506
1294
|
* Set to `false` to skip external data loading (single-file models only).
|
|
1507
1295
|
*/
|
|
1508
1296
|
externalDataUrl?: string | false;
|
|
1509
|
-
/** Backend preference (default: 'auto') */
|
|
1510
|
-
backend?: BackendPreference;
|
|
1511
1297
|
/** Number of identity classes (default: 12) */
|
|
1512
1298
|
numIdentityClasses?: number;
|
|
1513
1299
|
}
|
|
1514
1300
|
/**
|
|
1515
|
-
* Create an A2E instance
|
|
1301
|
+
* Create an A2E instance via the unified worker.
|
|
1302
|
+
*
|
|
1303
|
+
* If no `unifiedWorker` is provided, a dedicated worker is created on load().
|
|
1516
1304
|
*
|
|
1517
1305
|
* @param config - Factory configuration
|
|
1518
1306
|
* @returns An A2EBackend instance
|
|
@@ -1528,7 +1316,7 @@ declare function createA2E(config?: CreateA2EConfig): A2EBackend;
|
|
|
1528
1316
|
/**
|
|
1529
1317
|
* Generic frame source -- any object that emits 'frame' events with blendshapes.
|
|
1530
1318
|
*
|
|
1531
|
-
* Implemented by PlaybackPipeline, MicLipSync,
|
|
1319
|
+
* Implemented by PlaybackPipeline, MicLipSync, and any custom source.
|
|
1532
1320
|
* Used by OmoteAvatar (all renderer adapters) to receive animation frames.
|
|
1533
1321
|
*/
|
|
1534
1322
|
interface FrameSource {
|
|
@@ -1557,7 +1345,7 @@ interface TranscriptResult {
|
|
|
1557
1345
|
inferenceTimeMs?: number;
|
|
1558
1346
|
}
|
|
1559
1347
|
/**
|
|
1560
|
-
* Consumer's response handler.
|
|
1348
|
+
* Consumer's response handler. VoiceOrchestrator calls this with transcribed text.
|
|
1561
1349
|
* Consumer must stream audio back for playback + lip sync.
|
|
1562
1350
|
*/
|
|
1563
1351
|
interface ResponseHandler {
|
|
@@ -1588,6 +1376,8 @@ interface ResponseHandler {
|
|
|
1588
1376
|
*/
|
|
1589
1377
|
|
|
1590
1378
|
interface TTSSpeakerConfig {
|
|
1379
|
+
/** Skip LAM download — audio playback only, no lip sync. Default: false. */
|
|
1380
|
+
audioOnly?: boolean;
|
|
1591
1381
|
/** Per-character expression weight scaling */
|
|
1592
1382
|
profile?: ExpressionProfile;
|
|
1593
1383
|
/** Identity/style index for A2E model (default: 0) */
|
|
@@ -1600,8 +1390,8 @@ interface TTSSpeakerConfig {
|
|
|
1600
1390
|
neutralTransitionMs?: number;
|
|
1601
1391
|
/** Pre-built A2E backend (skip internal createA2E). */
|
|
1602
1392
|
lam?: A2EBackend;
|
|
1603
|
-
/** LAM model config (only when lam not provided) */
|
|
1604
|
-
models?: CreateA2EConfig
|
|
1393
|
+
/** LAM model config (only when lam not provided). unifiedWorker is supplied by TTSSpeaker. */
|
|
1394
|
+
models?: Omit<CreateA2EConfig, 'unifiedWorker'>;
|
|
1605
1395
|
/** Shared unified worker (recommended for iOS) */
|
|
1606
1396
|
unifiedWorker?: UnifiedInferenceWorker;
|
|
1607
1397
|
}
|
|
@@ -1610,6 +1400,7 @@ declare class TTSSpeaker {
|
|
|
1610
1400
|
private tts;
|
|
1611
1401
|
private ownedLam;
|
|
1612
1402
|
private ownedWorker;
|
|
1403
|
+
private usesSharedWorker;
|
|
1613
1404
|
private currentAbort;
|
|
1614
1405
|
private _isSpeaking;
|
|
1615
1406
|
private _audioOnly;
|
|
@@ -1623,11 +1414,8 @@ declare class TTSSpeaker {
|
|
|
1623
1414
|
/**
|
|
1624
1415
|
* Connect a TTS backend.
|
|
1625
1416
|
*
|
|
1626
|
-
*
|
|
1627
|
-
*
|
|
1628
|
-
*
|
|
1629
|
-
* When config is omitted or has none of those, audio-only mode is used:
|
|
1630
|
-
* TTS → AudioScheduler (speakers only, no blendshapes, no LAM download).
|
|
1417
|
+
* By default, the full lip sync pipeline is created (auto-downloads LAM).
|
|
1418
|
+
* Pass `audioOnly: true` for audio-only mode (no blendshapes, no LAM download).
|
|
1631
1419
|
*
|
|
1632
1420
|
* @param tts - TTS backend to use for speech synthesis
|
|
1633
1421
|
* @param config - Optional configuration for A2E, expression profile, etc.
|
|
@@ -1643,6 +1431,8 @@ declare class TTSSpeaker {
|
|
|
1643
1431
|
speak(text: string, options?: {
|
|
1644
1432
|
signal?: AbortSignal;
|
|
1645
1433
|
voice?: string;
|
|
1434
|
+
speed?: number;
|
|
1435
|
+
language?: string;
|
|
1646
1436
|
}): Promise<void>;
|
|
1647
1437
|
/** Audio-only speak: TTS → resample → AudioScheduler (no blendshapes). */
|
|
1648
1438
|
private speakAudioOnly;
|
|
@@ -1662,13 +1452,20 @@ declare class TTSSpeaker {
|
|
|
1662
1452
|
streamText(options: {
|
|
1663
1453
|
signal?: AbortSignal;
|
|
1664
1454
|
voice?: string;
|
|
1455
|
+
speed?: number;
|
|
1456
|
+
language?: string;
|
|
1665
1457
|
}): Promise<{
|
|
1666
1458
|
push: (token: string) => void;
|
|
1667
1459
|
end: () => Promise<void>;
|
|
1668
1460
|
}>;
|
|
1669
1461
|
/** streamText in audio-only mode: TTS → AudioScheduler (no blendshapes). */
|
|
1670
1462
|
private streamTextAudioOnly;
|
|
1671
|
-
/**
|
|
1463
|
+
/**
|
|
1464
|
+
* Warm up AudioContext for iOS/Safari autoplay policy.
|
|
1465
|
+
* Call from a user gesture handler (click/tap) before speak().
|
|
1466
|
+
*/
|
|
1467
|
+
warmup(): Promise<void>;
|
|
1468
|
+
/** Abort current speak if any. Triggers neutral transition on PlaybackPipeline. */
|
|
1672
1469
|
stop(): void;
|
|
1673
1470
|
/** Clean teardown of all owned resources. */
|
|
1674
1471
|
dispose(): Promise<void>;
|
|
@@ -1704,11 +1501,13 @@ interface CreateTTSPlayerConfig {
|
|
|
1704
1501
|
modelUrl?: string;
|
|
1705
1502
|
/** Voice data base URL override */
|
|
1706
1503
|
voiceBaseUrl?: string;
|
|
1504
|
+
/** Shared unified worker (created automatically if not provided) */
|
|
1505
|
+
unifiedWorker?: UnifiedInferenceWorker;
|
|
1707
1506
|
}
|
|
1708
1507
|
/**
|
|
1709
1508
|
* Zero-config TTS player. Speak text through speakers without an avatar.
|
|
1710
1509
|
*
|
|
1711
|
-
* Uses Kokoro TTS (82M q8, ~92MB) with automatic worker
|
|
1510
|
+
* Uses Kokoro TTS (82M q8, ~92MB) with automatic worker creation.
|
|
1712
1511
|
* No LAM model is downloaded — audio plays directly through AudioScheduler.
|
|
1713
1512
|
*/
|
|
1714
1513
|
declare function createTTSPlayer(config?: CreateTTSPlayerConfig): TTSPlayer;
|
|
@@ -1717,254 +1516,27 @@ declare function createTTSPlayer(config?: CreateTTSPlayerConfig): TTSPlayer;
|
|
|
1717
1516
|
*/
|
|
1718
1517
|
declare class TTSPlayer extends TTSSpeaker {
|
|
1719
1518
|
private backend;
|
|
1720
|
-
|
|
1519
|
+
private ttsWorker;
|
|
1520
|
+
private ttsPlayerUsesSharedWorker;
|
|
1521
|
+
private ttsConfig;
|
|
1522
|
+
constructor(config?: CreateTTSPlayerConfig);
|
|
1721
1523
|
/** Load TTS model and connect in audio-only mode. */
|
|
1722
1524
|
load(): Promise<void>;
|
|
1723
1525
|
/** Whether the TTS model is loaded and ready. */
|
|
1724
1526
|
get isLoaded(): boolean;
|
|
1527
|
+
dispose(): Promise<void>;
|
|
1725
1528
|
}
|
|
1726
1529
|
|
|
1727
1530
|
/**
|
|
1728
|
-
*
|
|
1729
|
-
*
|
|
1730
|
-
* Provides a unified API that automatically selects the optimal implementation:
|
|
1731
|
-
* - Worker supported: Uses SenseVoiceWorker (off-main-thread inference)
|
|
1732
|
-
* - Worker unsupported: Uses SenseVoiceInference (main thread)
|
|
1733
|
-
*
|
|
1734
|
-
* @category Inference
|
|
1735
|
-
*
|
|
1736
|
-
* @example Auto-detect (recommended)
|
|
1737
|
-
* ```typescript
|
|
1738
|
-
* import { createSenseVoice } from '@omote/core';
|
|
1531
|
+
* SpeechListener — Standalone listening primitive.
|
|
1739
1532
|
*
|
|
1740
|
-
*
|
|
1741
|
-
*
|
|
1742
|
-
* });
|
|
1743
|
-
* await asr.load();
|
|
1744
|
-
* const { text, emotion } = await asr.transcribe(audioSamples);
|
|
1745
|
-
* ```
|
|
1533
|
+
* Composes: MicrophoneCapture → SileroVAD → SenseVoice ASR → transcript events.
|
|
1534
|
+
* Used independently or alongside TTSSpeaker and VoiceOrchestrator.
|
|
1746
1535
|
*
|
|
1747
|
-
*
|
|
1748
|
-
*
|
|
1749
|
-
* const asr = createSenseVoice({
|
|
1750
|
-
* modelUrl: '/models/sensevoice/model.int8.onnx',
|
|
1751
|
-
* useWorker: true,
|
|
1752
|
-
* });
|
|
1753
|
-
* ```
|
|
1536
|
+
* Does NOT handle TTS, LAM, or response routing — those belong to TTSSpeaker
|
|
1537
|
+
* and VoiceOrchestrator respectively.
|
|
1754
1538
|
*
|
|
1755
|
-
* @
|
|
1756
|
-
* ```typescript
|
|
1757
|
-
* const asr = createSenseVoice({
|
|
1758
|
-
* modelUrl: '/models/sensevoice/model.int8.onnx',
|
|
1759
|
-
* useWorker: false,
|
|
1760
|
-
* });
|
|
1761
|
-
* ```
|
|
1762
|
-
*/
|
|
1763
|
-
|
|
1764
|
-
/**
|
|
1765
|
-
* Common interface for both SenseVoiceInference and SenseVoiceWorker
|
|
1766
|
-
*/
|
|
1767
|
-
interface SenseVoiceBackend {
|
|
1768
|
-
/** Whether the model is loaded and ready for inference */
|
|
1769
|
-
readonly isLoaded: boolean;
|
|
1770
|
-
/** Current backend type ('wasm' for worker, full backend for main thread, null if not loaded) */
|
|
1771
|
-
readonly backend: 'wasm' | 'webgpu' | null;
|
|
1772
|
-
/**
|
|
1773
|
-
* Load the ONNX model
|
|
1774
|
-
* @param onProgress - Optional progress callback (fires once at 100% for worker)
|
|
1775
|
-
* @returns Model loading information
|
|
1776
|
-
*/
|
|
1777
|
-
load(onProgress?: (loaded: number, total: number) => void): Promise<SenseVoiceModelInfo>;
|
|
1778
|
-
/**
|
|
1779
|
-
* Transcribe audio samples to text
|
|
1780
|
-
* @param audioSamples - Float32Array of audio samples at 16kHz
|
|
1781
|
-
* @returns Transcription result
|
|
1782
|
-
*/
|
|
1783
|
-
transcribe(audioSamples: Float32Array): Promise<SenseVoiceResult>;
|
|
1784
|
-
/**
|
|
1785
|
-
* Dispose of the model and free resources
|
|
1786
|
-
*/
|
|
1787
|
-
dispose(): Promise<void>;
|
|
1788
|
-
}
|
|
1789
|
-
/**
|
|
1790
|
-
* Configuration for the SenseVoice factory
|
|
1791
|
-
*/
|
|
1792
|
-
interface CreateSenseVoiceConfig extends InferenceFactoryConfig {
|
|
1793
|
-
/** Path or URL to model.int8.onnx (239MB). Default: HuggingFace CDN */
|
|
1794
|
-
modelUrl?: string;
|
|
1795
|
-
/** Path or URL to tokens.txt vocabulary file (default: sibling of modelUrl) */
|
|
1796
|
-
tokensUrl?: string;
|
|
1797
|
-
/** Language hint (default: 'auto') */
|
|
1798
|
-
language?: SenseVoiceLanguage;
|
|
1799
|
-
/** Text normalization (default: 'with_itn') */
|
|
1800
|
-
textNorm?: 'with_itn' | 'without_itn';
|
|
1801
|
-
}
|
|
1802
|
-
/**
|
|
1803
|
-
* Create a SenseVoice ASR instance with automatic implementation selection
|
|
1804
|
-
*
|
|
1805
|
-
* @param config - Factory configuration
|
|
1806
|
-
* @returns A SenseVoiceBackend instance (either Worker or main thread)
|
|
1807
|
-
*/
|
|
1808
|
-
declare function createSenseVoice(config?: CreateSenseVoiceConfig): SenseVoiceBackend;
|
|
1809
|
-
|
|
1810
|
-
/**
|
|
1811
|
-
* Factory function for Silero VAD with automatic Worker vs main thread selection
|
|
1812
|
-
*
|
|
1813
|
-
* Provides a unified API that automatically selects the optimal implementation:
|
|
1814
|
-
* - Desktop browsers: Uses SileroVADWorker (off-main-thread inference)
|
|
1815
|
-
* - Mobile devices: Uses SileroVADInference (main thread, avoids memory overhead)
|
|
1816
|
-
* - Fallback: Gracefully falls back to main thread if Worker fails
|
|
1817
|
-
*
|
|
1818
|
-
* @category Inference
|
|
1819
|
-
*
|
|
1820
|
-
* @example Basic usage (auto-detect)
|
|
1821
|
-
* ```typescript
|
|
1822
|
-
* import { createSileroVAD } from '@omote/core';
|
|
1823
|
-
*
|
|
1824
|
-
* const vad = createSileroVAD({
|
|
1825
|
-
* modelUrl: '/models/silero-vad.onnx',
|
|
1826
|
-
* threshold: 0.5,
|
|
1827
|
-
* });
|
|
1828
|
-
*
|
|
1829
|
-
* await vad.load();
|
|
1830
|
-
* const result = await vad.process(audioChunk);
|
|
1831
|
-
* if (result.isSpeech) {
|
|
1832
|
-
* console.log('Speech detected!', result.probability);
|
|
1833
|
-
* }
|
|
1834
|
-
* ```
|
|
1835
|
-
*
|
|
1836
|
-
* @example Force worker usage
|
|
1837
|
-
* ```typescript
|
|
1838
|
-
* const vad = createSileroVAD({
|
|
1839
|
-
* modelUrl: '/models/silero-vad.onnx',
|
|
1840
|
-
* useWorker: true, // Force Worker even on mobile
|
|
1841
|
-
* });
|
|
1842
|
-
* ```
|
|
1843
|
-
*
|
|
1844
|
-
* @example Force main thread
|
|
1845
|
-
* ```typescript
|
|
1846
|
-
* const vad = createSileroVAD({
|
|
1847
|
-
* modelUrl: '/models/silero-vad.onnx',
|
|
1848
|
-
* useWorker: false, // Force main thread
|
|
1849
|
-
* });
|
|
1850
|
-
* ```
|
|
1851
|
-
*/
|
|
1852
|
-
|
|
1853
|
-
/**
|
|
1854
|
-
* Common interface for both SileroVADInference and SileroVADWorker
|
|
1855
|
-
*
|
|
1856
|
-
* This interface defines the shared API that both implementations provide,
|
|
1857
|
-
* allowing consumers to use either interchangeably.
|
|
1858
|
-
*/
|
|
1859
|
-
interface SileroVADBackend {
|
|
1860
|
-
/** Current backend type (webgpu, wasm, or null if not loaded) */
|
|
1861
|
-
readonly backend: RuntimeBackend | null;
|
|
1862
|
-
/** Whether the model is loaded and ready for inference */
|
|
1863
|
-
readonly isLoaded: boolean;
|
|
1864
|
-
/** Audio sample rate (8000 or 16000 Hz) */
|
|
1865
|
-
readonly sampleRate: number;
|
|
1866
|
-
/** Speech detection threshold (0-1) */
|
|
1867
|
-
readonly threshold: number;
|
|
1868
|
-
/**
|
|
1869
|
-
* Load the ONNX model
|
|
1870
|
-
* @returns Model loading information
|
|
1871
|
-
*/
|
|
1872
|
-
load(): Promise<VADModelInfo | VADWorkerModelInfo>;
|
|
1873
|
-
/**
|
|
1874
|
-
* Process a single audio chunk
|
|
1875
|
-
* @param audioChunk - Float32Array of exactly chunkSize samples
|
|
1876
|
-
* @returns VAD result with speech probability
|
|
1877
|
-
*/
|
|
1878
|
-
process(audioChunk: Float32Array): Promise<VADResult>;
|
|
1879
|
-
/**
|
|
1880
|
-
* Reset state for new audio stream
|
|
1881
|
-
*/
|
|
1882
|
-
reset(): void | Promise<void>;
|
|
1883
|
-
/**
|
|
1884
|
-
* Dispose of the model and free resources
|
|
1885
|
-
*/
|
|
1886
|
-
dispose(): Promise<void>;
|
|
1887
|
-
/**
|
|
1888
|
-
* Get required chunk size in samples
|
|
1889
|
-
*/
|
|
1890
|
-
getChunkSize(): number;
|
|
1891
|
-
/**
|
|
1892
|
-
* Get chunk duration in milliseconds
|
|
1893
|
-
*/
|
|
1894
|
-
getChunkDurationMs(): number;
|
|
1895
|
-
}
|
|
1896
|
-
/**
|
|
1897
|
-
* Configuration for the Silero VAD factory
|
|
1898
|
-
*
|
|
1899
|
-
* Extends SileroVADConfig with worker-specific options.
|
|
1900
|
-
*/
|
|
1901
|
-
interface SileroVADFactoryConfig extends Omit<SileroVADConfig, 'modelUrl'>, InferenceFactoryConfig {
|
|
1902
|
-
/** Path or URL to the ONNX model. Default: HuggingFace CDN */
|
|
1903
|
-
modelUrl?: string;
|
|
1904
|
-
/**
|
|
1905
|
-
* Fallback to main thread on worker errors.
|
|
1906
|
-
*
|
|
1907
|
-
* When true (default), if the Worker fails to load or encounters an error,
|
|
1908
|
-
* the factory will automatically create a main thread instance instead.
|
|
1909
|
-
*
|
|
1910
|
-
* When false, worker errors will propagate as exceptions.
|
|
1911
|
-
*
|
|
1912
|
-
* Default: true
|
|
1913
|
-
*/
|
|
1914
|
-
fallbackOnError?: boolean;
|
|
1915
|
-
}
|
|
1916
|
-
/**
|
|
1917
|
-
* Check if the current environment supports VAD Web Workers
|
|
1918
|
-
*
|
|
1919
|
-
* Requirements:
|
|
1920
|
-
* - Worker constructor must exist
|
|
1921
|
-
* - Blob URL support (for inline worker script)
|
|
1922
|
-
*
|
|
1923
|
-
* @returns true if VAD Worker is supported
|
|
1924
|
-
*/
|
|
1925
|
-
declare function supportsVADWorker(): boolean;
|
|
1926
|
-
/**
|
|
1927
|
-
* Create a Silero VAD instance with automatic implementation selection
|
|
1928
|
-
*
|
|
1929
|
-
* This factory function automatically selects between:
|
|
1930
|
-
* - **SileroVADWorker**: Off-main-thread inference (better for desktop)
|
|
1931
|
-
* - **SileroVADInference**: Main thread inference (better for mobile)
|
|
1932
|
-
*
|
|
1933
|
-
* The selection is based on:
|
|
1934
|
-
* 1. Explicit `useWorker` config (if provided)
|
|
1935
|
-
* 2. Platform detection (mobile vs desktop)
|
|
1936
|
-
* 3. Worker API availability
|
|
1937
|
-
*
|
|
1938
|
-
* Both implementations share the same interface (SileroVADBackend),
|
|
1939
|
-
* so consumers can use either interchangeably.
|
|
1940
|
-
*
|
|
1941
|
-
* @param config - Factory configuration
|
|
1942
|
-
* @returns A SileroVAD instance (either Worker or main thread)
|
|
1943
|
-
*
|
|
1944
|
-
* @example
|
|
1945
|
-
* ```typescript
|
|
1946
|
-
* // Auto-detect (recommended)
|
|
1947
|
-
* const vad = createSileroVAD({ modelUrl: '/models/silero-vad.onnx' });
|
|
1948
|
-
*
|
|
1949
|
-
* // Force Worker
|
|
1950
|
-
* const vadWorker = createSileroVAD({ modelUrl: '/models/silero-vad.onnx', useWorker: true });
|
|
1951
|
-
*
|
|
1952
|
-
* // Force main thread
|
|
1953
|
-
* const vadMain = createSileroVAD({ modelUrl: '/models/silero-vad.onnx', useWorker: false });
|
|
1954
|
-
* ```
|
|
1955
|
-
*/
|
|
1956
|
-
declare function createSileroVAD(config?: SileroVADFactoryConfig): SileroVADBackend;
|
|
1957
|
-
|
|
1958
|
-
/**
|
|
1959
|
-
* SpeechListener — Standalone listening primitive.
|
|
1960
|
-
*
|
|
1961
|
-
* Composes: MicrophoneCapture → SileroVAD → SenseVoice ASR → transcript events.
|
|
1962
|
-
* Extracted from VoicePipeline's listening half so it can be used independently.
|
|
1963
|
-
*
|
|
1964
|
-
* Does NOT handle TTS, LAM, or response routing — those belong to TTSSpeaker
|
|
1965
|
-
* and VoicePipeline respectively.
|
|
1966
|
-
*
|
|
1967
|
-
* @category Audio
|
|
1539
|
+
* @category Audio
|
|
1968
1540
|
*/
|
|
1969
1541
|
|
|
1970
1542
|
interface SpeechListenerConfig {
|
|
@@ -1981,6 +1553,7 @@ interface SpeechListenerConfig {
|
|
|
1981
1553
|
modelUrl: string;
|
|
1982
1554
|
tokensUrl?: string;
|
|
1983
1555
|
language?: string;
|
|
1556
|
+
textNorm?: 'with_itn' | 'without_itn';
|
|
1984
1557
|
};
|
|
1985
1558
|
vad: {
|
|
1986
1559
|
modelUrl: string;
|
|
@@ -2035,6 +1608,7 @@ declare class SpeechListener extends EventEmitter<SpeechListenerEvents> {
|
|
|
2035
1608
|
private asr;
|
|
2036
1609
|
private vad;
|
|
2037
1610
|
private ownedWorker;
|
|
1611
|
+
private usesSharedWorker;
|
|
2038
1612
|
private mic;
|
|
2039
1613
|
private omoteEvents;
|
|
2040
1614
|
private _unsubChunk;
|
|
@@ -2164,240 +1738,80 @@ declare class InterruptionHandler extends EventEmitter<InterruptionEvents> {
|
|
|
2164
1738
|
}
|
|
2165
1739
|
|
|
2166
1740
|
/**
|
|
2167
|
-
* SenseVoice ASR
|
|
2168
|
-
*
|
|
2169
|
-
* Runs SenseVoice speech recognition in a dedicated Web Worker to prevent
|
|
2170
|
-
* main thread blocking. Uses inline worker script (Blob URL pattern) to
|
|
2171
|
-
* avoid separate file deployment.
|
|
2172
|
-
*
|
|
2173
|
-
* Key design decisions:
|
|
2174
|
-
* - WASM backend only (WebGPU doesn't work in Workers)
|
|
2175
|
-
* - All preprocessing (fbank, LFR, CMVN) and CTC decoding inlined in worker
|
|
2176
|
-
* - Audio copied (not transferred) to retain main thread access
|
|
2177
|
-
* - ONNX Runtime loaded from CDN in worker (no bundler complications)
|
|
2178
|
-
* - iOS: model URL passed as string to ORT (avoids 239MB JS heap allocation)
|
|
1741
|
+
* Factory function for SenseVoice ASR via UnifiedInferenceWorker
|
|
2179
1742
|
*
|
|
2180
1743
|
* @category Inference
|
|
2181
1744
|
*
|
|
2182
|
-
* @example
|
|
1745
|
+
* @example
|
|
2183
1746
|
* ```typescript
|
|
2184
|
-
* import {
|
|
1747
|
+
* import { createSenseVoice, UnifiedInferenceWorker } from '@omote/core';
|
|
2185
1748
|
*
|
|
2186
|
-
* const
|
|
1749
|
+
* const worker = new UnifiedInferenceWorker();
|
|
1750
|
+
* await worker.init();
|
|
1751
|
+
*
|
|
1752
|
+
* const asr = createSenseVoice({
|
|
2187
1753
|
* modelUrl: '/models/sensevoice/model.int8.onnx',
|
|
2188
|
-
*
|
|
1754
|
+
* unifiedWorker: worker,
|
|
2189
1755
|
* });
|
|
2190
1756
|
* await asr.load();
|
|
2191
|
-
*
|
|
2192
|
-
* const { text, emotion, language } = await asr.transcribe(audioSamples);
|
|
2193
|
-
* console.log(text); // "Hello world"
|
|
2194
|
-
* console.log(emotion); // "NEUTRAL"
|
|
2195
|
-
* console.log(language); // "en"
|
|
1757
|
+
* const { text, emotion } = await asr.transcribe(audioSamples);
|
|
2196
1758
|
* ```
|
|
2197
1759
|
*/
|
|
2198
1760
|
|
|
2199
1761
|
/**
|
|
2200
|
-
* Configuration for SenseVoice
|
|
1762
|
+
* Configuration for the SenseVoice factory
|
|
2201
1763
|
*/
|
|
2202
|
-
interface
|
|
2203
|
-
/** Path or URL to model.int8.onnx (239MB) */
|
|
2204
|
-
modelUrl
|
|
1764
|
+
interface CreateSenseVoiceConfig extends InferenceFactoryConfig {
|
|
1765
|
+
/** Path or URL to model.int8.onnx (239MB). Default: HuggingFace CDN */
|
|
1766
|
+
modelUrl?: string;
|
|
2205
1767
|
/** Path or URL to tokens.txt vocabulary file (default: sibling of modelUrl) */
|
|
2206
1768
|
tokensUrl?: string;
|
|
2207
|
-
/** Language hint (default: 'auto'
|
|
1769
|
+
/** Language hint (default: 'auto') */
|
|
2208
1770
|
language?: SenseVoiceLanguage;
|
|
2209
|
-
/** Text normalization
|
|
1771
|
+
/** Text normalization (default: 'with_itn') */
|
|
2210
1772
|
textNorm?: 'with_itn' | 'without_itn';
|
|
2211
1773
|
}
|
|
2212
1774
|
/**
|
|
2213
|
-
* SenseVoice ASR
|
|
1775
|
+
* Create a SenseVoice ASR instance via the unified worker.
|
|
2214
1776
|
*
|
|
2215
|
-
*
|
|
2216
|
-
* All preprocessing (fbank, LFR, CMVN) and CTC decoding run in the worker.
|
|
1777
|
+
* If no `unifiedWorker` is provided, a dedicated worker is created on load().
|
|
2217
1778
|
*
|
|
2218
|
-
* @
|
|
1779
|
+
* @param config - Factory configuration
|
|
1780
|
+
* @returns A SenseVoiceBackend instance
|
|
2219
1781
|
*/
|
|
2220
|
-
declare
|
|
2221
|
-
private worker;
|
|
2222
|
-
private config;
|
|
2223
|
-
private isLoading;
|
|
2224
|
-
private _isLoaded;
|
|
2225
|
-
private inferenceQueue;
|
|
2226
|
-
private poisoned;
|
|
2227
|
-
private pendingResolvers;
|
|
2228
|
-
private languageId;
|
|
2229
|
-
private textNormId;
|
|
2230
|
-
constructor(config: SenseVoiceWorkerConfig);
|
|
2231
|
-
get isLoaded(): boolean;
|
|
2232
|
-
/**
|
|
2233
|
-
* Backend type (always 'wasm' for Worker, WebGPU not supported in Workers)
|
|
2234
|
-
*/
|
|
2235
|
-
get backend(): 'wasm' | null;
|
|
2236
|
-
/**
|
|
2237
|
-
* Create the worker from inline script
|
|
2238
|
-
*/
|
|
2239
|
-
private createWorker;
|
|
2240
|
-
/**
|
|
2241
|
-
* Handle messages from worker
|
|
2242
|
-
*/
|
|
2243
|
-
private handleWorkerMessage;
|
|
2244
|
-
/**
|
|
2245
|
-
* Send message to worker and wait for response
|
|
2246
|
-
*/
|
|
2247
|
-
private sendMessage;
|
|
2248
|
-
/**
|
|
2249
|
-
* Load the ONNX model in the worker
|
|
2250
|
-
*
|
|
2251
|
-
* @param onProgress - Optional progress callback. Fires once at 100% when load completes
|
|
2252
|
-
* (the worker downloads and loads the model internally, so granular progress is not available).
|
|
2253
|
-
*/
|
|
2254
|
-
load(onProgress?: (loaded: number, total: number) => void): Promise<SenseVoiceModelInfo>;
|
|
2255
|
-
/**
|
|
2256
|
-
* Transcribe audio samples to text
|
|
2257
|
-
*
|
|
2258
|
-
* @param audioSamples Float32Array of audio samples at 16kHz, [-1, 1] range
|
|
2259
|
-
* @returns Transcription result with text, emotion, language, and event
|
|
2260
|
-
*/
|
|
2261
|
-
transcribe(audioSamples: Float32Array): Promise<SenseVoiceResult>;
|
|
2262
|
-
/**
|
|
2263
|
-
* Queue inference to serialize worker calls
|
|
2264
|
-
*/
|
|
2265
|
-
private queueInference;
|
|
2266
|
-
/**
|
|
2267
|
-
* Dispose of the worker and free resources
|
|
2268
|
-
*/
|
|
2269
|
-
dispose(): Promise<void>;
|
|
2270
|
-
/**
|
|
2271
|
-
* Check if Web Workers are supported
|
|
2272
|
-
*/
|
|
2273
|
-
static isSupported(): boolean;
|
|
2274
|
-
}
|
|
1782
|
+
declare function createSenseVoice(config?: CreateSenseVoiceConfig): SenseVoiceBackend;
|
|
2275
1783
|
|
|
2276
1784
|
/**
|
|
2277
1785
|
* Shared blendshape constants and utilities for lip sync inference
|
|
2278
1786
|
*
|
|
2279
1787
|
* Contains ARKIT_BLENDSHAPES (canonical 52-blendshape ordering), symmetrization,
|
|
2280
|
-
* and interpolation utilities used by A2EInference and all renderer adapters.
|
|
2281
|
-
*
|
|
2282
|
-
* This module is the single source of truth for blendshape ordering to
|
|
2283
|
-
* avoid circular dependencies between inference classes.
|
|
2284
|
-
*
|
|
2285
|
-
* @category Inference
|
|
2286
|
-
*/
|
|
2287
|
-
/**
|
|
2288
|
-
* ARKit blendshape names in alphabetical order (52 total)
|
|
2289
|
-
* This is the canonical ordering used by all A2E models in the SDK.
|
|
2290
|
-
*/
|
|
2291
|
-
declare const ARKIT_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
|
|
2292
|
-
/** @deprecated Use ARKIT_BLENDSHAPES instead */
|
|
2293
|
-
declare const LAM_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
|
|
2294
|
-
/**
|
|
2295
|
-
* Linearly interpolate between two blendshape weight arrays.
|
|
2296
|
-
*
|
|
2297
|
-
* Pure math utility with zero renderer dependency — used by all renderer
|
|
2298
|
-
* adapters (@omote/three, @omote/babylon, @omote/r3f) for smooth frame
|
|
2299
|
-
* transitions.
|
|
2300
|
-
*
|
|
2301
|
-
* @param current - Current blendshape weights
|
|
2302
|
-
* @param target - Target blendshape weights
|
|
2303
|
-
* @param factor - Interpolation factor (0 = no change, 1 = snap to target). Default: 0.3
|
|
2304
|
-
* @returns Interpolated weights as number[]
|
|
2305
|
-
*/
|
|
2306
|
-
declare function lerpBlendshapes(current: Float32Array | number[], target: Float32Array | number[], factor?: number): number[];
|
|
2307
|
-
|
|
2308
|
-
/**
|
|
2309
|
-
* A2E inference engine for Audio-to-Expression (LAM model)
|
|
2310
|
-
*
|
|
2311
|
-
* Runs entirely in the browser using WebGPU or WASM.
|
|
2312
|
-
* Takes raw 16kHz audio and outputs 52 ARKit blendshapes for lip sync.
|
|
2313
|
-
* Uses the LAM (Large Animation Model) — see {@link A2EBackend} for the interface.
|
|
2314
|
-
*
|
|
2315
|
-
* @see {@link createA2E} for the recommended zero-config factory
|
|
2316
|
-
* @see {@link A2EBackend} for the common interface
|
|
2317
|
-
* @category Inference
|
|
2318
|
-
*
|
|
2319
|
-
* @example Basic usage
|
|
2320
|
-
* ```typescript
|
|
2321
|
-
* import { A2EInference } from '@omote/core';
|
|
2322
|
-
*
|
|
2323
|
-
* const a2e = new A2EInference({ modelUrl: '/models/lam.onnx' });
|
|
2324
|
-
* await a2e.load();
|
|
2325
|
-
*
|
|
2326
|
-
* // Process 1 second of audio (16kHz = 16000 samples)
|
|
2327
|
-
* const result = await a2e.infer(audioSamples);
|
|
2328
|
-
* console.log('Blendshapes:', result.blendshapes); // [30, 52] for 30fps
|
|
2329
|
-
* ```
|
|
2330
|
-
*/
|
|
2331
|
-
|
|
2332
|
-
interface A2EInferenceConfig {
|
|
2333
|
-
/** Path or URL to the ONNX model */
|
|
2334
|
-
modelUrl: string;
|
|
2335
|
-
/**
|
|
2336
|
-
* Path or URL to external model data file (.onnx.data weights).
|
|
2337
|
-
* Default: `${modelUrl}.data` (e.g., /models/model.onnx.data)
|
|
2338
|
-
*
|
|
2339
|
-
* Set to `false` to skip external data loading (single-file models only).
|
|
2340
|
-
*/
|
|
2341
|
-
externalDataUrl?: string | false;
|
|
2342
|
-
/** Preferred backend (auto will try WebGPU first, fallback to WASM) */
|
|
2343
|
-
backend?: BackendPreference;
|
|
2344
|
-
/** Number of identity classes (default: 12 for streaming model) */
|
|
2345
|
-
numIdentityClasses?: number;
|
|
2346
|
-
/**
|
|
2347
|
-
* Number of audio samples per inference chunk (default: 16000).
|
|
2348
|
-
* Model supports variable chunk sizes. Smaller chunks = lower latency,
|
|
2349
|
-
* more inference overhead. 8000 (500ms) is recommended for real-time lip sync.
|
|
2350
|
-
*/
|
|
2351
|
-
chunkSize?: number;
|
|
2352
|
-
}
|
|
2353
|
-
|
|
2354
|
-
declare class A2EInference implements A2EBackend {
|
|
2355
|
-
readonly modelId: "a2e";
|
|
2356
|
-
private session;
|
|
2357
|
-
private ort;
|
|
2358
|
-
private config;
|
|
2359
|
-
private _backend;
|
|
2360
|
-
private isLoading;
|
|
2361
|
-
private numIdentityClasses;
|
|
2362
|
-
readonly chunkSize: number;
|
|
2363
|
-
private inferenceQueue;
|
|
2364
|
-
private poisoned;
|
|
2365
|
-
private static readonly INFERENCE_TIMEOUT_MS;
|
|
2366
|
-
constructor(config: A2EInferenceConfig);
|
|
2367
|
-
/**
|
|
2368
|
-
* Check if WebGPU is available and working
|
|
2369
|
-
* (iOS returns false even if navigator.gpu exists due to ONNX Runtime bugs)
|
|
2370
|
-
*/
|
|
2371
|
-
static isWebGPUAvailable: typeof isWebGPUAvailable;
|
|
2372
|
-
get backend(): 'webgpu' | 'wasm' | null;
|
|
2373
|
-
get isLoaded(): boolean;
|
|
2374
|
-
/** True if inference timed out and the session is permanently unusable */
|
|
2375
|
-
get isSessionPoisoned(): boolean;
|
|
2376
|
-
/**
|
|
2377
|
-
* Load the ONNX model
|
|
2378
|
-
*/
|
|
2379
|
-
load(): Promise<A2EModelInfo>;
|
|
2380
|
-
/**
|
|
2381
|
-
* Run inference on raw audio
|
|
2382
|
-
* @param audioSamples - Float32Array of raw audio at 16kHz
|
|
2383
|
-
* @param identityIndex - Optional identity index (0-11, default 0 = neutral)
|
|
2384
|
-
*
|
|
2385
|
-
* Audio will be zero-padded or truncated to chunkSize samples.
|
|
2386
|
-
*/
|
|
2387
|
-
infer(audioSamples: Float32Array, identityIndex?: number): Promise<A2EResult>;
|
|
2388
|
-
/**
|
|
2389
|
-
* Queue inference to serialize ONNX session calls
|
|
2390
|
-
*/
|
|
2391
|
-
private queueInference;
|
|
2392
|
-
/**
|
|
2393
|
-
* Get blendshape value by name for a specific frame
|
|
2394
|
-
*/
|
|
2395
|
-
getBlendshape(blendshapes: Float32Array, name: typeof LAM_BLENDSHAPES[number]): number;
|
|
2396
|
-
/**
|
|
2397
|
-
* Dispose of the model and free resources
|
|
2398
|
-
*/
|
|
2399
|
-
dispose(): Promise<void>;
|
|
2400
|
-
}
|
|
1788
|
+
* and interpolation utilities used by A2EInference and all renderer adapters.
|
|
1789
|
+
*
|
|
1790
|
+
* This module is the single source of truth for blendshape ordering to
|
|
1791
|
+
* avoid circular dependencies between inference classes.
|
|
1792
|
+
*
|
|
1793
|
+
* @category Inference
|
|
1794
|
+
*/
|
|
1795
|
+
/**
|
|
1796
|
+
* ARKit blendshape names in alphabetical order (52 total)
|
|
1797
|
+
* This is the canonical ordering used by all A2E models in the SDK.
|
|
1798
|
+
*/
|
|
1799
|
+
declare const ARKIT_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
|
|
1800
|
+
/** @deprecated Use ARKIT_BLENDSHAPES instead */
|
|
1801
|
+
declare const LAM_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
|
|
1802
|
+
/**
|
|
1803
|
+
* Linearly interpolate between two blendshape weight arrays.
|
|
1804
|
+
*
|
|
1805
|
+
* Pure math utility with zero renderer dependency — used by all renderer
|
|
1806
|
+
* adapters (@omote/three, @omote/babylon, @omote/r3f) for smooth frame
|
|
1807
|
+
* transitions.
|
|
1808
|
+
*
|
|
1809
|
+
* @param current - Current blendshape weights
|
|
1810
|
+
* @param target - Target blendshape weights
|
|
1811
|
+
* @param factor - Interpolation factor (0 = no change, 1 = snap to target). Default: 0.3
|
|
1812
|
+
* @returns Interpolated weights as number[]
|
|
1813
|
+
*/
|
|
1814
|
+
declare function lerpBlendshapes(current: Float32Array | number[], target: Float32Array | number[], factor?: number): number[];
|
|
2401
1815
|
|
|
2402
1816
|
/**
|
|
2403
1817
|
* Default and user-configurable model URLs for all ONNX models
|
|
@@ -2434,7 +1848,7 @@ type ModelUrlKey = 'lam' | 'senseVoice' | 'sileroVad' | 'kokoroTTS' | 'kokoroVoi
|
|
|
2434
1848
|
* Resolved model URLs — user overrides take priority, HuggingFace CDN is fallback.
|
|
2435
1849
|
*
|
|
2436
1850
|
* All SDK factories (`createA2E`, `createSenseVoice`, `createSileroVAD`) and
|
|
2437
|
-
* orchestrators (`
|
|
1851
|
+
* orchestrators (`VoiceOrchestrator`) read from this object. Call
|
|
2438
1852
|
* {@link configureModelUrls} before constructing any pipelines to point
|
|
2439
1853
|
* models at your own CDN.
|
|
2440
1854
|
*/
|
|
@@ -2704,6 +2118,44 @@ declare class BlendshapeSmoother {
|
|
|
2704
2118
|
reset(): void;
|
|
2705
2119
|
}
|
|
2706
2120
|
|
|
2121
|
+
/**
|
|
2122
|
+
* Factory function for Silero VAD via UnifiedInferenceWorker
|
|
2123
|
+
*
|
|
2124
|
+
* @category Inference
|
|
2125
|
+
*
|
|
2126
|
+
* @example
|
|
2127
|
+
* ```typescript
|
|
2128
|
+
* import { createSileroVAD, UnifiedInferenceWorker } from '@omote/core';
|
|
2129
|
+
*
|
|
2130
|
+
* const worker = new UnifiedInferenceWorker();
|
|
2131
|
+
* await worker.init();
|
|
2132
|
+
*
|
|
2133
|
+
* const vad = createSileroVAD({
|
|
2134
|
+
* modelUrl: '/models/silero-vad.onnx',
|
|
2135
|
+
* unifiedWorker: worker,
|
|
2136
|
+
* });
|
|
2137
|
+
* await vad.load();
|
|
2138
|
+
* const result = await vad.process(audioChunk);
|
|
2139
|
+
* ```
|
|
2140
|
+
*/
|
|
2141
|
+
|
|
2142
|
+
/**
|
|
2143
|
+
* Configuration for the Silero VAD factory
|
|
2144
|
+
*/
|
|
2145
|
+
interface SileroVADFactoryConfig extends Omit<SileroVADConfig, 'modelUrl'>, InferenceFactoryConfig {
|
|
2146
|
+
/** Path or URL to the ONNX model. Default: HuggingFace CDN */
|
|
2147
|
+
modelUrl?: string;
|
|
2148
|
+
}
|
|
2149
|
+
/**
|
|
2150
|
+
* Create a Silero VAD instance via the unified worker.
|
|
2151
|
+
*
|
|
2152
|
+
* If no `unifiedWorker` is provided, a dedicated worker is created on load().
|
|
2153
|
+
*
|
|
2154
|
+
* @param config - Factory configuration
|
|
2155
|
+
* @returns A SileroVADBackend instance
|
|
2156
|
+
*/
|
|
2157
|
+
declare function createSileroVAD(config?: SileroVADFactoryConfig): SileroVADBackend;
|
|
2158
|
+
|
|
2707
2159
|
/**
|
|
2708
2160
|
* SenseVoice adapter backed by UnifiedInferenceWorker
|
|
2709
2161
|
*
|
|
@@ -2762,34 +2214,9 @@ declare class A2EUnifiedAdapter implements A2EBackend {
|
|
|
2762
2214
|
}
|
|
2763
2215
|
|
|
2764
2216
|
/**
|
|
2765
|
-
* Kokoro TTS
|
|
2766
|
-
*
|
|
2767
|
-
* Pure ONNX pipeline for browser-based text-to-speech. No transformers.js dependency.
|
|
2768
|
-
* Uses eSpeak-NG WASM for phonemization and Kokoro-82M (q8, 92MB) for synthesis.
|
|
2769
|
-
*
|
|
2770
|
-
* Pipeline: Text → Normalize → Phonemize (eSpeak WASM) → Tokenize → Voice Style → ONNX → Audio
|
|
2217
|
+
* Kokoro TTS type definitions
|
|
2771
2218
|
*
|
|
2772
2219
|
* @category Inference
|
|
2773
|
-
*
|
|
2774
|
-
* @example Basic usage
|
|
2775
|
-
* ```typescript
|
|
2776
|
-
* import { KokoroTTSInference } from '@omote/core';
|
|
2777
|
-
*
|
|
2778
|
-
* const tts = new KokoroTTSInference({ defaultVoice: 'af_heart' });
|
|
2779
|
-
* await tts.load();
|
|
2780
|
-
*
|
|
2781
|
-
* const { audio, duration } = await tts.synthesize("Hello world");
|
|
2782
|
-
* // audio: Float32Array @ 24kHz
|
|
2783
|
-
* ```
|
|
2784
|
-
*
|
|
2785
|
-
* @example Streaming (sentence-by-sentence)
|
|
2786
|
-
* ```typescript
|
|
2787
|
-
* for await (const chunk of tts.stream("First sentence. Second sentence.")) {
|
|
2788
|
-
* playbackPipeline.feedBuffer(chunk.audio);
|
|
2789
|
-
* }
|
|
2790
|
-
* ```
|
|
2791
|
-
*
|
|
2792
|
-
* @module inference/KokoroTTSInference
|
|
2793
2220
|
*/
|
|
2794
2221
|
|
|
2795
2222
|
interface KokoroTTSConfig {
|
|
@@ -2803,6 +2230,8 @@ interface KokoroTTSConfig {
|
|
|
2803
2230
|
backend?: BackendPreference;
|
|
2804
2231
|
/** Speech speed multiplier (default: 1.0) */
|
|
2805
2232
|
speed?: number;
|
|
2233
|
+
/** Eagerly load phonemizer + default voice during load() instead of first speak(). Default: true. */
|
|
2234
|
+
eagerLoad?: boolean;
|
|
2806
2235
|
}
|
|
2807
2236
|
interface KokoroTTSResult {
|
|
2808
2237
|
/** Audio samples at 24kHz */
|
|
@@ -2841,67 +2270,6 @@ interface SynthesizeOptions {
|
|
|
2841
2270
|
* Returns trimmed text on success, throws on invalid input.
|
|
2842
2271
|
*/
|
|
2843
2272
|
declare function validateTTSInput(text: unknown, voiceName: string, speed: number, availableVoices?: string[]): string;
|
|
2844
|
-
declare class KokoroTTSInference implements TTSBackend {
|
|
2845
|
-
private readonly config;
|
|
2846
|
-
private readonly modelUrl;
|
|
2847
|
-
private readonly voiceBaseUrl;
|
|
2848
|
-
private ort;
|
|
2849
|
-
private session;
|
|
2850
|
-
private _backend;
|
|
2851
|
-
private isLoading;
|
|
2852
|
-
private poisoned;
|
|
2853
|
-
private inferenceQueue;
|
|
2854
|
-
private phonemizerReady;
|
|
2855
|
-
private defaultVoiceLoaded;
|
|
2856
|
-
/** Cached voice data (voice name → Float32Array) */
|
|
2857
|
-
private loadedVoices;
|
|
2858
|
-
constructor(config?: KokoroTTSConfig);
|
|
2859
|
-
get isLoaded(): boolean;
|
|
2860
|
-
get sampleRate(): number;
|
|
2861
|
-
/**
|
|
2862
|
-
* Load the ONNX model, phonemizer WASM, and default voice.
|
|
2863
|
-
* Safe to call multiple times (no-ops after first successful load).
|
|
2864
|
-
*/
|
|
2865
|
-
load(): Promise<KokoroTTSModelInfo>;
|
|
2866
|
-
/**
|
|
2867
|
-
* Lazily initialize phonemizer and default voice on first use.
|
|
2868
|
-
* Moves 100-200ms of main-thread blocking out of load() into first synthesis.
|
|
2869
|
-
*/
|
|
2870
|
-
private ensureReady;
|
|
2871
|
-
/**
|
|
2872
|
-
* Synthesize speech from text (one-shot, full audio output).
|
|
2873
|
-
*
|
|
2874
|
-
* @param text - Input text to synthesize
|
|
2875
|
-
* @param options - Voice and speed overrides
|
|
2876
|
-
* @returns Audio Float32Array at 24kHz with duration
|
|
2877
|
-
*/
|
|
2878
|
-
synthesize(text: string, options?: SynthesizeOptions): Promise<KokoroTTSResult>;
|
|
2879
|
-
/**
|
|
2880
|
-
* Stream synthesis sentence-by-sentence (async generator).
|
|
2881
|
-
* Splits text on sentence boundaries and yields audio for each.
|
|
2882
|
-
*
|
|
2883
|
-
* Compatible with both `SynthesizeOptions` (legacy) and `TTSStreamOptions` (TTSBackend).
|
|
2884
|
-
*
|
|
2885
|
-
* @param text - Input text (can be multiple sentences)
|
|
2886
|
-
* @param options - Voice, speed, and abort signal overrides
|
|
2887
|
-
*/
|
|
2888
|
-
stream(text: string, options?: SynthesizeOptions & TTSStreamOptions): AsyncGenerator<KokoroStreamChunk & TTSChunk>;
|
|
2889
|
-
/**
|
|
2890
|
-
* Preload a voice (fetches and caches the .bin file).
|
|
2891
|
-
*/
|
|
2892
|
-
preloadVoice(voiceName: string): Promise<void>;
|
|
2893
|
-
/**
|
|
2894
|
-
* List available voice names.
|
|
2895
|
-
*/
|
|
2896
|
-
listVoices(): string[];
|
|
2897
|
-
/**
|
|
2898
|
-
* Release the ONNX session and clear cached voices.
|
|
2899
|
-
*/
|
|
2900
|
-
dispose(): Promise<void>;
|
|
2901
|
-
private ensureVoice;
|
|
2902
|
-
private queueInference;
|
|
2903
|
-
private runInference;
|
|
2904
|
-
}
|
|
2905
2273
|
|
|
2906
2274
|
/**
|
|
2907
2275
|
* Kokoro TTS adapter backed by UnifiedInferenceWorker
|
|
@@ -2917,6 +2285,7 @@ declare class KokoroTTSUnifiedAdapter implements TTSBackend {
|
|
|
2917
2285
|
private readonly modelUrl;
|
|
2918
2286
|
private readonly voiceBaseUrl;
|
|
2919
2287
|
private _isLoaded;
|
|
2288
|
+
private _backend;
|
|
2920
2289
|
private loadedGeneration;
|
|
2921
2290
|
/** Per-adapter inference queue — ensures sequential state updates. */
|
|
2922
2291
|
private inferenceQueue;
|
|
@@ -3184,102 +2553,15 @@ declare class SafariSpeechRecognition {
|
|
|
3184
2553
|
}
|
|
3185
2554
|
|
|
3186
2555
|
/**
|
|
3187
|
-
* Kokoro TTS
|
|
3188
|
-
*
|
|
3189
|
-
* Moves the heavy ONNX `session.run()` to a dedicated Web Worker to prevent
|
|
3190
|
-
* main thread blocking (~1-2s per sentence on WASM). Phonemizer, tokenizer,
|
|
3191
|
-
* and voice logic stay on the main thread (fast, <10ms combined).
|
|
3192
|
-
*
|
|
3193
|
-
* Architecture:
|
|
3194
|
-
* ```
|
|
3195
|
-
* Main Thread (KokoroTTSWorker): Worker (WORKER_SCRIPT):
|
|
3196
|
-
* stream(text) →
|
|
3197
|
-
* splitSentences(text)
|
|
3198
|
-
* for each sentence:
|
|
3199
|
-
* phonemize(sentence) → phonemes
|
|
3200
|
-
* tokenize(phonemes) → tokens
|
|
3201
|
-
* ensureVoice() → style
|
|
3202
|
-
* postMessage(tokens, style, speed) ──→ session.run(feeds)
|
|
3203
|
-
* await result ←── postMessage(audio)
|
|
3204
|
-
* yield {audio, text, phonemes, duration}
|
|
3205
|
-
* ```
|
|
3206
|
-
*
|
|
3207
|
-
* @category Inference
|
|
3208
|
-
*
|
|
3209
|
-
* @example Basic usage
|
|
3210
|
-
* ```typescript
|
|
3211
|
-
* import { KokoroTTSWorker } from '@omote/core';
|
|
3212
|
-
*
|
|
3213
|
-
* const tts = new KokoroTTSWorker({ defaultVoice: 'af_heart' });
|
|
3214
|
-
* await tts.load();
|
|
3215
|
-
*
|
|
3216
|
-
* for await (const chunk of tts.stream("Hello world!")) {
|
|
3217
|
-
* playbackPipeline.feedBuffer(chunk.audio);
|
|
3218
|
-
* }
|
|
3219
|
-
* ```
|
|
3220
|
-
*
|
|
3221
|
-
* @module inference/KokoroTTSWorker
|
|
3222
|
-
*/
|
|
3223
|
-
|
|
3224
|
-
/**
|
|
3225
|
-
* Kokoro TTS Worker — off-main-thread ONNX inference for non-blocking TTS.
|
|
3226
|
-
*
|
|
3227
|
-
* Phonemizer/tokenizer/voice logic run on the main thread (fast, <10ms).
|
|
3228
|
-
* Only the heavy ONNX `session.run()` is delegated to the worker.
|
|
3229
|
-
*
|
|
3230
|
-
* Implements the same TTSBackend interface as KokoroTTSInference.
|
|
3231
|
-
*
|
|
3232
|
-
* @see KokoroTTSInference for main-thread version
|
|
3233
|
-
*/
|
|
3234
|
-
declare class KokoroTTSWorker implements TTSBackend {
|
|
3235
|
-
private readonly config;
|
|
3236
|
-
private readonly modelUrl;
|
|
3237
|
-
private readonly voiceBaseUrl;
|
|
3238
|
-
private worker;
|
|
3239
|
-
private _isLoaded;
|
|
3240
|
-
private isLoading;
|
|
3241
|
-
private poisoned;
|
|
3242
|
-
/** Serializes all worker calls (stream sentence chunks + synthesize) */
|
|
3243
|
-
private inferenceQueue;
|
|
3244
|
-
/** Cached voice data (voice name → Float32Array) */
|
|
3245
|
-
private loadedVoices;
|
|
3246
|
-
/** Pending message handlers */
|
|
3247
|
-
private pendingResolvers;
|
|
3248
|
-
constructor(config?: KokoroTTSConfig);
|
|
3249
|
-
get isLoaded(): boolean;
|
|
3250
|
-
get sampleRate(): number;
|
|
3251
|
-
load(): Promise<KokoroTTSModelInfo>;
|
|
3252
|
-
synthesize(text: string, options?: SynthesizeOptions): Promise<KokoroTTSResult>;
|
|
3253
|
-
stream(text: string, options?: SynthesizeOptions & TTSStreamOptions): AsyncGenerator<KokoroStreamChunk & TTSChunk>;
|
|
3254
|
-
preloadVoice(voiceName: string): Promise<void>;
|
|
3255
|
-
listVoices(): string[];
|
|
3256
|
-
dispose(): Promise<void>;
|
|
3257
|
-
static isSupported(): boolean;
|
|
3258
|
-
private ensureVoice;
|
|
3259
|
-
private createWorker;
|
|
3260
|
-
private handleWorkerMessage;
|
|
3261
|
-
private sendMessage;
|
|
3262
|
-
/**
|
|
3263
|
-
* Queue worker inference through the serialization queue.
|
|
3264
|
-
* Sends pre-computed tokens + style to worker, returns audio.
|
|
3265
|
-
*/
|
|
3266
|
-
private runWorkerInference;
|
|
3267
|
-
/**
|
|
3268
|
-
* One-shot synthesis (phonemize + tokenize + worker inference).
|
|
3269
|
-
*/
|
|
3270
|
-
private queueInference;
|
|
3271
|
-
}
|
|
3272
|
-
|
|
3273
|
-
/**
|
|
3274
|
-
* Factory function for Kokoro TTS with automatic Worker vs main thread selection
|
|
2556
|
+
* Factory function for Kokoro TTS via UnifiedInferenceWorker
|
|
3275
2557
|
*
|
|
3276
|
-
*
|
|
3277
|
-
*
|
|
3278
|
-
*
|
|
2558
|
+
* When called without a `unifiedWorker`, a dedicated worker is created
|
|
2559
|
+
* automatically on the first `load()` call. Pass a shared worker when using
|
|
2560
|
+
* VoiceOrchestrator or multiple models to avoid extra WASM instances.
|
|
3279
2561
|
*
|
|
3280
2562
|
* @category Inference
|
|
3281
2563
|
*
|
|
3282
|
-
* @example
|
|
2564
|
+
* @example Standalone (auto-creates worker)
|
|
3283
2565
|
* ```typescript
|
|
3284
2566
|
* import { createKokoroTTS } from '@omote/core';
|
|
3285
2567
|
*
|
|
@@ -3291,14 +2573,9 @@ declare class KokoroTTSWorker implements TTSBackend {
|
|
|
3291
2573
|
* }
|
|
3292
2574
|
* ```
|
|
3293
2575
|
*
|
|
3294
|
-
* @example
|
|
2576
|
+
* @example With shared worker
|
|
3295
2577
|
* ```typescript
|
|
3296
|
-
* const tts = createKokoroTTS({ defaultVoice: 'af_heart',
|
|
3297
|
-
* ```
|
|
3298
|
-
*
|
|
3299
|
-
* @example Force main thread
|
|
3300
|
-
* ```typescript
|
|
3301
|
-
* const tts = createKokoroTTS({ defaultVoice: 'af_heart', useWorker: false });
|
|
2578
|
+
* const tts = createKokoroTTS({ defaultVoice: 'af_heart', unifiedWorker: worker });
|
|
3302
2579
|
* ```
|
|
3303
2580
|
*/
|
|
3304
2581
|
|
|
@@ -3308,10 +2585,12 @@ declare class KokoroTTSWorker implements TTSBackend {
|
|
|
3308
2585
|
interface CreateKokoroTTSConfig extends KokoroTTSConfig, InferenceFactoryConfig {
|
|
3309
2586
|
}
|
|
3310
2587
|
/**
|
|
3311
|
-
* Create a Kokoro TTS instance
|
|
2588
|
+
* Create a Kokoro TTS instance via the unified worker.
|
|
2589
|
+
*
|
|
2590
|
+
* If no `unifiedWorker` is provided, a dedicated worker is created on load().
|
|
3312
2591
|
*
|
|
3313
2592
|
* @param config - Factory configuration
|
|
3314
|
-
* @returns A TTSBackend instance
|
|
2593
|
+
* @returns A TTSBackend instance
|
|
3315
2594
|
*/
|
|
3316
2595
|
declare function createKokoroTTS(config?: CreateKokoroTTSConfig): TTSBackend;
|
|
3317
2596
|
|
|
@@ -3360,7 +2639,7 @@ declare function listVoices(): string[];
|
|
|
3360
2639
|
* ElevenLabs TTS Backend — Cloud text-to-speech via ElevenLabs REST API.
|
|
3361
2640
|
*
|
|
3362
2641
|
* Implements the TTSBackend interface so it can be used anywhere Kokoro TTS is used
|
|
3363
|
-
* (TTSPlayback, TTSSpeaker,
|
|
2642
|
+
* (TTSPlayback, TTSSpeaker, VoiceOrchestrator, PlaybackPipeline, etc.)
|
|
3364
2643
|
*
|
|
3365
2644
|
* Zero external dependencies — uses fetch() directly.
|
|
3366
2645
|
*
|
|
@@ -3438,141 +2717,6 @@ declare class ElevenLabsTTSBackend implements TTSBackend {
|
|
|
3438
2717
|
private getHttpErrorMessage;
|
|
3439
2718
|
}
|
|
3440
2719
|
|
|
3441
|
-
/**
|
|
3442
|
-
* AWS Polly TTS Backend — Cloud text-to-speech via consumer-provided AWS SDK call.
|
|
3443
|
-
*
|
|
3444
|
-
* Implements the TTSBackend interface. Keeps @omote/core free of AWS SDK dependencies
|
|
3445
|
-
* by delegating the actual Polly API call to a consumer-provided function.
|
|
3446
|
-
*
|
|
3447
|
-
* @category Inference
|
|
3448
|
-
*
|
|
3449
|
-
* @example Basic usage with AWS SDK v3
|
|
3450
|
-
* ```typescript
|
|
3451
|
-
* import { PollyTTSBackend } from '@omote/core';
|
|
3452
|
-
* import { PollyClient, SynthesizeSpeechCommand } from '@aws-sdk/client-polly';
|
|
3453
|
-
*
|
|
3454
|
-
* const polly = new PollyClient({ region: 'us-east-1' });
|
|
3455
|
-
*
|
|
3456
|
-
* const tts = new PollyTTSBackend({
|
|
3457
|
-
* synthesizeFn: async (text, voice, sampleRate) => {
|
|
3458
|
-
* const cmd = new SynthesizeSpeechCommand({
|
|
3459
|
-
* Text: text,
|
|
3460
|
-
* VoiceId: voice,
|
|
3461
|
-
* Engine: 'neural',
|
|
3462
|
-
* OutputFormat: 'pcm',
|
|
3463
|
-
* SampleRate: String(sampleRate),
|
|
3464
|
-
* });
|
|
3465
|
-
* const result = await polly.send(cmd);
|
|
3466
|
-
* const stream = result.AudioStream;
|
|
3467
|
-
* // Convert stream to ArrayBuffer (Node or browser)
|
|
3468
|
-
* const chunks: Uint8Array[] = [];
|
|
3469
|
-
* for await (const chunk of stream as AsyncIterable<Uint8Array>) {
|
|
3470
|
-
* chunks.push(chunk);
|
|
3471
|
-
* }
|
|
3472
|
-
* const totalLength = chunks.reduce((sum, c) => sum + c.length, 0);
|
|
3473
|
-
* const merged = new Uint8Array(totalLength);
|
|
3474
|
-
* let offset = 0;
|
|
3475
|
-
* for (const chunk of chunks) {
|
|
3476
|
-
* merged.set(chunk, offset);
|
|
3477
|
-
* offset += chunk.length;
|
|
3478
|
-
* }
|
|
3479
|
-
* return {
|
|
3480
|
-
* audio: merged.buffer,
|
|
3481
|
-
* contentType: result.ContentType ?? 'audio/pcm',
|
|
3482
|
-
* };
|
|
3483
|
-
* },
|
|
3484
|
-
* });
|
|
3485
|
-
*
|
|
3486
|
-
* await tts.load();
|
|
3487
|
-
* for await (const chunk of tts.stream("Hello world!")) {
|
|
3488
|
-
* playbackPipeline.feedBuffer(chunk.audio);
|
|
3489
|
-
* }
|
|
3490
|
-
* ```
|
|
3491
|
-
*/
|
|
3492
|
-
|
|
3493
|
-
/**
|
|
3494
|
-
* Result from the consumer-provided synthesize function.
|
|
3495
|
-
*/
|
|
3496
|
-
interface PollySynthesizeResult {
|
|
3497
|
-
/** Raw PCM audio bytes (Int16 LE) */
|
|
3498
|
-
audio: ArrayBuffer;
|
|
3499
|
-
/** Content type from Polly response (e.g., 'audio/pcm') */
|
|
3500
|
-
contentType: string;
|
|
3501
|
-
}
|
|
3502
|
-
/**
|
|
3503
|
-
* Configuration for PollyTTSBackend.
|
|
3504
|
-
*
|
|
3505
|
-
* The `synthesizeFn` callback lets consumers use their own AWS SDK setup
|
|
3506
|
-
* (credentials, region, SDK version) without @omote/core depending on `@aws-sdk/client-polly`.
|
|
3507
|
-
*/
|
|
3508
|
-
interface PollyConfig {
|
|
3509
|
-
/**
|
|
3510
|
-
* Consumer-provided function that calls AWS Polly.
|
|
3511
|
-
* Must return PCM audio (Int16 LE) at the requested sample rate.
|
|
3512
|
-
*
|
|
3513
|
-
* @param text - Text to synthesize
|
|
3514
|
-
* @param voice - Polly voice ID (e.g., 'Joanna')
|
|
3515
|
-
* @param sampleRate - Requested output sample rate (e.g., 16000)
|
|
3516
|
-
* @returns PCM audio buffer and content type
|
|
3517
|
-
*/
|
|
3518
|
-
synthesizeFn: (text: string, voice: string, sampleRate: number) => Promise<PollySynthesizeResult>;
|
|
3519
|
-
/** Polly voice ID (default: 'Joanna') */
|
|
3520
|
-
voice?: string;
|
|
3521
|
-
/** Output sample rate in Hz (default: 16000) */
|
|
3522
|
-
sampleRate?: number;
|
|
3523
|
-
/** Polly engine type (default: 'neural') */
|
|
3524
|
-
engine?: 'neural' | 'standard' | 'generative' | 'long-form';
|
|
3525
|
-
}
|
|
3526
|
-
declare class PollyTTSBackend implements TTSBackend {
|
|
3527
|
-
private readonly synthesizeFn;
|
|
3528
|
-
private readonly voice;
|
|
3529
|
-
private readonly _sampleRate;
|
|
3530
|
-
private readonly engine;
|
|
3531
|
-
private _isLoaded;
|
|
3532
|
-
constructor(config: PollyConfig);
|
|
3533
|
-
get sampleRate(): number;
|
|
3534
|
-
get isLoaded(): boolean;
|
|
3535
|
-
/**
|
|
3536
|
-
* No-op for cloud TTS (no model to load).
|
|
3537
|
-
* Marks backend as ready.
|
|
3538
|
-
*/
|
|
3539
|
-
load(): Promise<void>;
|
|
3540
|
-
/**
|
|
3541
|
-
* Synthesize audio via consumer's Polly function.
|
|
3542
|
-
*
|
|
3543
|
-
* Polly's SynthesizeSpeech is request/response (not streaming for PCM),
|
|
3544
|
-
* so this yields a single chunk per call. For long text, consider splitting
|
|
3545
|
-
* into sentences on the consumer side.
|
|
3546
|
-
*/
|
|
3547
|
-
stream(text: string, options?: TTSStreamOptions): AsyncGenerator<TTSChunk>;
|
|
3548
|
-
dispose(): Promise<void>;
|
|
3549
|
-
}
|
|
3550
|
-
|
|
3551
|
-
/**
|
|
3552
|
-
* ORT CDN configuration
|
|
3553
|
-
*
|
|
3554
|
-
* Allows consumers to override the CDN base URL used for loading
|
|
3555
|
-
* ONNX Runtime WASM/WebGPU binaries. By default, ORT loads from
|
|
3556
|
-
* its bundled CDN path. Use {@link configureOrtCdn} to point at
|
|
3557
|
-
* a self-hosted or enterprise CDN.
|
|
3558
|
-
*
|
|
3559
|
-
* @category Inference
|
|
3560
|
-
*/
|
|
3561
|
-
/**
|
|
3562
|
-
* Override the CDN base URL for ONNX Runtime WASM/WebGPU binaries.
|
|
3563
|
-
*
|
|
3564
|
-
* Must be an HTTPS URL or a relative path (starts with `/` or `./`).
|
|
3565
|
-
* Call this once at app startup, before loading any models.
|
|
3566
|
-
*
|
|
3567
|
-
* @param cdnPath - HTTPS URL or relative path to ORT binaries directory
|
|
3568
|
-
* @throws If cdnPath is not HTTPS or a relative path
|
|
3569
|
-
*/
|
|
3570
|
-
declare function configureOrtCdn(cdnPath: string): void;
|
|
3571
|
-
/**
|
|
3572
|
-
* Get the current ORT CDN base URL override, or null if using defaults.
|
|
3573
|
-
*/
|
|
3574
|
-
declare function getOrtCdnBase(): string | null;
|
|
3575
|
-
|
|
3576
2720
|
/**
|
|
3577
2721
|
* Emotion - Helper for creating emotion vectors for avatar animation
|
|
3578
2722
|
*
|
|
@@ -3987,13 +3131,90 @@ interface FetchWithCacheOptions {
|
|
|
3987
3131
|
*/
|
|
3988
3132
|
declare function fetchWithCache(url: string, optionsOrProgress?: FetchWithCacheOptions | ((loaded: number, total: number) => void)): Promise<ArrayBuffer>;
|
|
3989
3133
|
/**
|
|
3990
|
-
* Preload models into cache without creating sessions
|
|
3134
|
+
* Preload models into cache without creating sessions
|
|
3135
|
+
*/
|
|
3136
|
+
declare function preloadModels(urls: string[], onProgress?: (current: number, total: number, url: string) => void): Promise<void>;
|
|
3137
|
+
/**
|
|
3138
|
+
* Format bytes as human readable string
|
|
3139
|
+
*/
|
|
3140
|
+
declare function formatBytes(bytes: number): string;
|
|
3141
|
+
|
|
3142
|
+
/**
|
|
3143
|
+
* Console Exporter
|
|
3144
|
+
*
|
|
3145
|
+
* Exports telemetry data to the browser console for development/debugging.
|
|
3146
|
+
*
|
|
3147
|
+
* @category Telemetry
|
|
3148
|
+
*/
|
|
3149
|
+
|
|
3150
|
+
/**
|
|
3151
|
+
* Span data structure for export
|
|
3152
|
+
*/
|
|
3153
|
+
interface SpanData {
|
|
3154
|
+
name: string;
|
|
3155
|
+
traceId: string;
|
|
3156
|
+
spanId: string;
|
|
3157
|
+
parentSpanId?: string;
|
|
3158
|
+
startTime: number;
|
|
3159
|
+
endTime: number;
|
|
3160
|
+
durationMs: number;
|
|
3161
|
+
/** Epoch timestamp in ms for OTLP export (start) */
|
|
3162
|
+
epochMs: number;
|
|
3163
|
+
/** Epoch timestamp in ms for OTLP export (end) */
|
|
3164
|
+
endEpochMs: number;
|
|
3165
|
+
status: 'ok' | 'error';
|
|
3166
|
+
attributes: SpanAttributes;
|
|
3167
|
+
error?: Error;
|
|
3168
|
+
}
|
|
3169
|
+
/**
|
|
3170
|
+
* Metric data structure for export
|
|
3171
|
+
*/
|
|
3172
|
+
interface MetricData {
|
|
3173
|
+
name: string;
|
|
3174
|
+
type: 'counter' | 'histogram';
|
|
3175
|
+
value: number;
|
|
3176
|
+
attributes: Record<string, string | number | boolean>;
|
|
3177
|
+
timestamp: number;
|
|
3178
|
+
/** Histogram bucket data for OTLP export */
|
|
3179
|
+
histogramData?: {
|
|
3180
|
+
count: number;
|
|
3181
|
+
sum: number;
|
|
3182
|
+
min: number;
|
|
3183
|
+
max: number;
|
|
3184
|
+
bucketBoundaries: number[];
|
|
3185
|
+
bucketCounts: number[];
|
|
3186
|
+
};
|
|
3187
|
+
}
|
|
3188
|
+
/**
|
|
3189
|
+
* Exporter interface that all exporters must implement
|
|
3991
3190
|
*/
|
|
3992
|
-
|
|
3191
|
+
interface TelemetryExporterInterface {
|
|
3192
|
+
/** Export a completed span */
|
|
3193
|
+
exportSpan(span: SpanData): void;
|
|
3194
|
+
/** Export a metric */
|
|
3195
|
+
exportMetric(metric: MetricData): void;
|
|
3196
|
+
/** Flush any buffered data */
|
|
3197
|
+
flush(): Promise<void>;
|
|
3198
|
+
/** Shutdown the exporter */
|
|
3199
|
+
shutdown(): Promise<void>;
|
|
3200
|
+
}
|
|
3993
3201
|
/**
|
|
3994
|
-
*
|
|
3202
|
+
* Console exporter for development/debugging
|
|
3203
|
+
*
|
|
3204
|
+
* Outputs spans and metrics to the browser console with formatting.
|
|
3995
3205
|
*/
|
|
3996
|
-
declare
|
|
3206
|
+
declare class ConsoleExporter implements TelemetryExporterInterface {
|
|
3207
|
+
private enabled;
|
|
3208
|
+
private prefix;
|
|
3209
|
+
constructor(options?: {
|
|
3210
|
+
enabled?: boolean;
|
|
3211
|
+
prefix?: string;
|
|
3212
|
+
});
|
|
3213
|
+
exportSpan(span: SpanData): void;
|
|
3214
|
+
exportMetric(metric: MetricData): void;
|
|
3215
|
+
flush(): Promise<void>;
|
|
3216
|
+
shutdown(): Promise<void>;
|
|
3217
|
+
}
|
|
3997
3218
|
|
|
3998
3219
|
/**
|
|
3999
3220
|
* Telemetry Types
|
|
@@ -4046,6 +3267,8 @@ interface TelemetryConfig {
|
|
|
4046
3267
|
metricsEnabled?: boolean;
|
|
4047
3268
|
/** Metrics export interval in ms. Default: 60000 */
|
|
4048
3269
|
metricsIntervalMs?: number;
|
|
3270
|
+
/** Custom exporter instance (overrides `exporter` when provided) */
|
|
3271
|
+
customExporter?: TelemetryExporterInterface;
|
|
4049
3272
|
}
|
|
4050
3273
|
/**
|
|
4051
3274
|
* Span attributes for model operations
|
|
@@ -4118,7 +3341,7 @@ declare const MetricNames: {
|
|
|
4118
3341
|
readonly CACHE_QUOTA_WARNING: "omote.cache.quota_warning";
|
|
4119
3342
|
/** Counter: Cache eviction (LRU) */
|
|
4120
3343
|
readonly CACHE_EVICTION: "omote.cache.eviction";
|
|
4121
|
-
/** Histogram:
|
|
3344
|
+
/** Histogram: Voice turn latency (speech end → transcript ready, excludes playback) */
|
|
4122
3345
|
readonly VOICE_TURN_LATENCY: "omote.voice.turn.latency";
|
|
4123
3346
|
/** Histogram: ASR transcription latency in ms */
|
|
4124
3347
|
readonly VOICE_TRANSCRIPTION_LATENCY: "omote.voice.transcription.latency";
|
|
@@ -4146,20 +3369,9 @@ declare const MetricNames: {
|
|
|
4146
3369
|
readonly COMPOSITOR_COMPOSE_LATENCY: "omote.compositor.compose.latency_us";
|
|
4147
3370
|
/** Counter: Frames exceeding budget threshold */
|
|
4148
3371
|
readonly AVATAR_FRAME_DROPS: "omote.avatar.frame.drops";
|
|
3372
|
+
/** Counter: Audio scheduling gaps (playback fell behind) */
|
|
3373
|
+
readonly AUDIO_SCHEDULE_GAP: "omote.audio.schedule_gap";
|
|
4149
3374
|
};
|
|
4150
|
-
/**
|
|
4151
|
-
* Centralized error type taxonomy for structured error reporting.
|
|
4152
|
-
*/
|
|
4153
|
-
declare const ErrorTypes: {
|
|
4154
|
-
readonly INFERENCE: "inference_error";
|
|
4155
|
-
readonly NETWORK: "network_error";
|
|
4156
|
-
readonly TIMEOUT: "timeout";
|
|
4157
|
-
readonly USER: "user_error";
|
|
4158
|
-
readonly RUNTIME: "runtime_error";
|
|
4159
|
-
readonly MEDIA: "media_error";
|
|
4160
|
-
readonly MODEL: "model_error";
|
|
4161
|
-
};
|
|
4162
|
-
type ErrorType = typeof ErrorTypes[keyof typeof ErrorTypes];
|
|
4163
3375
|
/**
|
|
4164
3376
|
* Histogram buckets for inference latency (ms)
|
|
4165
3377
|
*/
|
|
@@ -4235,6 +3447,7 @@ declare function getTelemetry(): OmoteTelemetry | null;
|
|
|
4235
3447
|
declare class OmoteTelemetry {
|
|
4236
3448
|
private config;
|
|
4237
3449
|
private exporter;
|
|
3450
|
+
private exporterReady;
|
|
4238
3451
|
private activeTraceId;
|
|
4239
3452
|
private metricsIntervalId;
|
|
4240
3453
|
private spanStack;
|
|
@@ -4310,7 +3523,7 @@ declare class OmoteTelemetry {
|
|
|
4310
3523
|
* });
|
|
4311
3524
|
* ```
|
|
4312
3525
|
*/
|
|
4313
|
-
recordHistogram(name: string, value: number, attributes?: Record<string, string | number | boolean
|
|
3526
|
+
recordHistogram(name: string, value: number, attributes?: Record<string, string | number | boolean>, bucketBoundaries?: number[]): void;
|
|
4314
3527
|
/**
|
|
4315
3528
|
* Generate unique key for metric with attributes
|
|
4316
3529
|
*/
|
|
@@ -4345,70 +3558,6 @@ declare class OmoteTelemetry {
|
|
|
4345
3558
|
} | null;
|
|
4346
3559
|
}
|
|
4347
3560
|
|
|
4348
|
-
/**
|
|
4349
|
-
* Console Exporter
|
|
4350
|
-
*
|
|
4351
|
-
* Exports telemetry data to the browser console for development/debugging.
|
|
4352
|
-
*
|
|
4353
|
-
* @category Telemetry
|
|
4354
|
-
*/
|
|
4355
|
-
|
|
4356
|
-
/**
|
|
4357
|
-
* Span data structure for export
|
|
4358
|
-
*/
|
|
4359
|
-
interface SpanData {
|
|
4360
|
-
name: string;
|
|
4361
|
-
traceId: string;
|
|
4362
|
-
spanId: string;
|
|
4363
|
-
parentSpanId?: string;
|
|
4364
|
-
startTime: number;
|
|
4365
|
-
endTime: number;
|
|
4366
|
-
durationMs: number;
|
|
4367
|
-
status: 'ok' | 'error';
|
|
4368
|
-
attributes: SpanAttributes;
|
|
4369
|
-
error?: Error;
|
|
4370
|
-
}
|
|
4371
|
-
/**
|
|
4372
|
-
* Metric data structure for export
|
|
4373
|
-
*/
|
|
4374
|
-
interface MetricData {
|
|
4375
|
-
name: string;
|
|
4376
|
-
type: 'counter' | 'histogram';
|
|
4377
|
-
value: number;
|
|
4378
|
-
attributes: Record<string, string | number | boolean>;
|
|
4379
|
-
timestamp: number;
|
|
4380
|
-
}
|
|
4381
|
-
/**
|
|
4382
|
-
* Exporter interface that all exporters must implement
|
|
4383
|
-
*/
|
|
4384
|
-
interface TelemetryExporterInterface {
|
|
4385
|
-
/** Export a completed span */
|
|
4386
|
-
exportSpan(span: SpanData): void;
|
|
4387
|
-
/** Export a metric */
|
|
4388
|
-
exportMetric(metric: MetricData): void;
|
|
4389
|
-
/** Flush any buffered data */
|
|
4390
|
-
flush(): Promise<void>;
|
|
4391
|
-
/** Shutdown the exporter */
|
|
4392
|
-
shutdown(): Promise<void>;
|
|
4393
|
-
}
|
|
4394
|
-
/**
|
|
4395
|
-
* Console exporter for development/debugging
|
|
4396
|
-
*
|
|
4397
|
-
* Outputs spans and metrics to the browser console with formatting.
|
|
4398
|
-
*/
|
|
4399
|
-
declare class ConsoleExporter implements TelemetryExporterInterface {
|
|
4400
|
-
private enabled;
|
|
4401
|
-
private prefix;
|
|
4402
|
-
constructor(options?: {
|
|
4403
|
-
enabled?: boolean;
|
|
4404
|
-
prefix?: string;
|
|
4405
|
-
});
|
|
4406
|
-
exportSpan(span: SpanData): void;
|
|
4407
|
-
exportMetric(metric: MetricData): void;
|
|
4408
|
-
flush(): Promise<void>;
|
|
4409
|
-
shutdown(): Promise<void>;
|
|
4410
|
-
}
|
|
4411
|
-
|
|
4412
3561
|
/**
|
|
4413
3562
|
* OTLP Exporter
|
|
4414
3563
|
*
|
|
@@ -4966,7 +4115,7 @@ declare class ProceduralLifeLayer {
|
|
|
4966
4115
|
*/
|
|
4967
4116
|
update(delta: number, input?: LifeLayerInput): LifeLayerOutput;
|
|
4968
4117
|
/**
|
|
4969
|
-
* Write life layer output directly to a Float32Array[52] in
|
|
4118
|
+
* Write life layer output directly to a Float32Array[52] in ARKIT_BLENDSHAPES order.
|
|
4970
4119
|
*
|
|
4971
4120
|
* Includes micro-jitter (0.4% amplitude simplex noise on all channels) to
|
|
4972
4121
|
* break uncanny stillness on undriven channels.
|
|
@@ -5301,7 +4450,7 @@ declare class FaceCompositor {
|
|
|
5301
4450
|
/**
|
|
5302
4451
|
* Compose a single output frame from the 5-stage signal chain.
|
|
5303
4452
|
*
|
|
5304
|
-
* @param base - A2E raw output (Float32Array[52],
|
|
4453
|
+
* @param base - A2E raw output (Float32Array[52], ARKIT_BLENDSHAPES order)
|
|
5305
4454
|
* @param input - Per-frame input (deltaTime, emotion, life layer params)
|
|
5306
4455
|
* @param target - Optional pre-allocated output buffer (avoids per-frame allocation).
|
|
5307
4456
|
* When omitted, an internal buffer is used (valid until next compose() call).
|
|
@@ -5583,216 +4732,6 @@ declare class MicLipSync extends EventEmitter<MicLipSyncEvents> {
|
|
|
5583
4732
|
private setState;
|
|
5584
4733
|
}
|
|
5585
4734
|
|
|
5586
|
-
/**
|
|
5587
|
-
* VoicePipeline - Full conversational agent loop
|
|
5588
|
-
*
|
|
5589
|
-
* Composes: MicrophoneCapture → SileroVAD → SenseVoice ASR → PlaybackPipeline (A2E)
|
|
5590
|
-
*
|
|
5591
|
-
* State machine: idle → loading → ready → listening → thinking → speaking → listening → ...
|
|
5592
|
-
*
|
|
5593
|
-
* The consumer provides an `onResponse` callback that receives transcribed text
|
|
5594
|
-
* and streams audio back for playback + lip sync. VoicePipeline is backend-agnostic.
|
|
5595
|
-
*
|
|
5596
|
-
* @category Orchestration
|
|
5597
|
-
*/
|
|
5598
|
-
|
|
5599
|
-
/** Shared config options for all VoicePipeline modes */
|
|
5600
|
-
interface VoicePipelineBaseConfig {
|
|
5601
|
-
/** Pre-built backends — skip internal factory creation. Takes precedence over `models`. */
|
|
5602
|
-
backends?: {
|
|
5603
|
-
asr: SenseVoiceBackend;
|
|
5604
|
-
lam: A2EBackend;
|
|
5605
|
-
vad: SileroVADBackend;
|
|
5606
|
-
tts?: TTSBackend;
|
|
5607
|
-
};
|
|
5608
|
-
/** External unified worker (reuse across pipelines). Takes precedence over internal creation. */
|
|
5609
|
-
unifiedWorker?: UnifiedInferenceWorker;
|
|
5610
|
-
/** URLs and options for model loading. Required if `backends` not provided. */
|
|
5611
|
-
models?: {
|
|
5612
|
-
senseVoice: {
|
|
5613
|
-
modelUrl: string;
|
|
5614
|
-
tokensUrl?: string;
|
|
5615
|
-
language?: string;
|
|
5616
|
-
};
|
|
5617
|
-
lam: {
|
|
5618
|
-
modelUrl: string;
|
|
5619
|
-
externalDataUrl?: string | false;
|
|
5620
|
-
backend?: 'auto' | 'webgpu' | 'wasm';
|
|
5621
|
-
};
|
|
5622
|
-
vad: {
|
|
5623
|
-
modelUrl: string;
|
|
5624
|
-
threshold?: number;
|
|
5625
|
-
preSpeechBufferChunks?: number;
|
|
5626
|
-
};
|
|
5627
|
-
};
|
|
5628
|
-
/** Per-character expression weight scaling */
|
|
5629
|
-
profile?: ExpressionProfile;
|
|
5630
|
-
/** Identity/style index for A2E model (default: 0) */
|
|
5631
|
-
identityIndex?: number;
|
|
5632
|
-
/** Base silence timeout in ms (default: 500) */
|
|
5633
|
-
silenceTimeoutMs?: number;
|
|
5634
|
-
/** Extended silence timeout for long utterances (default: 700) */
|
|
5635
|
-
silenceTimeoutExtendedMs?: number;
|
|
5636
|
-
/** Enable adaptive timeout based on speech duration (default: true) */
|
|
5637
|
-
adaptiveTimeout?: boolean;
|
|
5638
|
-
/** Minimum audio duration in seconds (default: 0.3) */
|
|
5639
|
-
minAudioDurationSec?: number;
|
|
5640
|
-
/** Minimum audio energy (default: 0.02) */
|
|
5641
|
-
minAudioEnergy?: number;
|
|
5642
|
-
/** Enable audio normalization for quiet audio (default: true) */
|
|
5643
|
-
normalizeAudio?: boolean;
|
|
5644
|
-
/** Progressive transcription interval — desktop (default: 500ms) */
|
|
5645
|
-
progressiveIntervalMs?: number;
|
|
5646
|
-
/** Progressive transcription interval — iOS (default: 800ms) */
|
|
5647
|
-
progressiveIntervalIosMs?: number;
|
|
5648
|
-
/** Coverage threshold to use progressive result (default: 0.8) */
|
|
5649
|
-
progressiveCoverageThreshold?: number;
|
|
5650
|
-
/** Minimum samples before progressive transcription starts (default: 8000) */
|
|
5651
|
-
progressiveMinSamples?: number;
|
|
5652
|
-
/** Timeout for individual SenseVoice.transcribe() calls (default: 10000ms) */
|
|
5653
|
-
transcriptionTimeoutMs?: number;
|
|
5654
|
-
/** Enable barge-in detection (default: true) */
|
|
5655
|
-
interruptionEnabled?: boolean;
|
|
5656
|
-
/** Minimum speech duration for interruption (default: 200ms) */
|
|
5657
|
-
interruptionMinSpeechMs?: number;
|
|
5658
|
-
/** Audio playback delay (default: auto-detected) */
|
|
5659
|
-
audioDelayMs?: number;
|
|
5660
|
-
/** Coalescer target duration (default: 200ms) */
|
|
5661
|
-
chunkTargetMs?: number;
|
|
5662
|
-
/** Enable neutral transition on playback complete (default: true) */
|
|
5663
|
-
neutralTransitionEnabled?: boolean;
|
|
5664
|
-
/** Duration of neutral fade-out (default: 250ms) */
|
|
5665
|
-
neutralTransitionMs?: number;
|
|
5666
|
-
}
|
|
5667
|
-
/** Cloud TTS mode: consumer handles response + audio streaming */
|
|
5668
|
-
interface VoicePipelineCloudConfig extends VoicePipelineBaseConfig {
|
|
5669
|
-
mode: 'cloud';
|
|
5670
|
-
/** Consumer's response handler (streams audio back) */
|
|
5671
|
-
onResponse: ResponseHandler;
|
|
5672
|
-
}
|
|
5673
|
-
/** Local TTS mode: SDK handles synthesis internally via TTSBackend */
|
|
5674
|
-
interface VoicePipelineLocalConfig extends VoicePipelineBaseConfig {
|
|
5675
|
-
mode: 'local';
|
|
5676
|
-
/**
|
|
5677
|
-
* TTS backend (e.g., KokoroTTSInference). Provide either `tts` or `ttsConfig`.
|
|
5678
|
-
*
|
|
5679
|
-
* When `tts` is provided, VoicePipeline uses it as-is. On iOS, this means
|
|
5680
|
-
* inference runs on the main thread (may cause UI freezes).
|
|
5681
|
-
*
|
|
5682
|
-
* Prefer `ttsConfig` for automatic unified worker integration on iOS.
|
|
5683
|
-
*/
|
|
5684
|
-
tts?: TTSBackend;
|
|
5685
|
-
/**
|
|
5686
|
-
* Kokoro TTS configuration. When provided, VoicePipeline creates the TTS
|
|
5687
|
-
* internally and passes the unified worker on iOS for off-main-thread inference.
|
|
5688
|
-
*
|
|
5689
|
-
* Takes precedence over `tts` if both are provided.
|
|
5690
|
-
*/
|
|
5691
|
-
ttsConfig?: {
|
|
5692
|
-
defaultVoice?: string;
|
|
5693
|
-
speed?: number;
|
|
5694
|
-
modelUrl?: string;
|
|
5695
|
-
voiceBaseUrl?: string;
|
|
5696
|
-
};
|
|
5697
|
-
/** Optional text transform (e.g., LLM call). Receives transcript, returns response text. */
|
|
5698
|
-
onTranscript?: (text: string) => string | Promise<string>;
|
|
5699
|
-
}
|
|
5700
|
-
type VoicePipelineConfig = VoicePipelineCloudConfig | VoicePipelineLocalConfig;
|
|
5701
|
-
interface VoicePipelineEvents {
|
|
5702
|
-
'state': VoicePipelineState;
|
|
5703
|
-
'loading:progress': LoadingProgress;
|
|
5704
|
-
'transcript': TranscriptResult;
|
|
5705
|
-
'frame': FullFaceFrame;
|
|
5706
|
-
'frame:raw': Float32Array;
|
|
5707
|
-
'speech:start': void;
|
|
5708
|
-
'speech:end': {
|
|
5709
|
-
durationMs: number;
|
|
5710
|
-
};
|
|
5711
|
-
'playback:start': {
|
|
5712
|
-
time: number;
|
|
5713
|
-
};
|
|
5714
|
-
'playback:complete': void;
|
|
5715
|
-
'interruption': void;
|
|
5716
|
-
'audio:level': {
|
|
5717
|
-
rms: number;
|
|
5718
|
-
peak: number;
|
|
5719
|
-
};
|
|
5720
|
-
'error': Error;
|
|
5721
|
-
[key: string]: unknown;
|
|
5722
|
-
}
|
|
5723
|
-
declare class VoicePipeline extends EventEmitter<VoicePipelineEvents> {
|
|
5724
|
-
private readonly config;
|
|
5725
|
-
private readonly isLocalMode;
|
|
5726
|
-
private _state;
|
|
5727
|
-
private stopped;
|
|
5728
|
-
private epoch;
|
|
5729
|
-
private _sessionId;
|
|
5730
|
-
private asr;
|
|
5731
|
-
private lam;
|
|
5732
|
-
private vad;
|
|
5733
|
-
private unifiedWorker;
|
|
5734
|
-
private playback;
|
|
5735
|
-
private interruption;
|
|
5736
|
-
private omoteEvents;
|
|
5737
|
-
private mic;
|
|
5738
|
-
private static readonly MAX_AUDIO_BUFFER_SAMPLES;
|
|
5739
|
-
private audioBuffer;
|
|
5740
|
-
private audioBufferSamples;
|
|
5741
|
-
private speechStartTime;
|
|
5742
|
-
private silenceTimer;
|
|
5743
|
-
private isSpeaking;
|
|
5744
|
-
private progressiveTimer;
|
|
5745
|
-
private progressivePromise;
|
|
5746
|
-
private lastProgressiveResult;
|
|
5747
|
-
private lastProgressiveSamples;
|
|
5748
|
-
private asrErrorCount;
|
|
5749
|
-
private progressiveErrorCount;
|
|
5750
|
-
private responseAbortController;
|
|
5751
|
-
private _unsubChunk;
|
|
5752
|
-
private _unsubLevel;
|
|
5753
|
-
private _currentFrame;
|
|
5754
|
-
/** Current pipeline state */
|
|
5755
|
-
get state(): VoicePipelineState;
|
|
5756
|
-
/** Latest blendshape frame */
|
|
5757
|
-
get currentFrame(): Float32Array | null;
|
|
5758
|
-
/** Whether user is currently speaking */
|
|
5759
|
-
get isSpeechActive(): boolean;
|
|
5760
|
-
/** Session ID (generated on start(), null before) */
|
|
5761
|
-
get sessionId(): string | null;
|
|
5762
|
-
constructor(config: VoicePipelineConfig);
|
|
5763
|
-
loadModels(): Promise<void>;
|
|
5764
|
-
/**
|
|
5765
|
-
* Load from pre-built backends (dependency injection path).
|
|
5766
|
-
* Loads any backends that aren't loaded yet.
|
|
5767
|
-
*/
|
|
5768
|
-
private loadFromBackends;
|
|
5769
|
-
/**
|
|
5770
|
-
* Load from factories (original path). Loads SenseVoice, LAM, and VAD in parallel.
|
|
5771
|
-
*/
|
|
5772
|
-
private loadFromFactories;
|
|
5773
|
-
start(): Promise<void>;
|
|
5774
|
-
stop(): void;
|
|
5775
|
-
setProfile(profile: ExpressionProfile): void;
|
|
5776
|
-
dispose(): Promise<void>;
|
|
5777
|
-
private processAudioChunk;
|
|
5778
|
-
private getSilenceTimeout;
|
|
5779
|
-
private onSilenceDetected;
|
|
5780
|
-
private processEndOfSpeech;
|
|
5781
|
-
private callResponseHandler;
|
|
5782
|
-
/** Cloud mode: delegate to consumer's onResponse handler */
|
|
5783
|
-
private handleCloudResponse;
|
|
5784
|
-
/** Local mode: synthesize text with TTSBackend, stream to PlaybackPipeline */
|
|
5785
|
-
private handleLocalResponse;
|
|
5786
|
-
private handleInterruption;
|
|
5787
|
-
private startProgressiveTranscription;
|
|
5788
|
-
private stopProgressiveTranscription;
|
|
5789
|
-
private transcribeWithTimeout;
|
|
5790
|
-
private normalizeAudio;
|
|
5791
|
-
private setState;
|
|
5792
|
-
private emitProgress;
|
|
5793
|
-
private clearSilenceTimer;
|
|
5794
|
-
}
|
|
5795
|
-
|
|
5796
4735
|
/**
|
|
5797
4736
|
* VoiceOrchestrator — Shared voice wiring for OmoteAvatar adapters.
|
|
5798
4737
|
*
|
|
@@ -5810,6 +4749,11 @@ interface VoiceOrchestratorBaseConfig {
|
|
|
5810
4749
|
listener?: SpeechListenerConfig;
|
|
5811
4750
|
interruptionEnabled?: boolean;
|
|
5812
4751
|
profile?: ExpressionProfile;
|
|
4752
|
+
onStateChange?: (state: ConversationalState) => void;
|
|
4753
|
+
onLoadingProgress?: (progress: LoadingProgress) => void;
|
|
4754
|
+
onError?: (error: Error) => void;
|
|
4755
|
+
onTranscriptEvent?: (result: TranscriptResult) => void;
|
|
4756
|
+
onInterruption?: () => void;
|
|
5813
4757
|
}
|
|
5814
4758
|
interface VoiceOrchestratorLocalConfig extends VoiceOrchestratorBaseConfig {
|
|
5815
4759
|
mode?: 'local';
|
|
@@ -5823,12 +4767,23 @@ interface VoiceOrchestratorCloudConfig extends VoiceOrchestratorBaseConfig {
|
|
|
5823
4767
|
lam?: {
|
|
5824
4768
|
modelUrl?: string;
|
|
5825
4769
|
externalDataUrl?: string | false;
|
|
4770
|
+
unifiedWorker?: UnifiedInferenceWorker;
|
|
5826
4771
|
};
|
|
4772
|
+
identityIndex?: number;
|
|
4773
|
+
neutralTransitionEnabled?: boolean;
|
|
5827
4774
|
}
|
|
5828
4775
|
type VoiceOrchestratorConfig = VoiceOrchestratorLocalConfig | VoiceOrchestratorCloudConfig;
|
|
5829
4776
|
interface VoiceOrchestratorEvents {
|
|
5830
4777
|
'state': ConversationalState;
|
|
5831
4778
|
'transcript': TranscriptResult;
|
|
4779
|
+
'interruption': void;
|
|
4780
|
+
'loading:progress': LoadingProgress;
|
|
4781
|
+
'error': Error;
|
|
4782
|
+
'audio:level': {
|
|
4783
|
+
rms: number;
|
|
4784
|
+
peak: number;
|
|
4785
|
+
};
|
|
4786
|
+
'playback:complete': void;
|
|
5832
4787
|
[key: string]: unknown;
|
|
5833
4788
|
}
|
|
5834
4789
|
declare class VoiceOrchestrator extends EventEmitter<VoiceOrchestratorEvents> {
|
|
@@ -5837,6 +4792,8 @@ declare class VoiceOrchestrator extends EventEmitter<VoiceOrchestratorEvents> {
|
|
|
5837
4792
|
private ttsSpeaker;
|
|
5838
4793
|
private playbackPipeline;
|
|
5839
4794
|
private ownedLam;
|
|
4795
|
+
private ownedWorker;
|
|
4796
|
+
private usesSharedWorker;
|
|
5840
4797
|
private transcriptUnsub;
|
|
5841
4798
|
private audioChunkUnsub;
|
|
5842
4799
|
private connectEpoch;
|
|
@@ -5860,10 +4817,14 @@ declare class VoiceOrchestrator extends EventEmitter<VoiceOrchestratorEvents> {
|
|
|
5860
4817
|
speak(text: string, options?: {
|
|
5861
4818
|
signal?: AbortSignal;
|
|
5862
4819
|
voice?: string;
|
|
4820
|
+
speed?: number;
|
|
4821
|
+
language?: string;
|
|
5863
4822
|
}): Promise<void>;
|
|
5864
4823
|
streamText(options?: {
|
|
5865
4824
|
signal?: AbortSignal;
|
|
5866
4825
|
voice?: string;
|
|
4826
|
+
speed?: number;
|
|
4827
|
+
language?: string;
|
|
5867
4828
|
}): Promise<{
|
|
5868
4829
|
push: (token: string) => void;
|
|
5869
4830
|
end: () => Promise<void>;
|
|
@@ -5875,4 +4836,4 @@ declare class VoiceOrchestrator extends EventEmitter<VoiceOrchestratorEvents> {
|
|
|
5875
4836
|
private setState;
|
|
5876
4837
|
}
|
|
5877
4838
|
|
|
5878
|
-
export { type A2EBackend,
|
|
4839
|
+
export { type A2EBackend, type A2EModelInfo, A2EProcessor, type A2EProcessorConfig, type A2EResult, A2EUnifiedAdapter, ALL_AUS, ARKIT_BLENDSHAPES, type AUActivation, AU_TO_ARKIT, type ActiveSpan, type AnimationClip, type AnimationController, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationSource, type AnimationSourceOptions, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, BLENDSHAPE_TO_GROUP, type BackendPreference, type BlendWeight, type BlendshapeGroup, BlendshapeSmoother, type BlendshapeSmootherConfig, type BoneFilterConfig, type CacheConfig, type CacheSpanAttributes, CharacterController, type CharacterControllerConfig, type CharacterProfile, type CharacterUpdateInput, type CharacterUpdateOutput, ConsoleExporter, type ConversationalState, type CreateA2EConfig, type CreateKokoroTTSConfig, type CreateSenseVoiceConfig, type CreateTTSPlayerConfig, DEFAULT_ANIMATION_CONFIG, DEFAULT_BONE_FILTER, DEFAULT_MODEL_URLS, EMOTION_NAMES, EMOTION_TO_AU, EMOTION_VECTOR_SIZE, EXPLICIT_EMOTION_COUNT, type ElevenLabsConfig, ElevenLabsTTSBackend, type EmotionAnimationMap, EmotionController, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, EmotionResolver, type EmotionWeights, EmphasisDetector, EventEmitter, type ExpressionProfile, FaceCompositor, type FaceCompositorConfig, type FaceCompositorInput, type FaceCompositorOutput, type FetchWithCacheOptions, type FrameSource, type FullFaceFrame, HF_CDN_URLS, INFERENCE_LATENCY_BUCKETS, type InferenceFactoryConfig, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, KOKORO_VOICES, type KokoroStreamChunk, type KokoroTTSConfig, type KokoroTTSModelInfo, type KokoroTTSResult, KokoroTTSUnifiedAdapter, type KokoroVoiceName, LAM_BLENDSHAPES, type LifeLayerConfig, type LifeLayerInput, type LifeLayerOutput, type LoadingProgress, MIXAMO_PREFIX, MODEL_LOAD_TIME_BUCKETS, type MetricData, MetricNames, MicLipSync, type MicLipSyncConfig, type MicLipSyncEvents, type MicLipSyncFrame, type MicLipSyncState, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, type ModelUrlKey, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, PRESERVE_POSITION_BONES, PlaybackPipeline, type PlaybackPipelineConfig, type PlaybackPipelineEvents, type PlaybackState, ProceduralLifeLayer, type Quat, type QuotaInfo, type ResolvedEmotion, type ResponseHandler, RingBuffer, type RuntimeBackend, type SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SenseVoiceBackend, type SenseVoiceConfig, type SenseVoiceLanguage, type SenseVoiceModelInfo, type SenseVoiceResult, SenseVoiceUnifiedAdapter, type SenseVoiceWorkerConfig, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADUnifiedAdapter, type SpanAttributes, type SpanData, type SpeechErrorCallback, SpeechListener, type SpeechListenerConfig, type SpeechListenerEvents, type SpeechListenerState, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, type SynthesizeOptions, type TTSBackend, type TTSChunk, TTSPlayback, type TTSPlaybackConfig, type TTSPlaybackEvents, TTSPlayer, TTSSpeaker, type TTSSpeakerConfig, type TTSStreamOptions, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TrackDescriptor, type TranscriptResult, type Transition, UnifiedInferenceWorker, type VADBackend, type VADModelInfo, type VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, type Vec3, VoiceOrchestrator, type VoiceOrchestratorCloudConfig, type VoiceOrchestratorConfig, type VoiceOrchestratorEvents, type VoiceOrchestratorLocalConfig, type VoicePipelineState, type WorkerHealthState, analyzeTextEmotion, applyProfile, blendEmotions, calculatePeak, calculateRMS, configureCacheLimit, configureModelUrls, configureTelemetry, createA2E, createEmotionVector, createKokoroTTS, createSenseVoice, createSileroVAD, createTTSPlayer, fetchWithCache, float32ToPcm16, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getModelCache, getOptimalWasmThreads, getRecommendedBackend, getTelemetry, hasWebGPUApi, int16ToFloat32, isAndroid, isIOS, isIOSSafari, isMobile, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpBlendshapes, lerpEmotion, listVoices as listKokoroVoices, parseEmotionTags, pcm16ToFloat32, preloadModels, resampleLinear, resetModelUrls, resolveBackend, resolveEmotion, shouldEnableWasmProxy, shouldKeepTrack, shouldUseNativeASR, shouldUseServerA2E, stripMixamoPrefix, ttsToPlaybackFormat, validateTTSInput };
|