@livekit/agents 1.0.41 → 1.0.43
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/inference/index.cjs +8 -0
- package/dist/inference/index.cjs.map +1 -1
- package/dist/inference/index.d.cts +2 -2
- package/dist/inference/index.d.ts +2 -2
- package/dist/inference/index.d.ts.map +1 -1
- package/dist/inference/index.js +8 -0
- package/dist/inference/index.js.map +1 -1
- package/dist/inference/stt.cjs +51 -10
- package/dist/inference/stt.cjs.map +1 -1
- package/dist/inference/stt.d.cts +33 -0
- package/dist/inference/stt.d.ts +33 -0
- package/dist/inference/stt.d.ts.map +1 -1
- package/dist/inference/stt.js +48 -9
- package/dist/inference/stt.js.map +1 -1
- package/dist/inference/stt.test.cjs +204 -0
- package/dist/inference/stt.test.cjs.map +1 -0
- package/dist/inference/stt.test.js +203 -0
- package/dist/inference/stt.test.js.map +1 -0
- package/dist/inference/tts.cjs +52 -10
- package/dist/inference/tts.cjs.map +1 -1
- package/dist/inference/tts.d.cts +22 -0
- package/dist/inference/tts.d.ts +22 -0
- package/dist/inference/tts.d.ts.map +1 -1
- package/dist/inference/tts.js +49 -9
- package/dist/inference/tts.js.map +1 -1
- package/dist/inference/tts.test.cjs +223 -0
- package/dist/inference/tts.test.cjs.map +1 -0
- package/dist/inference/tts.test.js +222 -0
- package/dist/inference/tts.test.js.map +1 -0
- package/dist/ipc/inference_proc_lazy_main.cjs +13 -1
- package/dist/ipc/inference_proc_lazy_main.cjs.map +1 -1
- package/dist/ipc/inference_proc_lazy_main.js +13 -1
- package/dist/ipc/inference_proc_lazy_main.js.map +1 -1
- package/dist/ipc/job_proc_lazy_main.cjs +2 -1
- package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
- package/dist/ipc/job_proc_lazy_main.js +2 -1
- package/dist/ipc/job_proc_lazy_main.js.map +1 -1
- package/dist/ipc/supervised_proc.cjs.map +1 -1
- package/dist/ipc/supervised_proc.d.cts +7 -0
- package/dist/ipc/supervised_proc.d.ts +7 -0
- package/dist/ipc/supervised_proc.d.ts.map +1 -1
- package/dist/ipc/supervised_proc.js.map +1 -1
- package/dist/stt/stt.cjs.map +1 -1
- package/dist/stt/stt.d.cts +7 -0
- package/dist/stt/stt.d.ts +7 -0
- package/dist/stt/stt.d.ts.map +1 -1
- package/dist/stt/stt.js.map +1 -1
- package/dist/transcription.cjs.map +1 -1
- package/dist/transcription.d.cts +6 -0
- package/dist/transcription.d.ts +6 -0
- package/dist/transcription.d.ts.map +1 -1
- package/dist/transcription.js.map +1 -1
- package/dist/vad.cjs +1 -1
- package/dist/vad.cjs.map +1 -1
- package/dist/vad.d.cts +3 -2
- package/dist/vad.d.ts +3 -2
- package/dist/vad.d.ts.map +1 -1
- package/dist/vad.js +1 -1
- package/dist/vad.js.map +1 -1
- package/dist/voice/agent_activity.cjs +1 -2
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.js +1 -2
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/audio_recognition.cjs +1 -1
- package/dist/voice/audio_recognition.cjs.map +1 -1
- package/dist/voice/audio_recognition.d.cts +14 -0
- package/dist/voice/audio_recognition.d.ts +14 -0
- package/dist/voice/audio_recognition.d.ts.map +1 -1
- package/dist/voice/audio_recognition.js +1 -1
- package/dist/voice/audio_recognition.js.map +1 -1
- package/package.json +1 -1
- package/src/inference/index.ts +8 -0
- package/src/inference/stt.test.ts +236 -0
- package/src/inference/stt.ts +95 -17
- package/src/inference/tts.test.ts +255 -0
- package/src/inference/tts.ts +81 -15
- package/src/ipc/inference_proc_lazy_main.ts +13 -1
- package/src/ipc/job_proc_lazy_main.ts +5 -1
- package/src/ipc/supervised_proc.ts +7 -0
- package/src/stt/stt.ts +7 -0
- package/src/transcription.ts +6 -0
- package/src/vad.ts +4 -3
- package/src/voice/agent_activity.ts +1 -1
- package/src/voice/audio_recognition.ts +16 -1
|
package/src/inference/stt.test.ts
ADDED
|
@@ -0,0 +1,236 @@
|
|
|
1
|
+
// SPDX-FileCopyrightText: 2025 LiveKit, Inc.
|
|
2
|
+
//
|
|
3
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
import { beforeAll, describe, expect, it } from 'vitest';
|
|
5
|
+
import { initializeLogger } from '../log.js';
|
|
6
|
+
import { type APIConnectOptions, DEFAULT_API_CONNECT_OPTIONS } from '../types.js';
|
|
7
|
+
import { STT, type STTFallbackModel, normalizeSTTFallback, parseSTTModelString } from './stt.js';
|
|
8
|
+
|
|
9
|
+
beforeAll(() => {
|
|
10
|
+
initializeLogger({ level: 'silent', pretty: false });
|
|
11
|
+
});
|
|
12
|
+
|
|
13
|
+
/**
 * Test helper: constructs an STT instance with placeholder model, credentials
 * and base URL already filled in.
 *
 * @param overrides - Constructor options merged on top of the placeholders;
 *   a key present here replaces the placeholder with the same name.
 * @returns The newly constructed STT instance.
 */
function makeStt(overrides: Record<string, unknown> = {}) {
  // The spread comes last so caller-supplied keys win over the placeholders.
  return new STT({
    model: 'deepgram' as const,
    apiKey: 'test-key',
    apiSecret: 'test-secret',
    baseURL: 'https://example.livekit.cloud',
    ...overrides,
  });
}
|
|
23
|
+
|
|
24
|
+
describe('parseSTTModelString', () => {
|
|
25
|
+
it('simple model without language', () => {
|
|
26
|
+
const [model, language] = parseSTTModelString('deepgram');
|
|
27
|
+
expect(model).toBe('deepgram');
|
|
28
|
+
expect(language).toBeUndefined();
|
|
29
|
+
});
|
|
30
|
+
|
|
31
|
+
it('model with language suffix', () => {
|
|
32
|
+
const [model, language] = parseSTTModelString('deepgram:en');
|
|
33
|
+
expect(model).toBe('deepgram');
|
|
34
|
+
expect(language).toBe('en');
|
|
35
|
+
});
|
|
36
|
+
|
|
37
|
+
it('provider/model format without language', () => {
|
|
38
|
+
const [model, language] = parseSTTModelString('deepgram/nova-3');
|
|
39
|
+
expect(model).toBe('deepgram/nova-3');
|
|
40
|
+
expect(language).toBeUndefined();
|
|
41
|
+
});
|
|
42
|
+
|
|
43
|
+
it('provider/model format with language', () => {
|
|
44
|
+
const [model, language] = parseSTTModelString('deepgram/nova-3:en');
|
|
45
|
+
expect(model).toBe('deepgram/nova-3');
|
|
46
|
+
expect(language).toBe('en');
|
|
47
|
+
});
|
|
48
|
+
|
|
49
|
+
it.each([
|
|
50
|
+
['cartesia/ink-whisper:de', 'cartesia/ink-whisper', 'de'],
|
|
51
|
+
['assemblyai:es', 'assemblyai', 'es'],
|
|
52
|
+
['deepgram/nova-2-medical:ja', 'deepgram/nova-2-medical', 'ja'],
|
|
53
|
+
['deepgram/nova-3:multi', 'deepgram/nova-3', 'multi'],
|
|
54
|
+
['cartesia:zh', 'cartesia', 'zh'],
|
|
55
|
+
])('various providers and languages: %s', (modelStr, expectedModel, expectedLang) => {
|
|
56
|
+
const [model, language] = parseSTTModelString(modelStr);
|
|
57
|
+
expect(model).toBe(expectedModel);
|
|
58
|
+
expect(language).toBe(expectedLang);
|
|
59
|
+
});
|
|
60
|
+
|
|
61
|
+
it('auto model without language', () => {
|
|
62
|
+
const [model, language] = parseSTTModelString('auto');
|
|
63
|
+
expect(model).toBe('auto');
|
|
64
|
+
expect(language).toBeUndefined();
|
|
65
|
+
});
|
|
66
|
+
|
|
67
|
+
it('auto model with language', () => {
|
|
68
|
+
const [model, language] = parseSTTModelString('auto:pt');
|
|
69
|
+
expect(model).toBe('auto');
|
|
70
|
+
expect(language).toBe('pt');
|
|
71
|
+
});
|
|
72
|
+
});
|
|
73
|
+
|
|
74
|
+
describe('normalizeSTTFallback', () => {
|
|
75
|
+
it('single string model', () => {
|
|
76
|
+
const result = normalizeSTTFallback('deepgram/nova-3');
|
|
77
|
+
expect(result).toEqual([{ model: 'deepgram/nova-3' }]);
|
|
78
|
+
});
|
|
79
|
+
|
|
80
|
+
it('single FallbackModel dict', () => {
|
|
81
|
+
const fallback: STTFallbackModel = { model: 'deepgram/nova-3' };
|
|
82
|
+
const result = normalizeSTTFallback(fallback);
|
|
83
|
+
expect(result).toEqual([{ model: 'deepgram/nova-3' }]);
|
|
84
|
+
});
|
|
85
|
+
|
|
86
|
+
it('list of string models', () => {
|
|
87
|
+
const result = normalizeSTTFallback(['deepgram/nova-3', 'cartesia/ink-whisper']);
|
|
88
|
+
expect(result).toEqual([{ model: 'deepgram/nova-3' }, { model: 'cartesia/ink-whisper' }]);
|
|
89
|
+
});
|
|
90
|
+
|
|
91
|
+
it('list of FallbackModel dicts', () => {
|
|
92
|
+
const fallbacks: STTFallbackModel[] = [{ model: 'deepgram/nova-3' }, { model: 'assemblyai' }];
|
|
93
|
+
const result = normalizeSTTFallback(fallbacks);
|
|
94
|
+
expect(result).toEqual([{ model: 'deepgram/nova-3' }, { model: 'assemblyai' }]);
|
|
95
|
+
});
|
|
96
|
+
|
|
97
|
+
it('mixed list of strings and dicts', () => {
|
|
98
|
+
const result = normalizeSTTFallback([
|
|
99
|
+
'deepgram/nova-3',
|
|
100
|
+
{ model: 'cartesia/ink-whisper' } as STTFallbackModel,
|
|
101
|
+
'assemblyai',
|
|
102
|
+
]);
|
|
103
|
+
expect(result).toEqual([
|
|
104
|
+
{ model: 'deepgram/nova-3' },
|
|
105
|
+
{ model: 'cartesia/ink-whisper' },
|
|
106
|
+
{ model: 'assemblyai' },
|
|
107
|
+
]);
|
|
108
|
+
});
|
|
109
|
+
|
|
110
|
+
it('string with language suffix discards language', () => {
|
|
111
|
+
const result = normalizeSTTFallback('deepgram/nova-3:en');
|
|
112
|
+
expect(result).toEqual([{ model: 'deepgram/nova-3' }]);
|
|
113
|
+
});
|
|
114
|
+
|
|
115
|
+
it('FallbackModel with extraKwargs is preserved', () => {
|
|
116
|
+
const fallback: STTFallbackModel = {
|
|
117
|
+
model: 'deepgram/nova-3',
|
|
118
|
+
extraKwargs: { keywords: [['livekit', 1.5]], punctuate: true },
|
|
119
|
+
};
|
|
120
|
+
const result = normalizeSTTFallback(fallback);
|
|
121
|
+
expect(result).toEqual([
|
|
122
|
+
{
|
|
123
|
+
model: 'deepgram/nova-3',
|
|
124
|
+
extraKwargs: { keywords: [['livekit', 1.5]], punctuate: true },
|
|
125
|
+
},
|
|
126
|
+
]);
|
|
127
|
+
});
|
|
128
|
+
|
|
129
|
+
it('list with extraKwargs preserved', () => {
|
|
130
|
+
const result = normalizeSTTFallback([
|
|
131
|
+
{ model: 'deepgram/nova-3', extraKwargs: { punctuate: true } } as STTFallbackModel,
|
|
132
|
+
'cartesia/ink-whisper',
|
|
133
|
+
{ model: 'assemblyai', extraKwargs: { format_turns: true } } as STTFallbackModel,
|
|
134
|
+
]);
|
|
135
|
+
expect(result).toEqual([
|
|
136
|
+
{ model: 'deepgram/nova-3', extraKwargs: { punctuate: true } },
|
|
137
|
+
{ model: 'cartesia/ink-whisper' },
|
|
138
|
+
{ model: 'assemblyai', extraKwargs: { format_turns: true } },
|
|
139
|
+
]);
|
|
140
|
+
});
|
|
141
|
+
|
|
142
|
+
it('empty list returns empty list', () => {
|
|
143
|
+
const result = normalizeSTTFallback([]);
|
|
144
|
+
expect(result).toEqual([]);
|
|
145
|
+
});
|
|
146
|
+
|
|
147
|
+
it('multiple colons in model string splits on last', () => {
|
|
148
|
+
const result = normalizeSTTFallback('some:model:part:fr');
|
|
149
|
+
expect(result).toEqual([{ model: 'some:model:part' }]);
|
|
150
|
+
});
|
|
151
|
+
});
|
|
152
|
+
|
|
153
|
+
describe('STT constructor fallback and connOptions', () => {
|
|
154
|
+
it('fallback not given defaults to undefined', () => {
|
|
155
|
+
const stt = makeStt();
|
|
156
|
+
expect(stt['opts'].fallback).toBeUndefined();
|
|
157
|
+
});
|
|
158
|
+
|
|
159
|
+
it('fallback single string is normalized', () => {
|
|
160
|
+
const stt = makeStt({ fallback: 'cartesia/ink-whisper' });
|
|
161
|
+
expect(stt['opts'].fallback).toEqual([{ model: 'cartesia/ink-whisper' }]);
|
|
162
|
+
});
|
|
163
|
+
|
|
164
|
+
it('fallback list of strings is normalized', () => {
|
|
165
|
+
const stt = makeStt({ fallback: ['deepgram/nova-3', 'assemblyai'] });
|
|
166
|
+
expect(stt['opts'].fallback).toEqual([{ model: 'deepgram/nova-3' }, { model: 'assemblyai' }]);
|
|
167
|
+
});
|
|
168
|
+
|
|
169
|
+
it('fallback single FallbackModel is normalized to list', () => {
|
|
170
|
+
const stt = makeStt({ fallback: { model: 'deepgram/nova-3' } });
|
|
171
|
+
expect(stt['opts'].fallback).toEqual([{ model: 'deepgram/nova-3' }]);
|
|
172
|
+
});
|
|
173
|
+
|
|
174
|
+
it('fallback with extraKwargs is preserved', () => {
|
|
175
|
+
const stt = makeStt({
|
|
176
|
+
fallback: {
|
|
177
|
+
model: 'deepgram/nova-3',
|
|
178
|
+
extraKwargs: { punctuate: true, keywords: [['livekit', 1.5]] },
|
|
179
|
+
},
|
|
180
|
+
});
|
|
181
|
+
expect(stt['opts'].fallback).toEqual([
|
|
182
|
+
{
|
|
183
|
+
model: 'deepgram/nova-3',
|
|
184
|
+
extraKwargs: { punctuate: true, keywords: [['livekit', 1.5]] },
|
|
185
|
+
},
|
|
186
|
+
]);
|
|
187
|
+
});
|
|
188
|
+
|
|
189
|
+
it('fallback mixed list is normalized', () => {
|
|
190
|
+
const stt = makeStt({
|
|
191
|
+
fallback: [
|
|
192
|
+
'deepgram/nova-3',
|
|
193
|
+
{ model: 'cartesia', extraKwargs: { min_volume: 0.5 } },
|
|
194
|
+
'assemblyai',
|
|
195
|
+
],
|
|
196
|
+
});
|
|
197
|
+
expect(stt['opts'].fallback).toEqual([
|
|
198
|
+
{ model: 'deepgram/nova-3' },
|
|
199
|
+
{ model: 'cartesia', extraKwargs: { min_volume: 0.5 } },
|
|
200
|
+
{ model: 'assemblyai' },
|
|
201
|
+
]);
|
|
202
|
+
});
|
|
203
|
+
|
|
204
|
+
it('fallback string with language discards language', () => {
|
|
205
|
+
const stt = makeStt({ fallback: 'deepgram/nova-3:en' });
|
|
206
|
+
expect(stt['opts'].fallback).toEqual([{ model: 'deepgram/nova-3' }]);
|
|
207
|
+
});
|
|
208
|
+
|
|
209
|
+
it('connOptions not given uses default', () => {
|
|
210
|
+
const stt = makeStt();
|
|
211
|
+
expect(stt['opts'].connOptions).toEqual(DEFAULT_API_CONNECT_OPTIONS);
|
|
212
|
+
});
|
|
213
|
+
|
|
214
|
+
it('connOptions custom timeout', () => {
|
|
215
|
+
const custom: APIConnectOptions = { timeoutMs: 30000, maxRetry: 3, retryIntervalMs: 2000 };
|
|
216
|
+
const stt = makeStt({ connOptions: custom });
|
|
217
|
+
expect(stt['opts'].connOptions).toEqual(custom);
|
|
218
|
+
expect(stt['opts'].connOptions!.timeoutMs).toBe(30000);
|
|
219
|
+
});
|
|
220
|
+
|
|
221
|
+
it('connOptions custom maxRetry', () => {
|
|
222
|
+
const custom: APIConnectOptions = { timeoutMs: 10000, maxRetry: 5, retryIntervalMs: 2000 };
|
|
223
|
+
const stt = makeStt({ connOptions: custom });
|
|
224
|
+
expect(stt['opts'].connOptions).toEqual(custom);
|
|
225
|
+
expect(stt['opts'].connOptions!.maxRetry).toBe(5);
|
|
226
|
+
});
|
|
227
|
+
|
|
228
|
+
it('connOptions full custom', () => {
|
|
229
|
+
const custom: APIConnectOptions = { timeoutMs: 60000, maxRetry: 10, retryIntervalMs: 2000 };
|
|
230
|
+
const stt = makeStt({ connOptions: custom });
|
|
231
|
+
expect(stt['opts'].connOptions).toEqual(custom);
|
|
232
|
+
expect(stt['opts'].connOptions!.timeoutMs).toBe(60000);
|
|
233
|
+
expect(stt['opts'].connOptions!.maxRetry).toBe(10);
|
|
234
|
+
expect(stt['opts'].connOptions!.retryIntervalMs).toBe(2000);
|
|
235
|
+
});
|
|
236
|
+
});
|
package/src/inference/stt.ts
CHANGED
|
@@ -42,29 +42,46 @@ export type AssemblyaiModels =
|
|
|
42
42
|
export type ElevenlabsSTTModels = 'elevenlabs/scribe_v2_realtime';
|
|
43
43
|
|
|
44
44
|
export interface CartesiaOptions {
|
|
45
|
-
|
|
46
|
-
|
|
45
|
+
/** Minimum volume threshold. Default: not specified. */
|
|
46
|
+
min_volume?: number;
|
|
47
|
+
/** Maximum silence duration in seconds. Default: not specified. */
|
|
48
|
+
max_silence_duration_secs?: number;
|
|
47
49
|
}
|
|
48
50
|
|
|
49
51
|
export interface DeepgramOptions {
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
52
|
+
/** Enable filler words. Default: true. */
|
|
53
|
+
filler_words?: boolean;
|
|
54
|
+
/** Enable interim results. Default: true. */
|
|
55
|
+
interim_results?: boolean;
|
|
56
|
+
/** Endpointing timeout in milliseconds. Default: 25. */
|
|
57
|
+
endpointing?: number;
|
|
58
|
+
/** Enable punctuation. Default: false. */
|
|
59
|
+
punctuate?: boolean;
|
|
60
|
+
/** Enable smart formatting. */
|
|
54
61
|
smart_format?: boolean;
|
|
62
|
+
/** Keywords with boost values. */
|
|
55
63
|
keywords?: Array<[string, number]>;
|
|
64
|
+
/** Key terms for recognition. */
|
|
56
65
|
keyterms?: string[];
|
|
66
|
+
/** Enable profanity filter. */
|
|
57
67
|
profanity_filter?: boolean;
|
|
68
|
+
/** Convert spoken numbers to numerals. */
|
|
58
69
|
numerals?: boolean;
|
|
70
|
+
/** Opt out of model improvement program. */
|
|
59
71
|
mip_opt_out?: boolean;
|
|
60
72
|
}
|
|
61
73
|
|
|
62
74
|
export interface AssemblyAIOptions {
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
75
|
+
/** Enable turn formatting. Default: false. */
|
|
76
|
+
format_turns?: boolean;
|
|
77
|
+
/** End of turn confidence threshold. Default: 0.01. */
|
|
78
|
+
end_of_turn_confidence_threshold?: number;
|
|
79
|
+
/** Minimum silence duration in milliseconds when confident about end of turn. Default: 0. */
|
|
80
|
+
min_end_of_turn_silence_when_confident?: number;
|
|
81
|
+
/** Maximum turn silence in milliseconds. Default: not specified. */
|
|
82
|
+
max_turn_silence?: number;
|
|
83
|
+
/** Key terms prompt for recognition. Default: not specified. */
|
|
84
|
+
keyterms_prompt?: string[];
|
|
68
85
|
}
|
|
69
86
|
|
|
70
87
|
export type STTLanguages =
|
|
@@ -93,6 +110,43 @@ export type STTOptions<TModel extends STTModels> = TModel extends DeepgramModels
|
|
|
93
110
|
? AssemblyAIOptions
|
|
94
111
|
: Record<string, unknown>;
|
|
95
112
|
|
|
113
|
+
/** A fallback model with optional extra configuration. Extra fields are passed through to the provider. */
|
|
114
|
+
export interface STTFallbackModel {
|
|
115
|
+
/** Model name (e.g. "deepgram/nova-3", "assemblyai/universal-streaming", "cartesia/ink-whisper"). */
|
|
116
|
+
model: string;
|
|
117
|
+
/** Extra configuration for the model. */
|
|
118
|
+
extraKwargs?: Record<string, unknown>;
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
export type STTFallbackModelType = STTFallbackModel | string;
|
|
122
|
+
|
|
123
|
+
/**
 * Split a model string of the form `model[:language]` into its two parts.
 *
 * The split happens at the LAST `:` in the string, so any earlier colons stay
 * part of the model name.
 *
 * @param model - Model string, e.g. `"deepgram/nova-3"` or `"deepgram/nova-3:en"`.
 * @returns A `[model, language]` tuple; `language` is `undefined` when the
 *   string contains no `:` at all.
 */
export function parseSTTModelString(model: string): [string, string | undefined] {
  const separatorAt = model.lastIndexOf(':');
  return separatorAt === -1
    ? [model, undefined]
    : [model.slice(0, separatorAt), model.slice(separatorAt + 1)];
}
|
|
131
|
+
|
|
132
|
+
/**
 * Coerce a fallback specification into a uniform `STTFallbackModel[]`.
 *
 * Accepts either one entry or an array of entries:
 * - a string entry is passed through `parseSTTModelString` and only the model
 *   part is kept, so any `:language` suffix is discarded;
 * - an object entry is returned as the same reference, so fields such as
 *   `extraKwargs` are carried over untouched.
 *
 * @param fallback - A single fallback (string or object) or an array of them.
 * @returns A new array with one `STTFallbackModel` per input entry, in input order.
 */
export function normalizeSTTFallback(
  fallback: STTFallbackModelType | STTFallbackModelType[],
): STTFallbackModel[] {
  const entries = Array.isArray(fallback) ? fallback : [fallback];
  return entries.map(
    (entry): STTFallbackModel =>
      typeof entry === 'string' ? { model: parseSTTModelString(entry)[0] } : entry,
  );
}
|
|
149
|
+
|
|
96
150
|
export type STTEncoding = 'pcm_s16le';
|
|
97
151
|
|
|
98
152
|
const DEFAULT_ENCODING: STTEncoding = 'pcm_s16le';
|
|
@@ -109,6 +163,8 @@ export interface InferenceSTTOptions<TModel extends STTModels> {
|
|
|
109
163
|
apiKey: string;
|
|
110
164
|
apiSecret: string;
|
|
111
165
|
modelOptions: STTOptions<TModel>;
|
|
166
|
+
fallback?: STTFallbackModel[];
|
|
167
|
+
connOptions?: APIConnectOptions;
|
|
112
168
|
}
|
|
113
169
|
|
|
114
170
|
/**
|
|
@@ -129,6 +185,8 @@ export class STT<TModel extends STTModels> extends BaseSTT {
|
|
|
129
185
|
apiKey?: string;
|
|
130
186
|
apiSecret?: string;
|
|
131
187
|
modelOptions?: STTOptions<TModel>;
|
|
188
|
+
fallback?: STTFallbackModelType | STTFallbackModelType[];
|
|
189
|
+
connOptions?: APIConnectOptions;
|
|
132
190
|
}) {
|
|
133
191
|
super({ streaming: true, interimResults: true, alignedTranscript: 'word' });
|
|
134
192
|
|
|
@@ -141,6 +199,8 @@ export class STT<TModel extends STTModels> extends BaseSTT {
|
|
|
141
199
|
apiKey,
|
|
142
200
|
apiSecret,
|
|
143
201
|
modelOptions = {} as STTOptions<TModel>,
|
|
202
|
+
fallback,
|
|
203
|
+
connOptions,
|
|
144
204
|
} = opts || {};
|
|
145
205
|
|
|
146
206
|
const lkBaseURL = baseURL || process.env.LIVEKIT_INFERENCE_URL || DEFAULT_BASE_URL;
|
|
@@ -155,6 +215,8 @@ export class STT<TModel extends STTModels> extends BaseSTT {
|
|
|
155
215
|
throw new Error('apiSecret is required: pass apiSecret or set LIVEKIT_API_SECRET');
|
|
156
216
|
}
|
|
157
217
|
|
|
218
|
+
const normalizedFallback = fallback ? normalizeSTTFallback(fallback) : undefined;
|
|
219
|
+
|
|
158
220
|
this.opts = {
|
|
159
221
|
model,
|
|
160
222
|
language,
|
|
@@ -164,6 +226,8 @@ export class STT<TModel extends STTModels> extends BaseSTT {
|
|
|
164
226
|
apiKey: lkApiKey,
|
|
165
227
|
apiSecret: lkApiSecret,
|
|
166
228
|
modelOptions,
|
|
229
|
+
fallback: normalizedFallback,
|
|
230
|
+
connOptions: connOptions ?? DEFAULT_API_CONNECT_OPTIONS,
|
|
167
231
|
};
|
|
168
232
|
}
|
|
169
233
|
|
|
@@ -172,11 +236,8 @@ export class STT<TModel extends STTModels> extends BaseSTT {
|
|
|
172
236
|
}
|
|
173
237
|
|
|
174
238
|
static fromModelString(modelString: string): STT<AnyString> {
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
return new STT({ model, language });
|
|
178
|
-
}
|
|
179
|
-
return new STT({ model: modelString });
|
|
239
|
+
const [model, language] = parseSTTModelString(modelString);
|
|
240
|
+
return new STT({ model, language });
|
|
180
241
|
}
|
|
181
242
|
|
|
182
243
|
protected async _recognize(_: AudioBuffer): Promise<SpeechEvent> {
|
|
@@ -195,7 +256,8 @@ export class STT<TModel extends STTModels> extends BaseSTT {
|
|
|
195
256
|
language?: STTLanguages | string;
|
|
196
257
|
connOptions?: APIConnectOptions;
|
|
197
258
|
}): SpeechStream<TModel> {
|
|
198
|
-
const { language, connOptions = DEFAULT_API_CONNECT_OPTIONS } =
|
|
259
|
+
const { language, connOptions = this.opts.connOptions ?? DEFAULT_API_CONNECT_OPTIONS } =
|
|
260
|
+
options || {};
|
|
199
261
|
const streamOpts = {
|
|
200
262
|
...this.opts,
|
|
201
263
|
language: language ?? this.opts.language,
|
|
@@ -224,6 +286,22 @@ export class STT<TModel extends STTModels> extends BaseSTT {
|
|
|
224
286
|
(params.settings as Record<string, unknown>).language = this.opts.language;
|
|
225
287
|
}
|
|
226
288
|
|
|
289
|
+
if (this.opts.fallback?.length) {
|
|
290
|
+
params.fallback = {
|
|
291
|
+
models: this.opts.fallback.map((m) => ({
|
|
292
|
+
model: m.model,
|
|
293
|
+
extra: m.extraKwargs ?? {},
|
|
294
|
+
})),
|
|
295
|
+
};
|
|
296
|
+
}
|
|
297
|
+
|
|
298
|
+
if (this.opts.connOptions) {
|
|
299
|
+
params.connection = {
|
|
300
|
+
timeout: this.opts.connOptions.timeoutMs / 1000,
|
|
301
|
+
retries: this.opts.connOptions.maxRetry,
|
|
302
|
+
};
|
|
303
|
+
}
|
|
304
|
+
|
|
227
305
|
let baseURL = this.opts.baseURL;
|
|
228
306
|
if (baseURL.startsWith('http://') || baseURL.startsWith('https://')) {
|
|
229
307
|
baseURL = baseURL.replace('http', 'ws');
|
|
package/src/inference/tts.test.ts
ADDED
|
@@ -0,0 +1,255 @@
|
|
|
1
|
+
// SPDX-FileCopyrightText: 2025 LiveKit, Inc.
|
|
2
|
+
//
|
|
3
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
import { beforeAll, describe, expect, it } from 'vitest';
|
|
5
|
+
import { initializeLogger } from '../log.js';
|
|
6
|
+
import { type APIConnectOptions, DEFAULT_API_CONNECT_OPTIONS } from '../types.js';
|
|
7
|
+
import { TTS, type TTSFallbackModel, normalizeTTSFallback, parseTTSModelString } from './tts.js';
|
|
8
|
+
|
|
9
|
+
beforeAll(() => {
|
|
10
|
+
initializeLogger({ level: 'silent', pretty: false });
|
|
11
|
+
});
|
|
12
|
+
|
|
13
|
+
/**
 * Test helper: constructs a TTS instance with placeholder model, credentials
 * and base URL already filled in.
 *
 * @param overrides - Constructor options merged on top of the placeholders;
 *   a key present here replaces the placeholder with the same name.
 * @returns The newly constructed TTS instance.
 */
function makeTts(overrides: Record<string, unknown> = {}) {
  // The spread comes last so caller-supplied keys win over the placeholders.
  return new TTS({
    model: 'cartesia/sonic' as const,
    apiKey: 'test-key',
    apiSecret: 'test-secret',
    baseURL: 'https://example.livekit.cloud',
    ...overrides,
  });
}
|
|
23
|
+
|
|
24
|
+
describe('parseTTSModelString', () => {
|
|
25
|
+
it('simple model without voice', () => {
|
|
26
|
+
const [model, voice] = parseTTSModelString('cartesia');
|
|
27
|
+
expect(model).toBe('cartesia');
|
|
28
|
+
expect(voice).toBeUndefined();
|
|
29
|
+
});
|
|
30
|
+
|
|
31
|
+
it('model with voice suffix', () => {
|
|
32
|
+
const [model, voice] = parseTTSModelString('cartesia:my-voice-id');
|
|
33
|
+
expect(model).toBe('cartesia');
|
|
34
|
+
expect(voice).toBe('my-voice-id');
|
|
35
|
+
});
|
|
36
|
+
|
|
37
|
+
it('provider/model format without voice', () => {
|
|
38
|
+
const [model, voice] = parseTTSModelString('cartesia/sonic');
|
|
39
|
+
expect(model).toBe('cartesia/sonic');
|
|
40
|
+
expect(voice).toBeUndefined();
|
|
41
|
+
});
|
|
42
|
+
|
|
43
|
+
it('provider/model format with voice', () => {
|
|
44
|
+
const [model, voice] = parseTTSModelString('cartesia/sonic:my-voice-id');
|
|
45
|
+
expect(model).toBe('cartesia/sonic');
|
|
46
|
+
expect(voice).toBe('my-voice-id');
|
|
47
|
+
});
|
|
48
|
+
|
|
49
|
+
it.each([
|
|
50
|
+
['elevenlabs/eleven_flash_v2:voice123', 'elevenlabs/eleven_flash_v2', 'voice123'],
|
|
51
|
+
['rime:speaker-a', 'rime', 'speaker-a'],
|
|
52
|
+
['rime/mist:narrator', 'rime/mist', 'narrator'],
|
|
53
|
+
['inworld/inworld-tts-1:character', 'inworld/inworld-tts-1', 'character'],
|
|
54
|
+
['cartesia/sonic-turbo:deep-voice', 'cartesia/sonic-turbo', 'deep-voice'],
|
|
55
|
+
])('various providers and voices: %s', (modelStr, expectedModel, expectedVoice) => {
|
|
56
|
+
const [model, voice] = parseTTSModelString(modelStr);
|
|
57
|
+
expect(model).toBe(expectedModel);
|
|
58
|
+
expect(voice).toBe(expectedVoice);
|
|
59
|
+
});
|
|
60
|
+
|
|
61
|
+
it('empty voice after colon', () => {
|
|
62
|
+
const [model, voice] = parseTTSModelString('cartesia/sonic:');
|
|
63
|
+
expect(model).toBe('cartesia/sonic');
|
|
64
|
+
expect(voice).toBe('');
|
|
65
|
+
});
|
|
66
|
+
});
|
|
67
|
+
|
|
68
|
+
describe('normalizeTTSFallback', () => {
|
|
69
|
+
it('single string model', () => {
|
|
70
|
+
const result = normalizeTTSFallback('cartesia/sonic');
|
|
71
|
+
expect(result).toEqual([{ model: 'cartesia/sonic', voice: '' }]);
|
|
72
|
+
});
|
|
73
|
+
|
|
74
|
+
it('single string model with voice', () => {
|
|
75
|
+
const result = normalizeTTSFallback('cartesia/sonic:my-voice');
|
|
76
|
+
expect(result).toEqual([{ model: 'cartesia/sonic', voice: 'my-voice' }]);
|
|
77
|
+
});
|
|
78
|
+
|
|
79
|
+
it('single FallbackModel dict', () => {
|
|
80
|
+
const fallback: TTSFallbackModel = { model: 'cartesia/sonic', voice: 'narrator' };
|
|
81
|
+
const result = normalizeTTSFallback(fallback);
|
|
82
|
+
expect(result).toEqual([{ model: 'cartesia/sonic', voice: 'narrator' }]);
|
|
83
|
+
});
|
|
84
|
+
|
|
85
|
+
it('list of string models', () => {
|
|
86
|
+
const result = normalizeTTSFallback(['cartesia/sonic', 'elevenlabs/eleven_flash_v2']);
|
|
87
|
+
expect(result).toEqual([
|
|
88
|
+
{ model: 'cartesia/sonic', voice: '' },
|
|
89
|
+
{ model: 'elevenlabs/eleven_flash_v2', voice: '' },
|
|
90
|
+
]);
|
|
91
|
+
});
|
|
92
|
+
|
|
93
|
+
it('list of string models with voices', () => {
|
|
94
|
+
const result = normalizeTTSFallback(['cartesia/sonic:voice1', 'elevenlabs:voice2']);
|
|
95
|
+
expect(result).toEqual([
|
|
96
|
+
{ model: 'cartesia/sonic', voice: 'voice1' },
|
|
97
|
+
{ model: 'elevenlabs', voice: 'voice2' },
|
|
98
|
+
]);
|
|
99
|
+
});
|
|
100
|
+
|
|
101
|
+
it('list of FallbackModel dicts', () => {
|
|
102
|
+
const fallbacks: TTSFallbackModel[] = [
|
|
103
|
+
{ model: 'cartesia/sonic', voice: 'narrator' },
|
|
104
|
+
{ model: 'elevenlabs', voice: '' },
|
|
105
|
+
];
|
|
106
|
+
const result = normalizeTTSFallback(fallbacks);
|
|
107
|
+
expect(result).toEqual([
|
|
108
|
+
{ model: 'cartesia/sonic', voice: 'narrator' },
|
|
109
|
+
{ model: 'elevenlabs', voice: '' },
|
|
110
|
+
]);
|
|
111
|
+
});
|
|
112
|
+
|
|
113
|
+
it('mixed list of strings and dicts', () => {
|
|
114
|
+
const result = normalizeTTSFallback([
|
|
115
|
+
'cartesia/sonic:voice1',
|
|
116
|
+
{ model: 'elevenlabs/eleven_flash_v2', voice: 'custom' } as TTSFallbackModel,
|
|
117
|
+
'rime/mist',
|
|
118
|
+
]);
|
|
119
|
+
expect(result).toEqual([
|
|
120
|
+
{ model: 'cartesia/sonic', voice: 'voice1' },
|
|
121
|
+
{ model: 'elevenlabs/eleven_flash_v2', voice: 'custom' },
|
|
122
|
+
{ model: 'rime/mist', voice: '' },
|
|
123
|
+
]);
|
|
124
|
+
});
|
|
125
|
+
|
|
126
|
+
it('FallbackModel with extraKwargs is preserved', () => {
|
|
127
|
+
const fallback: TTSFallbackModel = {
|
|
128
|
+
model: 'cartesia/sonic',
|
|
129
|
+
voice: 'narrator',
|
|
130
|
+
extraKwargs: { duration: 30.0, speed: 'fast' },
|
|
131
|
+
};
|
|
132
|
+
const result = normalizeTTSFallback(fallback);
|
|
133
|
+
expect(result).toEqual([
|
|
134
|
+
{
|
|
135
|
+
model: 'cartesia/sonic',
|
|
136
|
+
voice: 'narrator',
|
|
137
|
+
extraKwargs: { duration: 30.0, speed: 'fast' },
|
|
138
|
+
},
|
|
139
|
+
]);
|
|
140
|
+
});
|
|
141
|
+
|
|
142
|
+
it('list with extraKwargs preserved', () => {
|
|
143
|
+
const result = normalizeTTSFallback([
|
|
144
|
+
{ model: 'cartesia/sonic', voice: 'v1', extraKwargs: { speed: 'slow' } } as TTSFallbackModel,
|
|
145
|
+
'elevenlabs:voice2',
|
|
146
|
+
{ model: 'rime/mist', voice: '', extraKwargs: { custom: true } } as TTSFallbackModel,
|
|
147
|
+
]);
|
|
148
|
+
expect(result).toEqual([
|
|
149
|
+
{ model: 'cartesia/sonic', voice: 'v1', extraKwargs: { speed: 'slow' } },
|
|
150
|
+
{ model: 'elevenlabs', voice: 'voice2' },
|
|
151
|
+
{ model: 'rime/mist', voice: '', extraKwargs: { custom: true } },
|
|
152
|
+
]);
|
|
153
|
+
});
|
|
154
|
+
|
|
155
|
+
it('empty list returns empty list', () => {
|
|
156
|
+
const result = normalizeTTSFallback([]);
|
|
157
|
+
expect(result).toEqual([]);
|
|
158
|
+
});
|
|
159
|
+
|
|
160
|
+
it('FallbackModel with empty voice', () => {
|
|
161
|
+
const fallback: TTSFallbackModel = { model: 'cartesia/sonic', voice: '' };
|
|
162
|
+
const result = normalizeTTSFallback(fallback);
|
|
163
|
+
expect(result).toEqual([{ model: 'cartesia/sonic', voice: '' }]);
|
|
164
|
+
});
|
|
165
|
+
});
|
|
166
|
+
|
|
167
|
+
describe('TTS constructor fallback and connOptions', () => {
|
|
168
|
+
it('fallback not given defaults to undefined', () => {
|
|
169
|
+
const tts = makeTts();
|
|
170
|
+
expect(tts['opts'].fallback).toBeUndefined();
|
|
171
|
+
});
|
|
172
|
+
|
|
173
|
+
it('fallback single string is normalized', () => {
|
|
174
|
+
const tts = makeTts({ fallback: 'elevenlabs/eleven_flash_v2' });
|
|
175
|
+
expect(tts['opts'].fallback).toEqual([{ model: 'elevenlabs/eleven_flash_v2', voice: '' }]);
|
|
176
|
+
});
|
|
177
|
+
|
|
178
|
+
it('fallback single string with voice is normalized', () => {
|
|
179
|
+
const tts = makeTts({ fallback: 'cartesia/sonic:my-voice' });
|
|
180
|
+
expect(tts['opts'].fallback).toEqual([{ model: 'cartesia/sonic', voice: 'my-voice' }]);
|
|
181
|
+
});
|
|
182
|
+
|
|
183
|
+
it('fallback list of strings is normalized', () => {
|
|
184
|
+
const tts = makeTts({ fallback: ['cartesia/sonic', 'elevenlabs'] });
|
|
185
|
+
expect(tts['opts'].fallback).toEqual([
|
|
186
|
+
{ model: 'cartesia/sonic', voice: '' },
|
|
187
|
+
{ model: 'elevenlabs', voice: '' },
|
|
188
|
+
]);
|
|
189
|
+
});
|
|
190
|
+
|
|
191
|
+
it('fallback single FallbackModel is normalized to list', () => {
|
|
192
|
+
const tts = makeTts({ fallback: { model: 'cartesia/sonic', voice: 'narrator' } });
|
|
193
|
+
expect(tts['opts'].fallback).toEqual([{ model: 'cartesia/sonic', voice: 'narrator' }]);
|
|
194
|
+
});
|
|
195
|
+
|
|
196
|
+
it('fallback with extraKwargs is preserved', () => {
|
|
197
|
+
const tts = makeTts({
|
|
198
|
+
fallback: {
|
|
199
|
+
model: 'cartesia/sonic',
|
|
200
|
+
voice: 'narrator',
|
|
201
|
+
extraKwargs: { duration: 30.0, speed: 'fast' },
|
|
202
|
+
},
|
|
203
|
+
});
|
|
204
|
+
expect(tts['opts'].fallback).toEqual([
|
|
205
|
+
{
|
|
206
|
+
model: 'cartesia/sonic',
|
|
207
|
+
voice: 'narrator',
|
|
208
|
+
extraKwargs: { duration: 30.0, speed: 'fast' },
|
|
209
|
+
},
|
|
210
|
+
]);
|
|
211
|
+
});
|
|
212
|
+
|
|
213
|
+
it('fallback mixed list is normalized', () => {
|
|
214
|
+
const tts = makeTts({
|
|
215
|
+
fallback: [
|
|
216
|
+
'cartesia/sonic:voice1',
|
|
217
|
+
{ model: 'elevenlabs', voice: 'custom', extraKwargs: { speed: 'slow' } },
|
|
218
|
+
'rime/mist',
|
|
219
|
+
],
|
|
220
|
+
});
|
|
221
|
+
expect(tts['opts'].fallback).toEqual([
|
|
222
|
+
{ model: 'cartesia/sonic', voice: 'voice1' },
|
|
223
|
+
{ model: 'elevenlabs', voice: 'custom', extraKwargs: { speed: 'slow' } },
|
|
224
|
+
{ model: 'rime/mist', voice: '' },
|
|
225
|
+
]);
|
|
226
|
+
});
|
|
227
|
+
|
|
228
|
+
it('connOptions not given uses default', () => {
|
|
229
|
+
const tts = makeTts();
|
|
230
|
+
expect(tts['opts'].connOptions).toEqual(DEFAULT_API_CONNECT_OPTIONS);
|
|
231
|
+
});
|
|
232
|
+
|
|
233
|
+
it('connOptions custom timeout', () => {
|
|
234
|
+
const custom: APIConnectOptions = { timeoutMs: 30000, maxRetry: 3, retryIntervalMs: 2000 };
|
|
235
|
+
const tts = makeTts({ connOptions: custom });
|
|
236
|
+
expect(tts['opts'].connOptions).toEqual(custom);
|
|
237
|
+
expect(tts['opts'].connOptions!.timeoutMs).toBe(30000);
|
|
238
|
+
});
|
|
239
|
+
|
|
240
|
+
it('connOptions custom maxRetry', () => {
|
|
241
|
+
const custom: APIConnectOptions = { timeoutMs: 10000, maxRetry: 5, retryIntervalMs: 2000 };
|
|
242
|
+
const tts = makeTts({ connOptions: custom });
|
|
243
|
+
expect(tts['opts'].connOptions).toEqual(custom);
|
|
244
|
+
expect(tts['opts'].connOptions!.maxRetry).toBe(5);
|
|
245
|
+
});
|
|
246
|
+
|
|
247
|
+
it('connOptions full custom', () => {
|
|
248
|
+
const custom: APIConnectOptions = { timeoutMs: 60000, maxRetry: 10, retryIntervalMs: 2000 };
|
|
249
|
+
const tts = makeTts({ connOptions: custom });
|
|
250
|
+
expect(tts['opts'].connOptions).toEqual(custom);
|
|
251
|
+
expect(tts['opts'].connOptions!.timeoutMs).toBe(60000);
|
|
252
|
+
expect(tts['opts'].connOptions!.maxRetry).toBe(10);
|
|
253
|
+
expect(tts['opts'].connOptions!.retryIntervalMs).toBe(2000);
|
|
254
|
+
});
|
|
255
|
+
});
|