@livekit/agents 1.0.42 → 1.0.44
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/inference/index.cjs +8 -0
- package/dist/inference/index.cjs.map +1 -1
- package/dist/inference/index.d.cts +2 -2
- package/dist/inference/index.d.ts +2 -2
- package/dist/inference/index.d.ts.map +1 -1
- package/dist/inference/index.js +8 -0
- package/dist/inference/index.js.map +1 -1
- package/dist/inference/stt.cjs +70 -12
- package/dist/inference/stt.cjs.map +1 -1
- package/dist/inference/stt.d.cts +34 -1
- package/dist/inference/stt.d.ts +34 -1
- package/dist/inference/stt.d.ts.map +1 -1
- package/dist/inference/stt.js +67 -11
- package/dist/inference/stt.js.map +1 -1
- package/dist/inference/stt.test.cjs +204 -0
- package/dist/inference/stt.test.cjs.map +1 -0
- package/dist/inference/stt.test.js +203 -0
- package/dist/inference/stt.test.js.map +1 -0
- package/dist/inference/tts.cjs +52 -10
- package/dist/inference/tts.cjs.map +1 -1
- package/dist/inference/tts.d.cts +22 -0
- package/dist/inference/tts.d.ts +22 -0
- package/dist/inference/tts.d.ts.map +1 -1
- package/dist/inference/tts.js +49 -9
- package/dist/inference/tts.js.map +1 -1
- package/dist/inference/tts.test.cjs +223 -0
- package/dist/inference/tts.test.cjs.map +1 -0
- package/dist/inference/tts.test.js +222 -0
- package/dist/inference/tts.test.js.map +1 -0
- package/dist/ipc/inference_proc_lazy_main.cjs +13 -1
- package/dist/ipc/inference_proc_lazy_main.cjs.map +1 -1
- package/dist/ipc/inference_proc_lazy_main.js +13 -1
- package/dist/ipc/inference_proc_lazy_main.js.map +1 -1
- package/dist/ipc/job_proc_lazy_main.cjs +8 -1
- package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
- package/dist/ipc/job_proc_lazy_main.js +9 -2
- package/dist/ipc/job_proc_lazy_main.js.map +1 -1
- package/dist/ipc/supervised_proc.cjs.map +1 -1
- package/dist/ipc/supervised_proc.d.cts +7 -0
- package/dist/ipc/supervised_proc.d.ts +7 -0
- package/dist/ipc/supervised_proc.d.ts.map +1 -1
- package/dist/ipc/supervised_proc.js.map +1 -1
- package/dist/stt/stt.cjs +4 -0
- package/dist/stt/stt.cjs.map +1 -1
- package/dist/stt/stt.d.cts +7 -0
- package/dist/stt/stt.d.ts +7 -0
- package/dist/stt/stt.d.ts.map +1 -1
- package/dist/stt/stt.js +4 -0
- package/dist/stt/stt.js.map +1 -1
- package/dist/transcription.cjs.map +1 -1
- package/dist/transcription.d.cts +6 -0
- package/dist/transcription.d.ts +6 -0
- package/dist/transcription.d.ts.map +1 -1
- package/dist/transcription.js.map +1 -1
- package/dist/utils.cjs +10 -2
- package/dist/utils.cjs.map +1 -1
- package/dist/utils.d.ts.map +1 -1
- package/dist/utils.js +10 -2
- package/dist/utils.js.map +1 -1
- package/dist/vad.cjs +1 -1
- package/dist/vad.cjs.map +1 -1
- package/dist/vad.d.cts +3 -2
- package/dist/vad.d.ts +3 -2
- package/dist/vad.d.ts.map +1 -1
- package/dist/vad.js +1 -1
- package/dist/vad.js.map +1 -1
- package/dist/voice/agent_activity.cjs +1 -2
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.js +1 -2
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/audio_recognition.cjs.map +1 -1
- package/dist/voice/audio_recognition.d.cts +14 -0
- package/dist/voice/audio_recognition.d.ts +14 -0
- package/dist/voice/audio_recognition.d.ts.map +1 -1
- package/dist/voice/audio_recognition.js.map +1 -1
- package/package.json +1 -1
- package/src/inference/index.ts +8 -0
- package/src/inference/stt.test.ts +236 -0
- package/src/inference/stt.ts +116 -20
- package/src/inference/tts.test.ts +255 -0
- package/src/inference/tts.ts +81 -15
- package/src/ipc/inference_proc_lazy_main.ts +13 -1
- package/src/ipc/job_proc_lazy_main.ts +18 -2
- package/src/ipc/supervised_proc.ts +7 -0
- package/src/stt/stt.ts +12 -0
- package/src/transcription.ts +6 -0
- package/src/utils.ts +10 -2
- package/src/vad.ts +4 -3
- package/src/voice/agent_activity.ts +1 -1
- package/src/voice/audio_recognition.ts +14 -0
|
@@ -0,0 +1,236 @@
|
|
|
1
|
+
// SPDX-FileCopyrightText: 2025 LiveKit, Inc.
|
|
2
|
+
//
|
|
3
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
import { beforeAll, describe, expect, it } from 'vitest';
|
|
5
|
+
import { initializeLogger } from '../log.js';
|
|
6
|
+
import { type APIConnectOptions, DEFAULT_API_CONNECT_OPTIONS } from '../types.js';
|
|
7
|
+
import { STT, type STTFallbackModel, normalizeSTTFallback, parseSTTModelString } from './stt.js';
|
|
8
|
+
|
|
9
|
+
beforeAll(() => {
|
|
10
|
+
initializeLogger({ level: 'silent', pretty: false });
|
|
11
|
+
});
|
|
12
|
+
|
|
13
|
+
/** Helper to create STT with required credentials. */
|
|
14
|
+
function makeStt(overrides: Record<string, unknown> = {}) {
|
|
15
|
+
const defaults = {
|
|
16
|
+
model: 'deepgram' as const,
|
|
17
|
+
apiKey: 'test-key',
|
|
18
|
+
apiSecret: 'test-secret',
|
|
19
|
+
baseURL: 'https://example.livekit.cloud',
|
|
20
|
+
};
|
|
21
|
+
return new STT({ ...defaults, ...overrides });
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
describe('parseSTTModelString', () => {
|
|
25
|
+
it('simple model without language', () => {
|
|
26
|
+
const [model, language] = parseSTTModelString('deepgram');
|
|
27
|
+
expect(model).toBe('deepgram');
|
|
28
|
+
expect(language).toBeUndefined();
|
|
29
|
+
});
|
|
30
|
+
|
|
31
|
+
it('model with language suffix', () => {
|
|
32
|
+
const [model, language] = parseSTTModelString('deepgram:en');
|
|
33
|
+
expect(model).toBe('deepgram');
|
|
34
|
+
expect(language).toBe('en');
|
|
35
|
+
});
|
|
36
|
+
|
|
37
|
+
it('provider/model format without language', () => {
|
|
38
|
+
const [model, language] = parseSTTModelString('deepgram/nova-3');
|
|
39
|
+
expect(model).toBe('deepgram/nova-3');
|
|
40
|
+
expect(language).toBeUndefined();
|
|
41
|
+
});
|
|
42
|
+
|
|
43
|
+
it('provider/model format with language', () => {
|
|
44
|
+
const [model, language] = parseSTTModelString('deepgram/nova-3:en');
|
|
45
|
+
expect(model).toBe('deepgram/nova-3');
|
|
46
|
+
expect(language).toBe('en');
|
|
47
|
+
});
|
|
48
|
+
|
|
49
|
+
it.each([
|
|
50
|
+
['cartesia/ink-whisper:de', 'cartesia/ink-whisper', 'de'],
|
|
51
|
+
['assemblyai:es', 'assemblyai', 'es'],
|
|
52
|
+
['deepgram/nova-2-medical:ja', 'deepgram/nova-2-medical', 'ja'],
|
|
53
|
+
['deepgram/nova-3:multi', 'deepgram/nova-3', 'multi'],
|
|
54
|
+
['cartesia:zh', 'cartesia', 'zh'],
|
|
55
|
+
])('various providers and languages: %s', (modelStr, expectedModel, expectedLang) => {
|
|
56
|
+
const [model, language] = parseSTTModelString(modelStr);
|
|
57
|
+
expect(model).toBe(expectedModel);
|
|
58
|
+
expect(language).toBe(expectedLang);
|
|
59
|
+
});
|
|
60
|
+
|
|
61
|
+
it('auto model without language', () => {
|
|
62
|
+
const [model, language] = parseSTTModelString('auto');
|
|
63
|
+
expect(model).toBe('auto');
|
|
64
|
+
expect(language).toBeUndefined();
|
|
65
|
+
});
|
|
66
|
+
|
|
67
|
+
it('auto model with language', () => {
|
|
68
|
+
const [model, language] = parseSTTModelString('auto:pt');
|
|
69
|
+
expect(model).toBe('auto');
|
|
70
|
+
expect(language).toBe('pt');
|
|
71
|
+
});
|
|
72
|
+
});
|
|
73
|
+
|
|
74
|
+
describe('normalizeSTTFallback', () => {
|
|
75
|
+
it('single string model', () => {
|
|
76
|
+
const result = normalizeSTTFallback('deepgram/nova-3');
|
|
77
|
+
expect(result).toEqual([{ model: 'deepgram/nova-3' }]);
|
|
78
|
+
});
|
|
79
|
+
|
|
80
|
+
it('single FallbackModel dict', () => {
|
|
81
|
+
const fallback: STTFallbackModel = { model: 'deepgram/nova-3' };
|
|
82
|
+
const result = normalizeSTTFallback(fallback);
|
|
83
|
+
expect(result).toEqual([{ model: 'deepgram/nova-3' }]);
|
|
84
|
+
});
|
|
85
|
+
|
|
86
|
+
it('list of string models', () => {
|
|
87
|
+
const result = normalizeSTTFallback(['deepgram/nova-3', 'cartesia/ink-whisper']);
|
|
88
|
+
expect(result).toEqual([{ model: 'deepgram/nova-3' }, { model: 'cartesia/ink-whisper' }]);
|
|
89
|
+
});
|
|
90
|
+
|
|
91
|
+
it('list of FallbackModel dicts', () => {
|
|
92
|
+
const fallbacks: STTFallbackModel[] = [{ model: 'deepgram/nova-3' }, { model: 'assemblyai' }];
|
|
93
|
+
const result = normalizeSTTFallback(fallbacks);
|
|
94
|
+
expect(result).toEqual([{ model: 'deepgram/nova-3' }, { model: 'assemblyai' }]);
|
|
95
|
+
});
|
|
96
|
+
|
|
97
|
+
it('mixed list of strings and dicts', () => {
|
|
98
|
+
const result = normalizeSTTFallback([
|
|
99
|
+
'deepgram/nova-3',
|
|
100
|
+
{ model: 'cartesia/ink-whisper' } as STTFallbackModel,
|
|
101
|
+
'assemblyai',
|
|
102
|
+
]);
|
|
103
|
+
expect(result).toEqual([
|
|
104
|
+
{ model: 'deepgram/nova-3' },
|
|
105
|
+
{ model: 'cartesia/ink-whisper' },
|
|
106
|
+
{ model: 'assemblyai' },
|
|
107
|
+
]);
|
|
108
|
+
});
|
|
109
|
+
|
|
110
|
+
it('string with language suffix discards language', () => {
|
|
111
|
+
const result = normalizeSTTFallback('deepgram/nova-3:en');
|
|
112
|
+
expect(result).toEqual([{ model: 'deepgram/nova-3' }]);
|
|
113
|
+
});
|
|
114
|
+
|
|
115
|
+
it('FallbackModel with extraKwargs is preserved', () => {
|
|
116
|
+
const fallback: STTFallbackModel = {
|
|
117
|
+
model: 'deepgram/nova-3',
|
|
118
|
+
extraKwargs: { keywords: [['livekit', 1.5]], punctuate: true },
|
|
119
|
+
};
|
|
120
|
+
const result = normalizeSTTFallback(fallback);
|
|
121
|
+
expect(result).toEqual([
|
|
122
|
+
{
|
|
123
|
+
model: 'deepgram/nova-3',
|
|
124
|
+
extraKwargs: { keywords: [['livekit', 1.5]], punctuate: true },
|
|
125
|
+
},
|
|
126
|
+
]);
|
|
127
|
+
});
|
|
128
|
+
|
|
129
|
+
it('list with extraKwargs preserved', () => {
|
|
130
|
+
const result = normalizeSTTFallback([
|
|
131
|
+
{ model: 'deepgram/nova-3', extraKwargs: { punctuate: true } } as STTFallbackModel,
|
|
132
|
+
'cartesia/ink-whisper',
|
|
133
|
+
{ model: 'assemblyai', extraKwargs: { format_turns: true } } as STTFallbackModel,
|
|
134
|
+
]);
|
|
135
|
+
expect(result).toEqual([
|
|
136
|
+
{ model: 'deepgram/nova-3', extraKwargs: { punctuate: true } },
|
|
137
|
+
{ model: 'cartesia/ink-whisper' },
|
|
138
|
+
{ model: 'assemblyai', extraKwargs: { format_turns: true } },
|
|
139
|
+
]);
|
|
140
|
+
});
|
|
141
|
+
|
|
142
|
+
it('empty list returns empty list', () => {
|
|
143
|
+
const result = normalizeSTTFallback([]);
|
|
144
|
+
expect(result).toEqual([]);
|
|
145
|
+
});
|
|
146
|
+
|
|
147
|
+
it('multiple colons in model string splits on last', () => {
|
|
148
|
+
const result = normalizeSTTFallback('some:model:part:fr');
|
|
149
|
+
expect(result).toEqual([{ model: 'some:model:part' }]);
|
|
150
|
+
});
|
|
151
|
+
});
|
|
152
|
+
|
|
153
|
+
describe('STT constructor fallback and connOptions', () => {
|
|
154
|
+
it('fallback not given defaults to undefined', () => {
|
|
155
|
+
const stt = makeStt();
|
|
156
|
+
expect(stt['opts'].fallback).toBeUndefined();
|
|
157
|
+
});
|
|
158
|
+
|
|
159
|
+
it('fallback single string is normalized', () => {
|
|
160
|
+
const stt = makeStt({ fallback: 'cartesia/ink-whisper' });
|
|
161
|
+
expect(stt['opts'].fallback).toEqual([{ model: 'cartesia/ink-whisper' }]);
|
|
162
|
+
});
|
|
163
|
+
|
|
164
|
+
it('fallback list of strings is normalized', () => {
|
|
165
|
+
const stt = makeStt({ fallback: ['deepgram/nova-3', 'assemblyai'] });
|
|
166
|
+
expect(stt['opts'].fallback).toEqual([{ model: 'deepgram/nova-3' }, { model: 'assemblyai' }]);
|
|
167
|
+
});
|
|
168
|
+
|
|
169
|
+
it('fallback single FallbackModel is normalized to list', () => {
|
|
170
|
+
const stt = makeStt({ fallback: { model: 'deepgram/nova-3' } });
|
|
171
|
+
expect(stt['opts'].fallback).toEqual([{ model: 'deepgram/nova-3' }]);
|
|
172
|
+
});
|
|
173
|
+
|
|
174
|
+
it('fallback with extraKwargs is preserved', () => {
|
|
175
|
+
const stt = makeStt({
|
|
176
|
+
fallback: {
|
|
177
|
+
model: 'deepgram/nova-3',
|
|
178
|
+
extraKwargs: { punctuate: true, keywords: [['livekit', 1.5]] },
|
|
179
|
+
},
|
|
180
|
+
});
|
|
181
|
+
expect(stt['opts'].fallback).toEqual([
|
|
182
|
+
{
|
|
183
|
+
model: 'deepgram/nova-3',
|
|
184
|
+
extraKwargs: { punctuate: true, keywords: [['livekit', 1.5]] },
|
|
185
|
+
},
|
|
186
|
+
]);
|
|
187
|
+
});
|
|
188
|
+
|
|
189
|
+
it('fallback mixed list is normalized', () => {
|
|
190
|
+
const stt = makeStt({
|
|
191
|
+
fallback: [
|
|
192
|
+
'deepgram/nova-3',
|
|
193
|
+
{ model: 'cartesia', extraKwargs: { min_volume: 0.5 } },
|
|
194
|
+
'assemblyai',
|
|
195
|
+
],
|
|
196
|
+
});
|
|
197
|
+
expect(stt['opts'].fallback).toEqual([
|
|
198
|
+
{ model: 'deepgram/nova-3' },
|
|
199
|
+
{ model: 'cartesia', extraKwargs: { min_volume: 0.5 } },
|
|
200
|
+
{ model: 'assemblyai' },
|
|
201
|
+
]);
|
|
202
|
+
});
|
|
203
|
+
|
|
204
|
+
it('fallback string with language discards language', () => {
|
|
205
|
+
const stt = makeStt({ fallback: 'deepgram/nova-3:en' });
|
|
206
|
+
expect(stt['opts'].fallback).toEqual([{ model: 'deepgram/nova-3' }]);
|
|
207
|
+
});
|
|
208
|
+
|
|
209
|
+
it('connOptions not given uses default', () => {
|
|
210
|
+
const stt = makeStt();
|
|
211
|
+
expect(stt['opts'].connOptions).toEqual(DEFAULT_API_CONNECT_OPTIONS);
|
|
212
|
+
});
|
|
213
|
+
|
|
214
|
+
it('connOptions custom timeout', () => {
|
|
215
|
+
const custom: APIConnectOptions = { timeoutMs: 30000, maxRetry: 3, retryIntervalMs: 2000 };
|
|
216
|
+
const stt = makeStt({ connOptions: custom });
|
|
217
|
+
expect(stt['opts'].connOptions).toEqual(custom);
|
|
218
|
+
expect(stt['opts'].connOptions!.timeoutMs).toBe(30000);
|
|
219
|
+
});
|
|
220
|
+
|
|
221
|
+
it('connOptions custom maxRetry', () => {
|
|
222
|
+
const custom: APIConnectOptions = { timeoutMs: 10000, maxRetry: 5, retryIntervalMs: 2000 };
|
|
223
|
+
const stt = makeStt({ connOptions: custom });
|
|
224
|
+
expect(stt['opts'].connOptions).toEqual(custom);
|
|
225
|
+
expect(stt['opts'].connOptions!.maxRetry).toBe(5);
|
|
226
|
+
});
|
|
227
|
+
|
|
228
|
+
it('connOptions full custom', () => {
|
|
229
|
+
const custom: APIConnectOptions = { timeoutMs: 60000, maxRetry: 10, retryIntervalMs: 2000 };
|
|
230
|
+
const stt = makeStt({ connOptions: custom });
|
|
231
|
+
expect(stt['opts'].connOptions).toEqual(custom);
|
|
232
|
+
expect(stt['opts'].connOptions!.timeoutMs).toBe(60000);
|
|
233
|
+
expect(stt['opts'].connOptions!.maxRetry).toBe(10);
|
|
234
|
+
expect(stt['opts'].connOptions!.retryIntervalMs).toBe(2000);
|
|
235
|
+
});
|
|
236
|
+
});
|
package/src/inference/stt.ts
CHANGED
|
@@ -42,29 +42,46 @@ export type AssemblyaiModels =
|
|
|
42
42
|
export type ElevenlabsSTTModels = 'elevenlabs/scribe_v2_realtime';
|
|
43
43
|
|
|
44
44
|
export interface CartesiaOptions {
|
|
45
|
-
|
|
46
|
-
|
|
45
|
+
/** Minimum volume threshold. Default: not specified. */
|
|
46
|
+
min_volume?: number;
|
|
47
|
+
/** Maximum silence duration in seconds. Default: not specified. */
|
|
48
|
+
max_silence_duration_secs?: number;
|
|
47
49
|
}
|
|
48
50
|
|
|
49
51
|
export interface DeepgramOptions {
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
52
|
+
/** Enable filler words. Default: true. */
|
|
53
|
+
filler_words?: boolean;
|
|
54
|
+
/** Enable interim results. Default: true. */
|
|
55
|
+
interim_results?: boolean;
|
|
56
|
+
/** Endpointing timeout in milliseconds. Default: 25. */
|
|
57
|
+
endpointing?: number;
|
|
58
|
+
/** Enable punctuation. Default: false. */
|
|
59
|
+
punctuate?: boolean;
|
|
60
|
+
/** Enable smart formatting. */
|
|
54
61
|
smart_format?: boolean;
|
|
62
|
+
/** Keywords with boost values. */
|
|
55
63
|
keywords?: Array<[string, number]>;
|
|
64
|
+
/** Key terms for recognition. */
|
|
56
65
|
keyterms?: string[];
|
|
66
|
+
/** Enable profanity filter. */
|
|
57
67
|
profanity_filter?: boolean;
|
|
68
|
+
/** Convert spoken numbers to numerals. */
|
|
58
69
|
numerals?: boolean;
|
|
70
|
+
/** Opt out of model improvement program. */
|
|
59
71
|
mip_opt_out?: boolean;
|
|
60
72
|
}
|
|
61
73
|
|
|
62
74
|
export interface AssemblyAIOptions {
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
75
|
+
/** Enable turn formatting. Default: false. */
|
|
76
|
+
format_turns?: boolean;
|
|
77
|
+
/** End of turn confidence threshold. Default: 0.01. */
|
|
78
|
+
end_of_turn_confidence_threshold?: number;
|
|
79
|
+
/** Minimum silence duration in milliseconds when confident about end of turn. Default: 0. */
|
|
80
|
+
min_end_of_turn_silence_when_confident?: number;
|
|
81
|
+
/** Maximum turn silence in milliseconds. Default: not specified. */
|
|
82
|
+
max_turn_silence?: number;
|
|
83
|
+
/** Key terms prompt for recognition. Default: not specified. */
|
|
84
|
+
keyterms_prompt?: string[];
|
|
68
85
|
}
|
|
69
86
|
|
|
70
87
|
export type STTLanguages =
|
|
@@ -93,6 +110,43 @@ export type STTOptions<TModel extends STTModels> = TModel extends DeepgramModels
|
|
|
93
110
|
? AssemblyAIOptions
|
|
94
111
|
: Record<string, unknown>;
|
|
95
112
|
|
|
113
|
+
/** A fallback model with optional extra configuration. Extra fields are passed through to the provider. */
|
|
114
|
+
export interface STTFallbackModel {
|
|
115
|
+
/** Model name (e.g. "deepgram/nova-3", "assemblyai/universal-streaming", "cartesia/ink-whisper"). */
|
|
116
|
+
model: string;
|
|
117
|
+
/** Extra configuration for the model. */
|
|
118
|
+
extraKwargs?: Record<string, unknown>;
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
export type STTFallbackModelType = STTFallbackModel | string;
|
|
122
|
+
|
|
123
|
+
/** Parse a model string of the form "model" or "model:language" into [model, language], splitting on the last ':'. Language is undefined if no ':' is present. */
|
|
124
|
+
export function parseSTTModelString(model: string): [string, string | undefined] {
|
|
125
|
+
const idx = model.lastIndexOf(':');
|
|
126
|
+
if (idx !== -1) {
|
|
127
|
+
return [model.slice(0, idx), model.slice(idx + 1)];
|
|
128
|
+
}
|
|
129
|
+
return [model, undefined];
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
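/** Normalize a single STTFallbackModelType, or a list of them, into STTFallbackModel[]. String entries are parsed as model strings and any ":language" suffix is discarded; object entries are returned as-is. */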
|
|
133
|
+
export function normalizeSTTFallback(
|
|
134
|
+
fallback: STTFallbackModelType | STTFallbackModelType[],
|
|
135
|
+
): STTFallbackModel[] {
|
|
136
|
+
const makeFallback = (model: STTFallbackModelType): STTFallbackModel => {
|
|
137
|
+
if (typeof model === 'string') {
|
|
138
|
+
const [name] = parseSTTModelString(model);
|
|
139
|
+
return { model: name };
|
|
140
|
+
}
|
|
141
|
+
return model;
|
|
142
|
+
};
|
|
143
|
+
|
|
144
|
+
if (Array.isArray(fallback)) {
|
|
145
|
+
return fallback.map(makeFallback);
|
|
146
|
+
}
|
|
147
|
+
return [makeFallback(fallback)];
|
|
148
|
+
}
|
|
149
|
+
|
|
96
150
|
export type STTEncoding = 'pcm_s16le';
|
|
97
151
|
|
|
98
152
|
const DEFAULT_ENCODING: STTEncoding = 'pcm_s16le';
|
|
@@ -109,6 +163,8 @@ export interface InferenceSTTOptions<TModel extends STTModels> {
|
|
|
109
163
|
apiKey: string;
|
|
110
164
|
apiSecret: string;
|
|
111
165
|
modelOptions: STTOptions<TModel>;
|
|
166
|
+
fallback?: STTFallbackModel[];
|
|
167
|
+
connOptions?: APIConnectOptions;
|
|
112
168
|
}
|
|
113
169
|
|
|
114
170
|
/**
|
|
@@ -121,7 +177,7 @@ export class STT<TModel extends STTModels> extends BaseSTT {
|
|
|
121
177
|
#logger = log();
|
|
122
178
|
|
|
123
179
|
constructor(opts?: {
|
|
124
|
-
model?:
|
|
180
|
+
model?: ModelWithLanguage;
|
|
125
181
|
language?: STTLanguages;
|
|
126
182
|
baseURL?: string;
|
|
127
183
|
encoding?: STTEncoding;
|
|
@@ -129,6 +185,8 @@ export class STT<TModel extends STTModels> extends BaseSTT {
|
|
|
129
185
|
apiKey?: string;
|
|
130
186
|
apiSecret?: string;
|
|
131
187
|
modelOptions?: STTOptions<TModel>;
|
|
188
|
+
fallback?: STTFallbackModelType | STTFallbackModelType[];
|
|
189
|
+
connOptions?: APIConnectOptions;
|
|
132
190
|
}) {
|
|
133
191
|
super({ streaming: true, interimResults: true, alignedTranscript: 'word' });
|
|
134
192
|
|
|
@@ -141,6 +199,8 @@ export class STT<TModel extends STTModels> extends BaseSTT {
|
|
|
141
199
|
apiKey,
|
|
142
200
|
apiSecret,
|
|
143
201
|
modelOptions = {} as STTOptions<TModel>,
|
|
202
|
+
fallback,
|
|
203
|
+
connOptions,
|
|
144
204
|
} = opts || {};
|
|
145
205
|
|
|
146
206
|
const lkBaseURL = baseURL || process.env.LIVEKIT_INFERENCE_URL || DEFAULT_BASE_URL;
|
|
@@ -155,15 +215,37 @@ export class STT<TModel extends STTModels> extends BaseSTT {
|
|
|
155
215
|
throw new Error('apiSecret is required: pass apiSecret or set LIVEKIT_API_SECRET');
|
|
156
216
|
}
|
|
157
217
|
|
|
218
|
+
// Parse language from model string if provided: "provider/model:language"
|
|
219
|
+
let nextModel = model;
|
|
220
|
+
let nextLanguage = language;
|
|
221
|
+
if (typeof nextModel === 'string') {
|
|
222
|
+
const idx = nextModel.lastIndexOf(':');
|
|
223
|
+
if (idx !== -1) {
|
|
224
|
+
const languageFromModel = nextModel.slice(idx + 1) as STTLanguages;
|
|
225
|
+
if (nextLanguage && nextLanguage !== languageFromModel) {
|
|
226
|
+
this.#logger.warn(
|
|
227
|
+
'`language` is provided via both argument and model, using the one from the argument',
|
|
228
|
+
{ language: nextLanguage, model: nextModel },
|
|
229
|
+
);
|
|
230
|
+
} else {
|
|
231
|
+
nextLanguage = languageFromModel;
|
|
232
|
+
}
|
|
233
|
+
nextModel = nextModel.slice(0, idx) as TModel;
|
|
234
|
+
}
|
|
235
|
+
}
|
|
236
|
+
const normalizedFallback = fallback ? normalizeSTTFallback(fallback) : undefined;
|
|
237
|
+
|
|
158
238
|
this.opts = {
|
|
159
|
-
model,
|
|
160
|
-
language,
|
|
239
|
+
model: nextModel as TModel,
|
|
240
|
+
language: nextLanguage,
|
|
161
241
|
encoding,
|
|
162
242
|
sampleRate,
|
|
163
243
|
baseURL: lkBaseURL,
|
|
164
244
|
apiKey: lkApiKey,
|
|
165
245
|
apiSecret: lkApiSecret,
|
|
166
246
|
modelOptions,
|
|
247
|
+
fallback: normalizedFallback,
|
|
248
|
+
connOptions: connOptions ?? DEFAULT_API_CONNECT_OPTIONS,
|
|
167
249
|
};
|
|
168
250
|
}
|
|
169
251
|
|
|
@@ -172,11 +254,8 @@ export class STT<TModel extends STTModels> extends BaseSTT {
|
|
|
172
254
|
}
|
|
173
255
|
|
|
174
256
|
static fromModelString(modelString: string): STT<AnyString> {
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
return new STT({ model, language });
|
|
178
|
-
}
|
|
179
|
-
return new STT({ model: modelString });
|
|
257
|
+
const [model, language] = parseSTTModelString(modelString);
|
|
258
|
+
return new STT({ model, language });
|
|
180
259
|
}
|
|
181
260
|
|
|
182
261
|
protected async _recognize(_: AudioBuffer): Promise<SpeechEvent> {
|
|
@@ -195,7 +274,8 @@ export class STT<TModel extends STTModels> extends BaseSTT {
|
|
|
195
274
|
language?: STTLanguages | string;
|
|
196
275
|
connOptions?: APIConnectOptions;
|
|
197
276
|
}): SpeechStream<TModel> {
|
|
198
|
-
const { language, connOptions = DEFAULT_API_CONNECT_OPTIONS } =
|
|
277
|
+
const { language, connOptions = this.opts.connOptions ?? DEFAULT_API_CONNECT_OPTIONS } =
|
|
278
|
+
options || {};
|
|
199
279
|
const streamOpts = {
|
|
200
280
|
...this.opts,
|
|
201
281
|
language: language ?? this.opts.language,
|
|
@@ -224,6 +304,22 @@ export class STT<TModel extends STTModels> extends BaseSTT {
|
|
|
224
304
|
(params.settings as Record<string, unknown>).language = this.opts.language;
|
|
225
305
|
}
|
|
226
306
|
|
|
307
|
+
if (this.opts.fallback?.length) {
|
|
308
|
+
params.fallback = {
|
|
309
|
+
models: this.opts.fallback.map((m) => ({
|
|
310
|
+
model: m.model,
|
|
311
|
+
extra: m.extraKwargs ?? {},
|
|
312
|
+
})),
|
|
313
|
+
};
|
|
314
|
+
}
|
|
315
|
+
|
|
316
|
+
if (this.opts.connOptions) {
|
|
317
|
+
params.connection = {
|
|
318
|
+
timeout: this.opts.connOptions.timeoutMs / 1000,
|
|
319
|
+
retries: this.opts.connOptions.maxRetry,
|
|
320
|
+
};
|
|
321
|
+
}
|
|
322
|
+
|
|
227
323
|
let baseURL = this.opts.baseURL;
|
|
228
324
|
if (baseURL.startsWith('http://') || baseURL.startsWith('https://')) {
|
|
229
325
|
baseURL = baseURL.replace('http', 'ws');
|