@livekit/agents 1.0.42 → 1.0.44
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/inference/index.cjs +8 -0
- package/dist/inference/index.cjs.map +1 -1
- package/dist/inference/index.d.cts +2 -2
- package/dist/inference/index.d.ts +2 -2
- package/dist/inference/index.d.ts.map +1 -1
- package/dist/inference/index.js +8 -0
- package/dist/inference/index.js.map +1 -1
- package/dist/inference/stt.cjs +70 -12
- package/dist/inference/stt.cjs.map +1 -1
- package/dist/inference/stt.d.cts +34 -1
- package/dist/inference/stt.d.ts +34 -1
- package/dist/inference/stt.d.ts.map +1 -1
- package/dist/inference/stt.js +67 -11
- package/dist/inference/stt.js.map +1 -1
- package/dist/inference/stt.test.cjs +204 -0
- package/dist/inference/stt.test.cjs.map +1 -0
- package/dist/inference/stt.test.js +203 -0
- package/dist/inference/stt.test.js.map +1 -0
- package/dist/inference/tts.cjs +52 -10
- package/dist/inference/tts.cjs.map +1 -1
- package/dist/inference/tts.d.cts +22 -0
- package/dist/inference/tts.d.ts +22 -0
- package/dist/inference/tts.d.ts.map +1 -1
- package/dist/inference/tts.js +49 -9
- package/dist/inference/tts.js.map +1 -1
- package/dist/inference/tts.test.cjs +223 -0
- package/dist/inference/tts.test.cjs.map +1 -0
- package/dist/inference/tts.test.js +222 -0
- package/dist/inference/tts.test.js.map +1 -0
- package/dist/ipc/inference_proc_lazy_main.cjs +13 -1
- package/dist/ipc/inference_proc_lazy_main.cjs.map +1 -1
- package/dist/ipc/inference_proc_lazy_main.js +13 -1
- package/dist/ipc/inference_proc_lazy_main.js.map +1 -1
- package/dist/ipc/job_proc_lazy_main.cjs +8 -1
- package/dist/ipc/job_proc_lazy_main.cjs.map +1 -1
- package/dist/ipc/job_proc_lazy_main.js +9 -2
- package/dist/ipc/job_proc_lazy_main.js.map +1 -1
- package/dist/ipc/supervised_proc.cjs.map +1 -1
- package/dist/ipc/supervised_proc.d.cts +7 -0
- package/dist/ipc/supervised_proc.d.ts +7 -0
- package/dist/ipc/supervised_proc.d.ts.map +1 -1
- package/dist/ipc/supervised_proc.js.map +1 -1
- package/dist/stt/stt.cjs +4 -0
- package/dist/stt/stt.cjs.map +1 -1
- package/dist/stt/stt.d.cts +7 -0
- package/dist/stt/stt.d.ts +7 -0
- package/dist/stt/stt.d.ts.map +1 -1
- package/dist/stt/stt.js +4 -0
- package/dist/stt/stt.js.map +1 -1
- package/dist/transcription.cjs.map +1 -1
- package/dist/transcription.d.cts +6 -0
- package/dist/transcription.d.ts +6 -0
- package/dist/transcription.d.ts.map +1 -1
- package/dist/transcription.js.map +1 -1
- package/dist/utils.cjs +10 -2
- package/dist/utils.cjs.map +1 -1
- package/dist/utils.d.ts.map +1 -1
- package/dist/utils.js +10 -2
- package/dist/utils.js.map +1 -1
- package/dist/vad.cjs +1 -1
- package/dist/vad.cjs.map +1 -1
- package/dist/vad.d.cts +3 -2
- package/dist/vad.d.ts +3 -2
- package/dist/vad.d.ts.map +1 -1
- package/dist/vad.js +1 -1
- package/dist/vad.js.map +1 -1
- package/dist/voice/agent_activity.cjs +1 -2
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.js +1 -2
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/audio_recognition.cjs.map +1 -1
- package/dist/voice/audio_recognition.d.cts +14 -0
- package/dist/voice/audio_recognition.d.ts +14 -0
- package/dist/voice/audio_recognition.d.ts.map +1 -1
- package/dist/voice/audio_recognition.js.map +1 -1
- package/package.json +1 -1
- package/src/inference/index.ts +8 -0
- package/src/inference/stt.test.ts +236 -0
- package/src/inference/stt.ts +116 -20
- package/src/inference/tts.test.ts +255 -0
- package/src/inference/tts.ts +81 -15
- package/src/ipc/inference_proc_lazy_main.ts +13 -1
- package/src/ipc/job_proc_lazy_main.ts +18 -2
- package/src/ipc/supervised_proc.ts +7 -0
- package/src/stt/stt.ts +12 -0
- package/src/transcription.ts +6 -0
- package/src/utils.ts +10 -2
- package/src/vad.ts +4 -3
- package/src/voice/agent_activity.ts +1 -1
- package/src/voice/audio_recognition.ts +14 -0
package/src/inference/tts.test.ts
ADDED
@@ -0,0 +1,255 @@
+// SPDX-FileCopyrightText: 2025 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+import { beforeAll, describe, expect, it } from 'vitest';
+import { initializeLogger } from '../log.js';
+import { type APIConnectOptions, DEFAULT_API_CONNECT_OPTIONS } from '../types.js';
+import { TTS, type TTSFallbackModel, normalizeTTSFallback, parseTTSModelString } from './tts.js';
+
+beforeAll(() => {
+  initializeLogger({ level: 'silent', pretty: false });
+});
+
+/** Helper to create TTS with required credentials. */
+function makeTts(overrides: Record<string, unknown> = {}) {
+  const defaults = {
+    model: 'cartesia/sonic' as const,
+    apiKey: 'test-key',
+    apiSecret: 'test-secret',
+    baseURL: 'https://example.livekit.cloud',
+  };
+  return new TTS({ ...defaults, ...overrides });
+}
+
+describe('parseTTSModelString', () => {
+  it('simple model without voice', () => {
+    const [model, voice] = parseTTSModelString('cartesia');
+    expect(model).toBe('cartesia');
+    expect(voice).toBeUndefined();
+  });
+
+  it('model with voice suffix', () => {
+    const [model, voice] = parseTTSModelString('cartesia:my-voice-id');
+    expect(model).toBe('cartesia');
+    expect(voice).toBe('my-voice-id');
+  });
+
+  it('provider/model format without voice', () => {
+    const [model, voice] = parseTTSModelString('cartesia/sonic');
+    expect(model).toBe('cartesia/sonic');
+    expect(voice).toBeUndefined();
+  });
+
+  it('provider/model format with voice', () => {
+    const [model, voice] = parseTTSModelString('cartesia/sonic:my-voice-id');
+    expect(model).toBe('cartesia/sonic');
+    expect(voice).toBe('my-voice-id');
+  });
+
+  it.each([
+    ['elevenlabs/eleven_flash_v2:voice123', 'elevenlabs/eleven_flash_v2', 'voice123'],
+    ['rime:speaker-a', 'rime', 'speaker-a'],
+    ['rime/mist:narrator', 'rime/mist', 'narrator'],
+    ['inworld/inworld-tts-1:character', 'inworld/inworld-tts-1', 'character'],
+    ['cartesia/sonic-turbo:deep-voice', 'cartesia/sonic-turbo', 'deep-voice'],
+  ])('various providers and voices: %s', (modelStr, expectedModel, expectedVoice) => {
+    const [model, voice] = parseTTSModelString(modelStr);
+    expect(model).toBe(expectedModel);
+    expect(voice).toBe(expectedVoice);
+  });
+
+  it('empty voice after colon', () => {
+    const [model, voice] = parseTTSModelString('cartesia/sonic:');
+    expect(model).toBe('cartesia/sonic');
+    expect(voice).toBe('');
+  });
+});
+
+describe('normalizeTTSFallback', () => {
+  it('single string model', () => {
+    const result = normalizeTTSFallback('cartesia/sonic');
+    expect(result).toEqual([{ model: 'cartesia/sonic', voice: '' }]);
+  });
+
+  it('single string model with voice', () => {
+    const result = normalizeTTSFallback('cartesia/sonic:my-voice');
+    expect(result).toEqual([{ model: 'cartesia/sonic', voice: 'my-voice' }]);
+  });
+
+  it('single FallbackModel dict', () => {
+    const fallback: TTSFallbackModel = { model: 'cartesia/sonic', voice: 'narrator' };
+    const result = normalizeTTSFallback(fallback);
+    expect(result).toEqual([{ model: 'cartesia/sonic', voice: 'narrator' }]);
+  });
+
+  it('list of string models', () => {
+    const result = normalizeTTSFallback(['cartesia/sonic', 'elevenlabs/eleven_flash_v2']);
+    expect(result).toEqual([
+      { model: 'cartesia/sonic', voice: '' },
+      { model: 'elevenlabs/eleven_flash_v2', voice: '' },
+    ]);
+  });
+
+  it('list of string models with voices', () => {
+    const result = normalizeTTSFallback(['cartesia/sonic:voice1', 'elevenlabs:voice2']);
+    expect(result).toEqual([
+      { model: 'cartesia/sonic', voice: 'voice1' },
+      { model: 'elevenlabs', voice: 'voice2' },
+    ]);
+  });
+
+  it('list of FallbackModel dicts', () => {
+    const fallbacks: TTSFallbackModel[] = [
+      { model: 'cartesia/sonic', voice: 'narrator' },
+      { model: 'elevenlabs', voice: '' },
+    ];
+    const result = normalizeTTSFallback(fallbacks);
+    expect(result).toEqual([
+      { model: 'cartesia/sonic', voice: 'narrator' },
+      { model: 'elevenlabs', voice: '' },
+    ]);
+  });
+
+  it('mixed list of strings and dicts', () => {
+    const result = normalizeTTSFallback([
+      'cartesia/sonic:voice1',
+      { model: 'elevenlabs/eleven_flash_v2', voice: 'custom' } as TTSFallbackModel,
+      'rime/mist',
+    ]);
+    expect(result).toEqual([
+      { model: 'cartesia/sonic', voice: 'voice1' },
+      { model: 'elevenlabs/eleven_flash_v2', voice: 'custom' },
+      { model: 'rime/mist', voice: '' },
+    ]);
+  });
+
+  it('FallbackModel with extraKwargs is preserved', () => {
+    const fallback: TTSFallbackModel = {
+      model: 'cartesia/sonic',
+      voice: 'narrator',
+      extraKwargs: { duration: 30.0, speed: 'fast' },
+    };
+    const result = normalizeTTSFallback(fallback);
+    expect(result).toEqual([
+      {
+        model: 'cartesia/sonic',
+        voice: 'narrator',
+        extraKwargs: { duration: 30.0, speed: 'fast' },
+      },
+    ]);
+  });
+
+  it('list with extraKwargs preserved', () => {
+    const result = normalizeTTSFallback([
+      { model: 'cartesia/sonic', voice: 'v1', extraKwargs: { speed: 'slow' } } as TTSFallbackModel,
+      'elevenlabs:voice2',
+      { model: 'rime/mist', voice: '', extraKwargs: { custom: true } } as TTSFallbackModel,
+    ]);
+    expect(result).toEqual([
+      { model: 'cartesia/sonic', voice: 'v1', extraKwargs: { speed: 'slow' } },
+      { model: 'elevenlabs', voice: 'voice2' },
+      { model: 'rime/mist', voice: '', extraKwargs: { custom: true } },
+    ]);
+  });
+
+  it('empty list returns empty list', () => {
+    const result = normalizeTTSFallback([]);
+    expect(result).toEqual([]);
+  });
+
+  it('FallbackModel with empty voice', () => {
+    const fallback: TTSFallbackModel = { model: 'cartesia/sonic', voice: '' };
+    const result = normalizeTTSFallback(fallback);
+    expect(result).toEqual([{ model: 'cartesia/sonic', voice: '' }]);
+  });
+});
+
+describe('TTS constructor fallback and connOptions', () => {
+  it('fallback not given defaults to undefined', () => {
+    const tts = makeTts();
+    expect(tts['opts'].fallback).toBeUndefined();
+  });
+
+  it('fallback single string is normalized', () => {
+    const tts = makeTts({ fallback: 'elevenlabs/eleven_flash_v2' });
+    expect(tts['opts'].fallback).toEqual([{ model: 'elevenlabs/eleven_flash_v2', voice: '' }]);
+  });
+
+  it('fallback single string with voice is normalized', () => {
+    const tts = makeTts({ fallback: 'cartesia/sonic:my-voice' });
+    expect(tts['opts'].fallback).toEqual([{ model: 'cartesia/sonic', voice: 'my-voice' }]);
+  });
+
+  it('fallback list of strings is normalized', () => {
+    const tts = makeTts({ fallback: ['cartesia/sonic', 'elevenlabs'] });
+    expect(tts['opts'].fallback).toEqual([
+      { model: 'cartesia/sonic', voice: '' },
+      { model: 'elevenlabs', voice: '' },
+    ]);
+  });
+
+  it('fallback single FallbackModel is normalized to list', () => {
+    const tts = makeTts({ fallback: { model: 'cartesia/sonic', voice: 'narrator' } });
+    expect(tts['opts'].fallback).toEqual([{ model: 'cartesia/sonic', voice: 'narrator' }]);
+  });
+
+  it('fallback with extraKwargs is preserved', () => {
+    const tts = makeTts({
+      fallback: {
+        model: 'cartesia/sonic',
+        voice: 'narrator',
+        extraKwargs: { duration: 30.0, speed: 'fast' },
+      },
+    });
+    expect(tts['opts'].fallback).toEqual([
+      {
+        model: 'cartesia/sonic',
+        voice: 'narrator',
+        extraKwargs: { duration: 30.0, speed: 'fast' },
+      },
+    ]);
+  });
+
+  it('fallback mixed list is normalized', () => {
+    const tts = makeTts({
+      fallback: [
+        'cartesia/sonic:voice1',
+        { model: 'elevenlabs', voice: 'custom', extraKwargs: { speed: 'slow' } },
+        'rime/mist',
+      ],
+    });
+    expect(tts['opts'].fallback).toEqual([
+      { model: 'cartesia/sonic', voice: 'voice1' },
+      { model: 'elevenlabs', voice: 'custom', extraKwargs: { speed: 'slow' } },
+      { model: 'rime/mist', voice: '' },
+    ]);
+  });
+
+  it('connOptions not given uses default', () => {
+    const tts = makeTts();
+    expect(tts['opts'].connOptions).toEqual(DEFAULT_API_CONNECT_OPTIONS);
+  });
+
+  it('connOptions custom timeout', () => {
+    const custom: APIConnectOptions = { timeoutMs: 30000, maxRetry: 3, retryIntervalMs: 2000 };
+    const tts = makeTts({ connOptions: custom });
+    expect(tts['opts'].connOptions).toEqual(custom);
+    expect(tts['opts'].connOptions!.timeoutMs).toBe(30000);
+  });
+
+  it('connOptions custom maxRetry', () => {
+    const custom: APIConnectOptions = { timeoutMs: 10000, maxRetry: 5, retryIntervalMs: 2000 };
+    const tts = makeTts({ connOptions: custom });
+    expect(tts['opts'].connOptions).toEqual(custom);
+    expect(tts['opts'].connOptions!.maxRetry).toBe(5);
+  });
+
+  it('connOptions full custom', () => {
+    const custom: APIConnectOptions = { timeoutMs: 60000, maxRetry: 10, retryIntervalMs: 2000 };
+    const tts = makeTts({ connOptions: custom });
+    expect(tts['opts'].connOptions).toEqual(custom);
+    expect(tts['opts'].connOptions!.timeoutMs).toBe(60000);
+    expect(tts['opts'].connOptions!.maxRetry).toBe(10);
+    expect(tts['opts'].connOptions!.retryIntervalMs).toBe(2000);
+  });
+});
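Read together with the implementation below, these tests pin down the new public surface. A minimal usage sketch of how the options compose in agent code — the `inference` namespace import and the provider/voice strings are illustrative assumptions, not prescribed values:

import { inference } from '@livekit/agents';

// Fallbacks accept the same 'provider/model:voice' shorthand the tests cover;
// object entries can additionally carry provider-specific extraKwargs.
const tts = new inference.TTS({
  model: 'cartesia/sonic',
  voice: 'my-voice-id',
  fallback: [
    'elevenlabs/eleven_flash_v2:voice123',
    { model: 'rime/mistv2', voice: '', extraKwargs: { speed: 'fast' } },
  ],
  connOptions: { timeoutMs: 30000, maxRetry: 3, retryIntervalMs: 2000 },
});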
package/src/inference/tts.ts
CHANGED
@@ -16,7 +16,6 @@ import { Event, Future, Task, cancelAndWait, combineSignals, shortuuid } from '.
 import {
   type TtsClientEvent,
   type TtsServerEvent,
-  type TtsSessionCreateEvent,
   ttsClientEventSchema,
   ttsServerEventSchema,
 } from './api_protos.js';
@@ -46,13 +45,17 @@ export type InworldModels =
 export type RimeModels = 'rime/arcana' | 'rime/mistv2';
 
 export interface CartesiaOptions {
-  …
-  …
+  /** Maximum duration of audio in seconds. */
+  duration?: number;
+  /** Speech speed. Default: not specified. */
+  speed?: 'slow' | 'normal' | 'fast';
 }
 
 export interface ElevenlabsOptions {
-  …
-  …
+  /** Inactivity timeout in seconds. Default: 60. */
+  inactivity_timeout?: number;
+  /** Text normalization mode. Default: "auto". */
+  apply_text_normalization?: 'auto' | 'off' | 'on';
 }
 
 export interface DeepgramTTSOptions {}
@@ -90,6 +93,45 @@ export type TTSOptions<TModel extends TTSModels> = TModel extends CartesiaModels
   ? InworldOptions
   : Record<string, unknown>;
 
+/** Parse a model string into [model, voice]. Voice is undefined if not specified. */
+export function parseTTSModelString(model: string): [string, string | undefined] {
+  const idx = model.lastIndexOf(':');
+  if (idx !== -1) {
+    return [model.slice(0, idx), model.slice(idx + 1)];
+  }
+  return [model, undefined];
+}
+
+/** A fallback model with optional extra configuration. Extra fields are passed through to the provider. */
+export interface TTSFallbackModel {
+  /** Model name (e.g. "cartesia/sonic", "elevenlabs/eleven_flash_v2", "rime/arcana"). */
+  model: string;
+  /** Voice to use for the model. */
+  voice: string;
+  /** Extra configuration for the model. */
+  extraKwargs?: Record<string, unknown>;
+}
+
+export type TTSFallbackModelType = TTSFallbackModel | string;
+
+/** Normalize a single or list of FallbackModelType into TTSFallbackModel[]. */
+export function normalizeTTSFallback(
+  fallback: TTSFallbackModelType | TTSFallbackModelType[],
+): TTSFallbackModel[] {
+  const makeFallback = (model: TTSFallbackModelType): TTSFallbackModel => {
+    if (typeof model === 'string') {
+      const [name, voice] = parseTTSModelString(model);
+      return { model: name, voice: voice ?? '' };
+    }
+    return model;
+  };
+
+  if (Array.isArray(fallback)) {
+    return fallback.map(makeFallback);
+  }
+  return [makeFallback(fallback)];
+}
+
 type TTSEncoding = 'pcm_s16le';
 
 const DEFAULT_ENCODING: TTSEncoding = 'pcm_s16le';
@@ -108,6 +150,8 @@ export interface InferenceTTSOptions<TModel extends TTSModels> {
   apiKey: string;
   apiSecret: string;
   modelOptions: TTSOptions<TModel>;
+  fallback?: TTSFallbackModel[];
+  connOptions?: APIConnectOptions;
 }
 
 /**
@@ -130,6 +174,8 @@ export class TTS<TModel extends TTSModels> extends BaseTTS {
     apiKey?: string;
     apiSecret?: string;
     modelOptions?: TTSOptions<TModel>;
+    fallback?: TTSFallbackModelType | TTSFallbackModelType[];
+    connOptions?: APIConnectOptions;
   }) {
     const sampleRate = opts?.sampleRate ?? DEFAULT_SAMPLE_RATE;
     super(sampleRate, 1, { streaming: true });
@@ -143,6 +189,8 @@
       apiKey,
       apiSecret,
       modelOptions = {} as TTSOptions<TModel>,
+      fallback,
+      connOptions,
     } = opts || {};
 
     const lkBaseURL = baseURL || process.env.LIVEKIT_INFERENCE_URL || DEFAULT_BASE_URL;
@@ -176,6 +224,8 @@
       }
     }
 
+    const normalizedFallback = fallback ? normalizeTTSFallback(fallback) : undefined;
+
     this.opts = {
       model: nextModel,
       voice: nextVoice,
@@ -186,6 +236,8 @@
       apiKey: lkApiKey,
       apiSecret: lkApiSecret,
       modelOptions,
+      fallback: normalizedFallback,
+      connOptions: connOptions ?? DEFAULT_API_CONNECT_OPTIONS,
     };
 
     // Initialize connection pool
@@ -203,11 +255,8 @@
   }
 
   static fromModelString(modelString: string): TTS<AnyString> {
-    …
-    …
-      return new TTS({ model, voice });
-    }
-    return new TTS({ model: modelString });
+    const [model, voice] = parseTTSModelString(modelString);
+    return new TTS({ model, voice: voice || undefined });
   }
 
   updateOptions(opts: Partial<Pick<InferenceTTSOptions<TModel>, 'model' | 'voice' | 'language'>>) {
@@ -222,7 +271,7 @@
   }
 
   stream(options?: { connOptions?: APIConnectOptions }): SynthesizeStream<TModel> {
-    const { connOptions = DEFAULT_API_CONNECT_OPTIONS } = options || {};
+    const { connOptions = this.opts.connOptions ?? DEFAULT_API_CONNECT_OPTIONS } = options || {};
     const stream = new SynthesizeStream(this, { ...this.opts }, connOptions);
     this.streams.add(stream);
     return stream;
@@ -243,11 +292,28 @@
       sample_rate: String(this.opts.sampleRate),
       encoding: this.opts.encoding,
       extra: this.opts.modelOptions,
-    } as …
+    } as Record<string, unknown>;
+
+    if (this.opts.voice) (params as Record<string, unknown>).voice = this.opts.voice;
+    if (this.opts.model) (params as Record<string, unknown>).model = this.opts.model;
+    if (this.opts.language) (params as Record<string, unknown>).language = this.opts.language;
+
+    if (this.opts.fallback?.length) {
+      params.fallback = {
+        models: this.opts.fallback.map((m) => ({
+          model: m.model,
+          voice: m.voice,
+          extra: m.extraKwargs ?? {},
+        })),
+      };
+    }
 
-    if (this.opts.…
-    …
-    …
+    if (this.opts.connOptions) {
+      params.connection = {
+        timeout: this.opts.connOptions.timeoutMs / 1000,
+        retries: this.opts.connOptions.maxRetry,
+      };
+    }
 
     this.#logger.debug({ url }, 'inference.TTS creating new websocket connection (pool miss)');
     const socket = await connectWs(url, headers, timeout);
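For orientation, a TTS configured with one fallback and custom connection options would serialize into session-create params shaped roughly like this. This is a sketch inferred from the mapping above, not a documented wire contract, and the concrete values are illustrative:

// extraKwargs is sent as `extra`, timeoutMs is converted to seconds,
// and maxRetry becomes `retries`.
const params = {
  sample_rate: '24000',
  encoding: 'pcm_s16le',
  extra: {},
  model: 'cartesia/sonic',
  voice: 'my-voice-id',
  fallback: {
    models: [{ model: 'elevenlabs/eleven_flash_v2', voice: 'voice123', extra: {} }],
  },
  connection: { timeout: 10, retries: 3 }, // from { timeoutMs: 10000, maxRetry: 3 }
};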
package/src/ipc/inference_proc_lazy_main.ts
CHANGED
@@ -36,7 +36,19 @@ const ORPHANED_TIMEOUT = 15 * 1000;
 
 const runners: { [id: string]: InferenceRunner } = await Promise.all(
   Object.entries(JSON.parse(process.argv[2]!)).map(async ([k, v]) => {
-    return [ … ];
+    return [
+      k,
+      await import(v as string).then((m) => {
+        // Handle both ESM (m.default is the class) and CJS (m.default.default is the class)
+        const Runner = typeof m.default === 'function' ? m.default : m.default?.default;
+        if (typeof Runner !== 'function') {
+          throw new Error(
+            `Unable to load inference runner: Missing or invalid default export in ${v}`,
+          );
+        }
+        return new Runner();
+      }),
+    ];
   }),
 ).then(Object.fromEntries);
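The `m.default.default` probe handles a common interop wrinkle: when an ESM-authored module is shipped as a CJS build and loaded via dynamic import(), Node wraps module.exports as the namespace's `default`, so the original default export lands one level deeper. A self-contained sketch of the same pattern, independent of this package:

// ESM build:  m.default         === MyRunner
// CJS build:  m.default         === { default: MyRunner, __esModule: true }
//             m.default.default === MyRunner
async function loadDefault<T>(specifier: string): Promise<T> {
  const m = await import(specifier);
  const value = typeof m.default === 'function' ? m.default : m.default?.default;
  if (value === undefined) {
    throw new Error(`Missing default export in ${specifier}`);
  }
  return value as T;
}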
package/src/ipc/job_proc_lazy_main.ts
CHANGED
@@ -1,7 +1,7 @@
 // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
 //
 // SPDX-License-Identifier: Apache-2.0
-import { Room, RoomEvent } from '@livekit/rtc-node';
+import { Room, RoomEvent, dispose } from '@livekit/rtc-node';
 import { EventEmitter, once } from 'node:events';
 import { pathToFileURL } from 'node:url';
 import type { Logger } from 'pino';
@@ -156,7 +156,11 @@ const startJob = (
   // [2] import.meta.filename of function containing entry file
   const moduleFile = process.argv[2];
   const agent: Agent = await import(pathToFileURL(moduleFile!).pathname).then((module) => {
-    …
+    // Handle both ESM (module.default is the agent) and CJS (module.default.default is the agent)
+    const agent =
+      typeof module.default === 'function' || isAgent(module.default)
+        ? module.default
+        : module.default?.default;
     if (agent === undefined || !isAgent(agent)) {
       throw new Error(`Unable to load agent: Missing or invalid default export in ${moduleFile}`);
     }
@@ -241,6 +245,18 @@ const startJob = (
 
   await join.await;
 
+  // Dispose native FFI resources (Rust FfiServer, tokio runtimes, libwebrtc)
+  // before process.exit() to prevent libc++abi mutex crash during teardown.
+  // Without this, process.exit() can kill the process while native threads are
+  // still running, causing: "mutex lock failed: Invalid argument"
+  // See: https://github.com/livekit/node-sdks/issues/564
+  try {
+    await dispose();
+    logger.debug('native resources disposed');
+  } catch (error) {
+    logger.warn({ error }, 'failed to dispose native resources');
+  }
+
   logger.debug('Job process shutdown');
   process.exit(0);
 }
package/src/ipc/supervised_proc.ts
CHANGED
@@ -10,12 +10,19 @@ import { Future } from '../utils.js';
 import type { IPCMessage } from './message.js';
 
 export interface ProcOpts {
+  /** Timeout for process initialization in milliseconds. */
   initializeTimeout: number;
+  /** Timeout for process shutdown in milliseconds. */
   closeTimeout: number;
+  /** Memory usage warning threshold in megabytes. */
   memoryWarnMB: number;
+  /** Memory usage limit in megabytes. */
   memoryLimitMB: number;
+  /** Interval for health check pings in milliseconds. */
   pingInterval: number;
+  /** Timeout waiting for pong response in milliseconds. */
   pingTimeout: number;
+  /** Threshold for warning about unresponsive processes in milliseconds. */
   highPingThreshold: number;
 }
 
package/src/stt/stt.ts
CHANGED
@@ -49,15 +49,22 @@ export enum SpeechEventType {
 
 /** SpeechData contains metadata about this {@link SpeechEvent}. */
 export interface SpeechData {
+  /** Language code of the speech. */
   language: string;
+  /** Transcribed text. */
   text: string;
+  /** Start time of the speech segment in seconds. */
   startTime: number;
+  /** End time of the speech segment in seconds. */
   endTime: number;
+  /** Confidence score of the transcription (0-1). */
   confidence: number;
+  /** Word-level timing information. */
   words?: TimedString[];
 }
 
 export interface RecognitionUsage {
+  /** Duration of the audio that was recognized in seconds. */
   audioDuration: number;
 }
 
@@ -344,6 +351,11 @@ export abstract class SpeechStream implements AsyncIterableIterator<SpeechEvent>
       }
     }
 
+    if (frame.samplesPerChannel === 0) {
+      this.input.put(frame);
+      return;
+    }
+
     if (this.resampler) {
       const frames = this.resampler.push(frame);
       for (const frame of frames) {
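With this guard, zero-sample frames bypass the resampler entirely and are forwarded as-is, which suggests they act as control markers that must survive the audio path (the parallel change in utils.ts below forwards them the same way). A sketch of producing such a marker frame — this assumes the `AudioFrame` constructor from `@livekit/rtc-node`, and the sample rate is illustrative:

import { AudioFrame } from '@livekit/rtc-node';

// An empty frame: zero samples, but it still carries format metadata.
// The push path above now forwards it without handing it to AudioResampler.
const flushMarker = new AudioFrame(new Int16Array(0), 16000, 1, 0);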
package/src/transcription.ts
CHANGED
@@ -13,11 +13,17 @@ import { AsyncIterableQueue, Future, shortuuid } from './utils.js';
 const STANDARD_SPEECH_RATE = 3830;
 
 export interface TextSyncOptions {
+  /** Language code for transcription. */
   language: string;
+  /** Speech speed multiplier. */
   speed: number;
+  /** Delay between sentences in milliseconds. */
   newSentenceDelay: number;
+  /** Tokenizer for splitting text into sentences. */
   sentenceTokenizer: SentenceTokenizer;
+  /** Function to hyphenate words. */
   hyphenateWord: (word: string) => string[];
+  /** Function to split text into words with positions. */
   splitWords: (words: string) => [string, number, number][];
 }
 
package/src/utils.ts
CHANGED
@@ -651,14 +651,22 @@ export function resampleStream({
   let resampler: AudioResampler | null = null;
   const transformStream = new TransformStream<AudioFrame, AudioFrame>({
     transform(chunk: AudioFrame, controller: TransformStreamDefaultController<AudioFrame>) {
+      if (chunk.samplesPerChannel === 0) {
+        controller.enqueue(chunk);
+        return;
+      }
       if (!resampler) {
        resampler = new AudioResampler(chunk.sampleRate, outputRate);
       }
       for (const frame of resampler.push(chunk)) {
         controller.enqueue(frame);
       }
-      …
-      …
+    },
+    flush(controller) {
+      if (resampler) {
+        for (const frame of resampler.flush()) {
+          controller.enqueue(frame);
+        }
       }
     },
   });
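The new flush() handler means samples still buffered in the resampler are emitted when the input stream closes instead of being dropped. A usage sketch — the parameter names are taken from the signature fragment above, `source` is a hypothetical ReadableStream<AudioFrame>, and the return value is assumed to be async-iterable as Node's ReadableStream is:

// Wraps the source in the TransformStream shown above; when `source`
// closes, flush() drains the resampler's remaining buffered samples.
const resampled = resampleStream({ stream: source, outputRate: 16000 });
for await (const frame of resampled) {
  // 16 kHz frames, now including the tail that flush() emits
}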
package/src/vad.ts
CHANGED
@@ -30,9 +30,9 @@ export interface VADEvent {
   samplesIndex: number;
   /** Timestamp when the event was fired. */
   timestamp: number;
-  /** Duration of the speech segment. */
+  /** Duration of the speech segment in seconds. */
   speechDuration: number;
-  /** Duration of the silence segment. */
+  /** Duration of the silence segment in seconds. */
   silenceDuration: number;
   /**
    * List of audio frames associated with the speech.
@@ -56,6 +56,7 @@ export interface VADEvent {
 }
 
 export interface VADCapabilities {
+  /** Duration of each VAD inference window in milliseconds. Used to batch metrics emissions to roughly once per second. */
   updateInterval: number;
 }
 
@@ -154,7 +155,7 @@ export abstract class VADStream implements AsyncIterableIterator<VADEvent> {
       switch (value.type) {
         case VADEventType.START_OF_SPEECH:
           inferenceCount++;
-          if (inferenceCount >= …
+          if (inferenceCount >= 1000 / this.#vad.capabilities.updateInterval) {
             this.#vad.emit('metrics_collected', {
               type: 'vad_metrics',
               timestamp: Date.now(),
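With `updateInterval` expressed in milliseconds, the new threshold batches metrics to roughly one emission per second of processed audio. A worked example — the 32 ms window is illustrative, matching a typical Silero-style VAD frame:

// 1000 ms / 32 ms per inference ≈ 31 inferences between emissions,
// i.e. metrics_collected fires about once per second of audio.
const updateInterval = 32; // ms, illustrative
const inferencesPerEmission = 1000 / updateInterval; // 31.25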
package/src/voice/agent_activity.ts
CHANGED
@@ -1023,7 +1023,7 @@
             toolChoice: toOaiToolChoice(toolChoice !== undefined ? toolChoice : this.toolChoice),
           },
           abortController,
-          instructions
+          instructions,
           userMessage,
         ),
       ),
package/src/voice/audio_recognition.ts
CHANGED
@@ -18,11 +18,17 @@ import type { TurnDetectionMode } from './agent_session.js';
 import type { STTNode } from './io.js';
 
 export interface EndOfTurnInfo {
+  /** The new transcript text from the user's speech. */
   newTranscript: string;
+  /** Confidence score of the transcript (0-1). */
   transcriptConfidence: number;
+  /** Delay from speech stop to final transcription in milliseconds. */
   transcriptionDelay: number;
+  /** Delay from speech stop to end of utterance detection in milliseconds. */
   endOfUtteranceDelay: number;
+  /** Timestamp when user started speaking (milliseconds since epoch). */
   startedSpeakingAt: number | undefined;
+  /** Timestamp when user stopped speaking (milliseconds since epoch). */
   stoppedSpeakingAt: number | undefined;
 }
 
@@ -50,13 +56,21 @@ export interface _TurnDetector {
 }
 
 export interface AudioRecognitionOptions {
+  /** Hooks for recognition events. */
   recognitionHooks: RecognitionHooks;
+  /** Speech-to-text node. */
   stt?: STTNode;
+  /** Voice activity detection. */
   vad?: VAD;
+  /** Turn detector for end-of-turn prediction. */
   turnDetector?: _TurnDetector;
+  /** Turn detection mode. */
   turnDetectionMode?: Exclude<TurnDetectionMode, _TurnDetector>;
+  /** Minimum endpointing delay in milliseconds. */
   minEndpointingDelay: number;
+  /** Maximum endpointing delay in milliseconds. */
   maxEndpointingDelay: number;
+  /** Root span context for tracing. */
   rootSpanContext?: Context;
 }
 