@livekit/agents 1.0.5 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +3 -0
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +2 -1
- package/dist/index.d.ts +2 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +2 -0
- package/dist/index.js.map +1 -1
- package/dist/inference/api_protos.cjs +104 -0
- package/dist/inference/api_protos.cjs.map +1 -0
- package/dist/inference/api_protos.d.cts +222 -0
- package/dist/inference/api_protos.d.ts +222 -0
- package/dist/inference/api_protos.d.ts.map +1 -0
- package/dist/inference/api_protos.js +70 -0
- package/dist/inference/api_protos.js.map +1 -0
- package/dist/inference/index.cjs +56 -0
- package/dist/inference/index.cjs.map +1 -0
- package/dist/inference/index.d.cts +9 -0
- package/dist/inference/index.d.ts +9 -0
- package/dist/inference/index.d.ts.map +1 -0
- package/dist/inference/index.js +16 -0
- package/dist/inference/index.js.map +1 -0
- package/dist/inference/llm.cjs +315 -0
- package/dist/inference/llm.cjs.map +1 -0
- package/dist/inference/llm.d.cts +92 -0
- package/dist/inference/llm.d.ts +92 -0
- package/dist/inference/llm.d.ts.map +1 -0
- package/dist/inference/llm.js +286 -0
- package/dist/inference/llm.js.map +1 -0
- package/dist/inference/stt.cjs +305 -0
- package/dist/inference/stt.cjs.map +1 -0
- package/dist/inference/stt.d.cts +79 -0
- package/dist/inference/stt.d.ts +79 -0
- package/dist/inference/stt.d.ts.map +1 -0
- package/dist/inference/stt.js +284 -0
- package/dist/inference/stt.js.map +1 -0
- package/dist/inference/tts.cjs +317 -0
- package/dist/inference/tts.cjs.map +1 -0
- package/dist/inference/tts.d.cts +75 -0
- package/dist/inference/tts.d.ts +75 -0
- package/dist/inference/tts.d.ts.map +1 -0
- package/dist/inference/tts.js +299 -0
- package/dist/inference/tts.js.map +1 -0
- package/dist/inference/utils.cjs +76 -0
- package/dist/inference/utils.cjs.map +1 -0
- package/dist/inference/utils.d.cts +5 -0
- package/dist/inference/utils.d.ts +5 -0
- package/dist/inference/utils.d.ts.map +1 -0
- package/dist/inference/utils.js +51 -0
- package/dist/inference/utils.js.map +1 -0
- package/dist/tts/tts.cjs +1 -1
- package/dist/tts/tts.cjs.map +1 -1
- package/dist/tts/tts.js +1 -1
- package/dist/tts/tts.js.map +1 -1
- package/dist/utils.cjs +11 -0
- package/dist/utils.cjs.map +1 -1
- package/dist/utils.d.cts +1 -0
- package/dist/utils.d.ts +1 -0
- package/dist/utils.d.ts.map +1 -1
- package/dist/utils.js +10 -0
- package/dist/utils.js.map +1 -1
- package/dist/voice/agent.cjs +16 -3
- package/dist/voice/agent.cjs.map +1 -1
- package/dist/voice/agent.d.cts +4 -3
- package/dist/voice/agent.d.ts +4 -3
- package/dist/voice/agent.d.ts.map +1 -1
- package/dist/voice/agent.js +20 -3
- package/dist/voice/agent.js.map +1 -1
- package/dist/voice/agent_session.cjs +16 -3
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +4 -3
- package/dist/voice/agent_session.d.ts +4 -3
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +20 -3
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/room_io/_input.cjs +9 -0
- package/dist/voice/room_io/_input.cjs.map +1 -1
- package/dist/voice/room_io/_input.d.ts.map +1 -1
- package/dist/voice/room_io/_input.js +10 -0
- package/dist/voice/room_io/_input.js.map +1 -1
- package/dist/worker.cjs.map +1 -1
- package/dist/worker.d.ts.map +1 -1
- package/dist/worker.js +1 -1
- package/dist/worker.js.map +1 -1
- package/package.json +3 -2
- package/src/index.ts +2 -1
- package/src/inference/api_protos.ts +82 -0
- package/src/inference/index.ts +12 -0
- package/src/inference/llm.ts +485 -0
- package/src/inference/stt.ts +414 -0
- package/src/inference/tts.ts +421 -0
- package/src/inference/utils.ts +66 -0
- package/src/tts/tts.ts +1 -1
- package/src/utils.ts +11 -0
- package/src/voice/agent.ts +30 -6
- package/src/voice/agent_session.ts +29 -6
- package/src/voice/room_io/_input.ts +12 -1
- package/src/worker.ts +2 -7
|
@@ -0,0 +1,421 @@
|
|
|
1
|
+
// SPDX-FileCopyrightText: 2025 LiveKit, Inc.
|
|
2
|
+
//
|
|
3
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
import type { AudioFrame } from '@livekit/rtc-node';
|
|
5
|
+
import { WebSocket } from 'ws';
|
|
6
|
+
import { APIError, APIStatusError } from '../_exceptions.js';
|
|
7
|
+
import { AudioByteStream } from '../audio.js';
|
|
8
|
+
import { log } from '../log.js';
|
|
9
|
+
import { createStreamChannel } from '../stream/stream_channel.js';
|
|
10
|
+
import { basic as tokenizeBasic } from '../tokenize/index.js';
|
|
11
|
+
import {
|
|
12
|
+
SynthesizeStream as BaseSynthesizeStream,
|
|
13
|
+
TTS as BaseTTS,
|
|
14
|
+
ChunkedStream,
|
|
15
|
+
} from '../tts/index.js';
|
|
16
|
+
import { type APIConnectOptions, DEFAULT_API_CONNECT_OPTIONS } from '../types.js';
|
|
17
|
+
import { shortuuid } from '../utils.js';
|
|
18
|
+
import {
|
|
19
|
+
type TtsClientEvent,
|
|
20
|
+
type TtsServerEvent,
|
|
21
|
+
type TtsSessionCreateEvent,
|
|
22
|
+
ttsClientEventSchema,
|
|
23
|
+
ttsServerEventSchema,
|
|
24
|
+
} from './api_protos.js';
|
|
25
|
+
import { type AnyModels, connectWs, createAccessToken } from './utils.js';
|
|
26
|
+
|
|
27
|
+
// Cartesia model identifiers; the bare 'cartesia' entry selects the provider default.
type _CartesiaModels = 'cartesia' | 'cartesia/sonic' | 'cartesia/sonic-2' | 'cartesia/sonic-turbo';

// A Cartesia model, optionally suffixed with ':voice_id' (the suffix is parsed
// off by the TTS constructor).
export type CartesiaModels = _CartesiaModels | `${_CartesiaModels}:${string}`;

type _ElevenlabsModels =
  | 'elevenlabs'
  | 'elevenlabs/eleven_flash_v2'
  | 'elevenlabs/eleven_flash_v2_5'
  | 'elevenlabs/eleven_turbo_v2'
  | 'elevenlabs/eleven_turbo_v2_5'
  | 'elevenlabs/eleven_multilingual_v2';

export type ElevenlabsModels = _ElevenlabsModels | `${_ElevenlabsModels}:${string}`;

// NOTE(review): unlike _CartesiaModels/_ElevenlabsModels above, the
// underscore-prefixed Rime/Inworld aliases below are exported — confirm
// whether that is intentional before unexporting.
export type _RimeModels = 'rime' | 'rime/mist' | 'rime/mistv2' | 'rime/arcana';

export type RimeModels = _RimeModels | `${_RimeModels}:${string}`;

export type _InworldModels = 'inworld' | 'inworld/inworld-tts-1';

export type InworldModels = _InworldModels | `${_InworldModels}:${string}`;

// Provider-specific option bags, forwarded to the gateway via `extraKwargs`.
export interface CartesiaOptions {
  duration?: number; // max duration of audio in seconds
  speed?: 'slow' | 'normal' | 'fast'; // default: not specified
}

export interface ElevenlabsOptions {
  inactivity_timeout?: number; // default: 60
  apply_text_normalization?: 'auto' | 'off' | 'on'; // default: "auto"
}

// No provider-specific options are currently defined for Rime or Inworld.
export interface RimeOptions {}

export interface InworldOptions {}

// Any known provider model, or (via AnyModels) an arbitrary model string.
export type TTSModels = CartesiaModels | ElevenlabsModels | RimeModels | InworldModels | AnyModels;
|
|
64
|
+
|
|
65
|
+
export type TTSOptions<TModel extends TTSModels> = TModel extends CartesiaModels
|
|
66
|
+
? CartesiaOptions
|
|
67
|
+
: TModel extends ElevenlabsModels
|
|
68
|
+
? ElevenlabsOptions
|
|
69
|
+
: TModel extends RimeOptions
|
|
70
|
+
? RimeOptions
|
|
71
|
+
: TModel extends InworldOptions
|
|
72
|
+
? InworldOptions
|
|
73
|
+
: Record<string, unknown>;
|
|
74
|
+
|
|
75
|
+
// Only raw 16-bit little-endian PCM is currently supported by the gateway.
type TTSEncoding = 'pcm_s16le';

const DEFAULT_ENCODING: TTSEncoding = 'pcm_s16le';
const DEFAULT_SAMPLE_RATE = 16000; // Hz
const DEFAULT_BASE_URL = 'https://agent-gateway.livekit.cloud/v1';
const NUM_CHANNELS = 1; // mono output audio
const DEFAULT_LANGUAGE = 'en';
|
|
82
|
+
|
|
83
|
+
// Fully-resolved TTS configuration held by a TTS instance after the
// constructor has applied defaults, environment-variable fallbacks, and
// model:voice_id parsing.
export interface InferenceTTSOptions<TModel extends TTSModels> {
  model?: TModel;
  voice?: string;
  language?: string;
  encoding: TTSEncoding;
  sampleRate: number; // Hz
  baseURL: string; // gateway endpoint; http(s) URLs are rewritten to ws(s) on connect
  apiKey: string;
  apiSecret: string;
  extraKwargs: TTSOptions<TModel>; // provider-specific options, sent in session.create
}
|
|
94
|
+
|
|
95
|
+
/**
 * TTS adapter that synthesizes speech through the LiveKit inference gateway.
 *
 * Credentials and the gateway URL are resolved from constructor arguments
 * first, then from LIVEKIT_INFERENCE_* variables, then LIVEKIT_* variables.
 * Only streaming synthesis is supported; `synthesize()` throws.
 */
export class TTS<TModel extends TTSModels> extends BaseTTS {
  private opts: InferenceTTSOptions<TModel>;
  // Streams created via stream(); tracked so updateOptions()/close() reach them.
  private streams: Set<SynthesizeStream<TModel>> = new Set();

  #logger = log();

  /**
   * @throws Error when no API key / secret can be resolved from the
   *   arguments or the environment.
   */
  constructor(opts: {
    model: TModel;
    voice?: string;
    language?: string; // defaults to 'en'
    baseURL?: string; // defaults to LIVEKIT_INFERENCE_URL or the public gateway
    encoding?: TTSEncoding; // defaults to 'pcm_s16le'
    sampleRate?: number; // defaults to 16000 Hz
    apiKey?: string;
    apiSecret?: string;
    extraKwargs?: TTSOptions<TModel>; // provider-specific options, forwarded verbatim
  }) {
    const sampleRate = opts?.sampleRate ?? DEFAULT_SAMPLE_RATE;
    // NOTE(review): the literal 1 duplicates NUM_CHANNELS — consider using the constant.
    super(sampleRate, 1, { streaming: true });

    const {
      model,
      voice,
      language = DEFAULT_LANGUAGE,
      baseURL,
      encoding = DEFAULT_ENCODING,
      apiKey,
      apiSecret,
      extraKwargs = {} as TTSOptions<TModel>,
    } = opts || {};

    // Resolution order: explicit argument → inference-specific env var → generic env var.
    const lkBaseURL = baseURL || process.env.LIVEKIT_INFERENCE_URL || DEFAULT_BASE_URL;
    const lkApiKey = apiKey || process.env.LIVEKIT_INFERENCE_API_KEY || process.env.LIVEKIT_API_KEY;
    if (!lkApiKey) {
      throw new Error('apiKey is required: pass apiKey or set LIVEKIT_API_KEY');
    }

    const lkApiSecret =
      apiSecret || process.env.LIVEKIT_INFERENCE_API_SECRET || process.env.LIVEKIT_API_SECRET;
    if (!lkApiSecret) {
      throw new Error('apiSecret is required: pass apiSecret or set LIVEKIT_API_SECRET');
    }

    // read voice id from the model if provided: "provider/model:voice_id"
    let nextModel = model;
    let nextVoice = voice;
    if (typeof nextModel === 'string') {
      const idx = nextModel.lastIndexOf(':');
      if (idx !== -1) {
        const voiceFromModel = nextModel.slice(idx + 1);
        // An explicit `voice` argument wins over the model suffix (with a warning).
        if (nextVoice && nextVoice !== voiceFromModel) {
          this.#logger.warn(
            '`voice` is provided via both argument and model, using the one from the argument',
            { voice: nextVoice, model: nextModel },
          );
        } else {
          nextVoice = voiceFromModel;
        }
        // Strip the ':voice_id' suffix so only "provider/model" is sent to the gateway.
        nextModel = nextModel.slice(0, idx) as TModel;
      }
    }

    this.opts = {
      model: nextModel,
      voice: nextVoice,
      language,
      encoding,
      sampleRate,
      baseURL: lkBaseURL,
      apiKey: lkApiKey,
      apiSecret: lkApiSecret,
      extraKwargs,
    };
  }

  get label() {
    return 'inference.TTS';
  }

  /**
   * Updates model/voice/language and propagates the change to every stream
   * already created by this instance.
   */
  updateOptions(opts: Partial<Pick<InferenceTTSOptions<TModel>, 'model' | 'voice' | 'language'>>) {
    this.opts = { ...this.opts, ...opts };
    for (const stream of this.streams) {
      stream.updateOptions(opts);
    }
  }

  /** Non-streaming synthesis is not supported by this adapter. */
  synthesize(_: string): ChunkedStream {
    throw new Error('ChunkedStream is not implemented');
  }

  /** Creates a new streaming synthesis session and registers it for tracking. */
  stream(options?: { connOptions?: APIConnectOptions }): SynthesizeStream<TModel> {
    const { connOptions = DEFAULT_API_CONNECT_OPTIONS } = options || {};
    const stream = new SynthesizeStream(this, { ...this.opts }, connOptions);
    this.streams.add(stream);
    return stream;
  }

  /**
   * Opens an authenticated WebSocket to the gateway's /tts endpoint and sends
   * the initial session.create message.
   *
   * @param timeout - connection timeout in milliseconds.
   */
  async connectWs(timeout: number): Promise<WebSocket> {
    let baseURL = this.opts.baseURL;
    if (baseURL.startsWith('http://') || baseURL.startsWith('https://')) {
      // Replaces the first occurrence only, i.e. the scheme:
      // http:// -> ws://, https:// -> wss://
      baseURL = baseURL.replace('http', 'ws');
    }

    const token = await createAccessToken(this.opts.apiKey, this.opts.apiSecret);
    const url = `${baseURL}/tts`;
    const headers = { Authorization: `Bearer ${token}` } as Record<string, string>;

    const params = {
      type: 'session.create',
      sample_rate: String(this.opts.sampleRate),
      encoding: this.opts.encoding,
      extra: this.opts.extraKwargs,
    } as TtsSessionCreateEvent;

    // Optional fields are only included when set.
    if (this.opts.voice) params.voice = this.opts.voice;
    if (this.opts.model) params.model = this.opts.model;
    if (this.opts.language) params.language = this.opts.language;

    const socket = await connectWs(url, headers, timeout);
    socket.send(JSON.stringify(params));
    return socket;
  }

  // NOTE(review): ws's close() returns void — the await is a no-op and does
  // not wait for the close handshake to complete; confirm intent.
  async closeWs(ws: WebSocket) {
    await ws.close();
  }

  /** Closes every tracked stream and forgets them. */
  async close() {
    for (const stream of this.streams) {
      await stream.close();
    }
    this.streams.clear();
  }
}
|
|
229
|
+
|
|
230
|
+
/**
 * Streaming synthesis session against the LiveKit inference gateway.
 *
 * run() wires four concurrent tasks together: input text → sentence
 * tokenizer → WebSocket client events, and WebSocket server events →
 * audio frames on this.queue.
 */
export class SynthesizeStream<TModel extends TTSModels> extends BaseSynthesizeStream {
  private opts: InferenceTTSOptions<TModel>;
  private tts: TTS<TModel>;
  private connOptions: APIConnectOptions;

  #logger = log();

  constructor(tts: TTS<TModel>, opts: InferenceTTSOptions<TModel>, connOptions: APIConnectOptions) {
    super(tts, connOptions);
    this.opts = opts;
    this.tts = tts;
    this.connOptions = connOptions;
  }

  get label() {
    return 'inference.SynthesizeStream';
  }

  // Only affects future messages built from this.opts; the session.create
  // already sent by connectWs() is not re-issued.
  updateOptions(opts: Partial<Pick<InferenceTTSOptions<TModel>, 'model' | 'voice' | 'language'>>) {
    this.opts = { ...this.opts, ...opts };
  }

  protected async run(): Promise<void> {
    let ws: WebSocket | null = null;
    let closing = false; // set once cleanup has started; guards double-cleanup
    let finalReceived = false; // set when the gateway's 'done' event arrives
    let lastFrame: AudioFrame | undefined; // one-frame buffer; see sendLastFrame

    const sendTokenizerStream = new tokenizeBasic.SentenceTokenizer().stream();
    const eventChannel = createStreamChannel<TtsServerEvent>();
    const requestId = shortuuid('tts_request_');

    // Idempotent teardown of tokenizer, event channel, and socket.
    const resourceCleanup = () => {
      if (closing) return;
      closing = true;
      sendTokenizerStream.close();
      eventChannel.close();
      ws?.removeAllListeners();
      ws?.close();
    };

    // Validates and sends a client event; silently drops it (with a warning)
    // when the socket is not open.
    const sendClientEvent = async (event: TtsClientEvent) => {
      const validatedEvent = await ttsClientEventSchema.parseAsync(event);
      if (!ws || ws.readyState !== WebSocket.OPEN) {
        this.#logger.warn('Trying to send client TTS event to a closed WebSocket');
        return;
      }
      ws.send(JSON.stringify(validatedEvent));
    };

    // Frames are emitted one behind: each new frame pushes out the previous
    // one, so that the very last frame can be flagged final=true on 'done'.
    const sendLastFrame = (segmentId: string, final: boolean) => {
      if (lastFrame) {
        this.queue.put({ requestId, segmentId, frame: lastFrame, final });
        lastFrame = undefined;
      }
    };

    // Task 1: forward caller-pushed text (and flush sentinels) into the tokenizer.
    const createInputTask = async () => {
      for await (const data of this.input) {
        if (this.abortController.signal.aborted) break;
        if (data === SynthesizeStream.FLUSH_SENTINEL) {
          sendTokenizerStream.flush();
          continue;
        }
        sendTokenizerStream.pushText(data);
      }
      sendTokenizerStream.endInput();
    };

    // Task 2: forward tokenized sentences to the gateway, then flush the session.
    // NOTE(review): sendClientEvent is async but its promise is not awaited
    // here — send ordering relies on ws.send being called synchronously after
    // validation; rejections would be unhandled.
    const createSentenceStreamTask = async () => {
      for await (const ev of sendTokenizerStream) {
        if (this.abortController.signal.aborted) break;

        sendClientEvent({
          type: 'input_transcript',
          transcript: ev.token + ' ',
        });
      }

      sendClientEvent({ type: 'session.flush' });
    };

    // Task 3: pump raw WebSocket messages into eventChannel; settles when the
    // socket closes or errors, or when the run is aborted.
    const createWsListenerTask = async (ws: WebSocket) => {
      return new Promise<void>((resolve, reject) => {
        this.abortController.signal.addEventListener('abort', () => {
          resourceCleanup();
          reject(new Error('WebSocket connection aborted'));
        });

        ws.on('message', async (data) => {
          const eventJson = JSON.parse(data.toString()) as Record<string, unknown>;
          const validatedEvent = ttsServerEventSchema.parse(eventJson);
          eventChannel.write(validatedEvent);
        });

        ws.on('error', (e) => {
          this.#logger.error({ error: e }, 'WebSocket error');
          resourceCleanup();
          reject(e);
        });

        ws.on('close', () => {
          resourceCleanup();

          // NOTE(review): this branch is unreachable — resourceCleanup() above
          // always sets closing=true before the check, so the "unexpected
          // close" log can never fire. Capture the prior value of `closing`
          // before calling resourceCleanup() if the log is wanted.
          if (!closing) return this.#logger.error('WebSocket closed unexpectedly');
          if (finalReceived) return resolve();

          reject(
            new APIStatusError({
              message: 'Gateway connection closed unexpectedly',
              options: { requestId },
            }),
          );
        });
      });
    };

    // Task 4: consume validated server events and convert audio payloads into
    // AudioFrames on this.queue.
    const createRecvTask = async () => {
      let currentSessionId: string | null = null;

      const bstream = new AudioByteStream(this.opts.sampleRate, NUM_CHANNELS);
      const serverEventStream = eventChannel.stream();
      const reader = serverEventStream.getReader();

      try {
        while (!this.closed && !this.abortController.signal.aborted) {
          const result = await reader.read();
          if (this.abortController.signal.aborted) return;
          if (result.done) return;

          const serverEvent = result.value;
          switch (serverEvent.type) {
            case 'session.created':
              currentSessionId = serverEvent.session_id;
              break;
            case 'output_audio':
              // Decode base64 PCM and push complete frames through the
              // one-frame delay buffer (final=false while audio keeps coming).
              const base64Data = new Int8Array(Buffer.from(serverEvent.audio, 'base64'));
              for (const frame of bstream.write(base64Data.buffer)) {
                sendLastFrame(currentSessionId!, false);
                lastFrame = frame;
              }
              break;
            case 'done':
              // Flush remaining buffered audio, mark the last frame final,
              // then signal end-of-stream to consumers.
              finalReceived = true;
              for (const frame of bstream.flush()) {
                sendLastFrame(currentSessionId!, false);
                lastFrame = frame;
              }
              sendLastFrame(currentSessionId!, true);
              this.queue.put(SynthesizeStream.END_OF_STREAM);
              break;
            case 'session.closed':
              resourceCleanup();
              break;
            case 'error':
              this.#logger.error(
                { serverEvent },
                'Received error message from LiveKit TTS WebSocket',
              );
              resourceCleanup();
              throw new APIError(`LiveKit TTS returned error: ${serverEvent.message}`);
            default:
              this.#logger.warn('Unexpected message %s', serverEvent);
              break;
          }
        }
      } finally {
        reader.releaseLock();
        try {
          await serverEventStream.cancel();
        } catch (e) {
          this.#logger.debug('Error cancelling serverEventStream (may already be cancelled):', e);
        }
      }
    };

    try {
      ws = await this.tts.connectWs(this.connOptions.timeoutMs);

      await Promise.all([
        createInputTask(),
        createSentenceStreamTask(),
        createWsListenerTask(ws),
        createRecvTask(),
      ]);
    } catch (e) {
      // Errors are logged, not rethrown; the stream ends via resourceCleanup().
      this.#logger.error('Error in SynthesizeStream', { error: e });
    } finally {
      resourceCleanup();
    }
  }
}
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
// SPDX-FileCopyrightText: 2025 LiveKit, Inc.
|
|
2
|
+
//
|
|
3
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
import { AccessToken } from 'livekit-server-sdk';
|
|
5
|
+
import { WebSocket } from 'ws';
|
|
6
|
+
import { APIConnectionError, APIStatusError } from '../index.js';
|
|
7
|
+
|
|
8
|
+
export type AnyModels = string & NonNullable<unknown>;
|
|
9
|
+
|
|
10
|
+
export async function createAccessToken(
|
|
11
|
+
apiKey: string,
|
|
12
|
+
apiSecret: string,
|
|
13
|
+
ttl: number = 600,
|
|
14
|
+
): Promise<string> {
|
|
15
|
+
const token = new AccessToken(apiKey, apiSecret, { identity: 'agent', ttl });
|
|
16
|
+
token.addInferenceGrant({ perform: true });
|
|
17
|
+
|
|
18
|
+
return await token.toJwt();
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
export async function connectWs(
|
|
22
|
+
url: string,
|
|
23
|
+
headers: Record<string, string>,
|
|
24
|
+
timeoutMs: number,
|
|
25
|
+
): Promise<WebSocket> {
|
|
26
|
+
return new Promise<WebSocket>((resolve, reject) => {
|
|
27
|
+
const socket = new WebSocket(url, { headers: headers });
|
|
28
|
+
|
|
29
|
+
const timeout = setTimeout(() => {
|
|
30
|
+
reject(new APIConnectionError({ message: 'Timeout connecting to LiveKit WebSocket' }));
|
|
31
|
+
}, timeoutMs);
|
|
32
|
+
|
|
33
|
+
const onOpen = () => {
|
|
34
|
+
clearTimeout(timeout);
|
|
35
|
+
resolve(socket);
|
|
36
|
+
};
|
|
37
|
+
|
|
38
|
+
const onError = (err: unknown) => {
|
|
39
|
+
clearTimeout(timeout);
|
|
40
|
+
if (err && typeof err === 'object' && 'code' in err && (err as any).code === 429) {
|
|
41
|
+
reject(
|
|
42
|
+
new APIStatusError({
|
|
43
|
+
message: 'LiveKit gateway quota exceeded',
|
|
44
|
+
options: { statusCode: 429 },
|
|
45
|
+
}),
|
|
46
|
+
);
|
|
47
|
+
} else {
|
|
48
|
+
reject(new APIConnectionError({ message: 'Error connecting to LiveKit WebSocket' }));
|
|
49
|
+
}
|
|
50
|
+
};
|
|
51
|
+
|
|
52
|
+
const onClose = (code: number) => {
|
|
53
|
+
clearTimeout(timeout);
|
|
54
|
+
if (code !== 1000) {
|
|
55
|
+
reject(
|
|
56
|
+
new APIConnectionError({
|
|
57
|
+
message: 'Connection closed unexpectedly',
|
|
58
|
+
}),
|
|
59
|
+
);
|
|
60
|
+
}
|
|
61
|
+
};
|
|
62
|
+
socket.once('open', onOpen);
|
|
63
|
+
socket.once('error', onError);
|
|
64
|
+
socket.once('close', onClose);
|
|
65
|
+
});
|
|
66
|
+
}
|
package/src/tts/tts.ts
CHANGED
|
@@ -443,7 +443,7 @@ export abstract class ChunkedStream implements AsyncIterableIterator<Synthesized
|
|
|
443
443
|
for await (const audio of this.queue) {
|
|
444
444
|
this.output.put(audio);
|
|
445
445
|
requestId = audio.requestId;
|
|
446
|
-
if (
|
|
446
|
+
if (ttfb === BigInt(-1)) {
|
|
447
447
|
ttfb = process.hrtime.bigint() - startTime;
|
|
448
448
|
}
|
|
449
449
|
audioDuration += audio.frame.samplesPerChannel / audio.frame.sampleRate;
|
package/src/utils.ts
CHANGED
|
@@ -817,3 +817,14 @@ export async function waitForTrackPublication({
|
|
|
817
817
|
room.off(RoomEvent.TrackPublished, onTrackPublished);
|
|
818
818
|
}
|
|
819
819
|
}
|
|
820
|
+
|
|
821
|
+
export async function waitForAbort(signal: AbortSignal) {
|
|
822
|
+
const abortFuture = new Future<void>();
|
|
823
|
+
const handler = () => {
|
|
824
|
+
abortFuture.resolve();
|
|
825
|
+
signal.removeEventListener('abort', handler);
|
|
826
|
+
};
|
|
827
|
+
|
|
828
|
+
signal.addEventListener('abort', handler, { once: true });
|
|
829
|
+
return await abortFuture.await;
|
|
830
|
+
}
|
package/src/voice/agent.ts
CHANGED
|
@@ -4,6 +4,14 @@
|
|
|
4
4
|
import type { AudioFrame } from '@livekit/rtc-node';
|
|
5
5
|
import { AsyncLocalStorage } from 'node:async_hooks';
|
|
6
6
|
import { ReadableStream } from 'node:stream/web';
|
|
7
|
+
import {
|
|
8
|
+
LLM as InferenceLLM,
|
|
9
|
+
STT as InferenceSTT,
|
|
10
|
+
TTS as InferenceTTS,
|
|
11
|
+
type LLMModels,
|
|
12
|
+
type STTModels,
|
|
13
|
+
type TTSModels,
|
|
14
|
+
} from '../inference/index.js';
|
|
7
15
|
import { ReadonlyChatContext } from '../llm/chat_context.js';
|
|
8
16
|
import type { ChatMessage, FunctionCall, RealtimeModel } from '../llm/index.js';
|
|
9
17
|
import {
|
|
@@ -55,10 +63,10 @@ export interface AgentOptions<UserData> {
|
|
|
55
63
|
chatCtx?: ChatContext;
|
|
56
64
|
tools?: ToolContext<UserData>;
|
|
57
65
|
turnDetection?: TurnDetectionMode;
|
|
58
|
-
stt?: STT;
|
|
66
|
+
stt?: STT | STTModels;
|
|
59
67
|
vad?: VAD;
|
|
60
|
-
llm?: LLM | RealtimeModel;
|
|
61
|
-
tts?: TTS;
|
|
68
|
+
llm?: LLM | RealtimeModel | LLMModels;
|
|
69
|
+
tts?: TTS | TTSModels;
|
|
62
70
|
allowInterruptions?: boolean;
|
|
63
71
|
minConsecutiveSpeechDelay?: number;
|
|
64
72
|
}
|
|
@@ -101,10 +109,26 @@ export class Agent<UserData = any> {
|
|
|
101
109
|
: ChatContext.empty();
|
|
102
110
|
|
|
103
111
|
this.turnDetection = turnDetection;
|
|
104
|
-
this._stt = stt;
|
|
105
112
|
this._vad = vad;
|
|
106
|
-
|
|
107
|
-
|
|
113
|
+
|
|
114
|
+
if (typeof stt === 'string') {
|
|
115
|
+
this._stt = new InferenceSTT({ model: stt });
|
|
116
|
+
} else {
|
|
117
|
+
this._stt = stt;
|
|
118
|
+
}
|
|
119
|
+
|
|
120
|
+
if (typeof llm === 'string') {
|
|
121
|
+
this._llm = new InferenceLLM({ model: llm });
|
|
122
|
+
} else {
|
|
123
|
+
this._llm = llm;
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
if (typeof tts === 'string') {
|
|
127
|
+
this._tts = new InferenceTTS({ model: tts });
|
|
128
|
+
} else {
|
|
129
|
+
this._tts = tts;
|
|
130
|
+
}
|
|
131
|
+
|
|
108
132
|
this._agentActivity = undefined;
|
|
109
133
|
}
|
|
110
134
|
|
|
@@ -5,6 +5,14 @@ import type { AudioFrame, Room } from '@livekit/rtc-node';
|
|
|
5
5
|
import type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';
|
|
6
6
|
import { EventEmitter } from 'node:events';
|
|
7
7
|
import type { ReadableStream } from 'node:stream/web';
|
|
8
|
+
import {
|
|
9
|
+
LLM as InferenceLLM,
|
|
10
|
+
STT as InferenceSTT,
|
|
11
|
+
TTS as InferenceTTS,
|
|
12
|
+
type LLMModels,
|
|
13
|
+
type STTModels,
|
|
14
|
+
type TTSModels,
|
|
15
|
+
} from '../inference/index.js';
|
|
8
16
|
import { getJobContext } from '../job.js';
|
|
9
17
|
import { ChatContext, ChatMessage } from '../llm/chat_context.js';
|
|
10
18
|
import type { LLM, RealtimeModel, RealtimeModelError, ToolChoice } from '../llm/index.js';
|
|
@@ -77,10 +85,10 @@ export type AgentSessionCallbacks = {
|
|
|
77
85
|
|
|
78
86
|
export type AgentSessionOptions<UserData = UnknownUserData> = {
|
|
79
87
|
turnDetection?: TurnDetectionMode;
|
|
80
|
-
stt?: STT;
|
|
88
|
+
stt?: STT | STTModels;
|
|
81
89
|
vad?: VAD;
|
|
82
|
-
llm?: LLM | RealtimeModel;
|
|
83
|
-
tts?: TTS;
|
|
90
|
+
llm?: LLM | RealtimeModel | LLMModels;
|
|
91
|
+
tts?: TTS | TTSModels;
|
|
84
92
|
userData?: UserData;
|
|
85
93
|
voiceOptions?: Partial<VoiceOptions>;
|
|
86
94
|
};
|
|
@@ -128,9 +136,24 @@ export class AgentSession<
|
|
|
128
136
|
} = opts;
|
|
129
137
|
|
|
130
138
|
this.vad = vad;
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
139
|
+
|
|
140
|
+
if (typeof stt === 'string') {
|
|
141
|
+
this.stt = new InferenceSTT({ model: stt });
|
|
142
|
+
} else {
|
|
143
|
+
this.stt = stt;
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
if (typeof llm === 'string') {
|
|
147
|
+
this.llm = new InferenceLLM({ model: llm });
|
|
148
|
+
} else {
|
|
149
|
+
this.llm = llm;
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
if (typeof tts === 'string') {
|
|
153
|
+
this.tts = new InferenceTTS({ model: tts });
|
|
154
|
+
} else {
|
|
155
|
+
this.tts = tts;
|
|
156
|
+
}
|
|
134
157
|
this.turnDetection = turnDetection;
|
|
135
158
|
this._userData = userData;
|
|
136
159
|
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
// SPDX-FileCopyrightText: 2025 LiveKit, Inc.
|
|
2
2
|
//
|
|
3
3
|
// SPDX-License-Identifier: Apache-2.0
|
|
4
|
-
import type { AudioFrame } from '@livekit/rtc-node';
|
|
5
4
|
import {
|
|
5
|
+
AudioFrame,
|
|
6
6
|
AudioStream,
|
|
7
7
|
type NoiseCancellationOptions,
|
|
8
8
|
RemoteParticipant,
|
|
@@ -66,6 +66,17 @@ export class ParticipantAudioInputStream extends AudioInput {
|
|
|
66
66
|
? participant
|
|
67
67
|
: this.room.remoteParticipants.get(participantIdentity);
|
|
68
68
|
|
|
69
|
+
// Convert Map iterator to array for Pino serialization
|
|
70
|
+
const trackPublicationsArray = Array.from(participantValue?.trackPublications.values() ?? []);
|
|
71
|
+
|
|
72
|
+
this.logger.info(
|
|
73
|
+
{
|
|
74
|
+
participantValue: participantValue?.identity,
|
|
75
|
+
trackPublications: trackPublicationsArray,
|
|
76
|
+
lengthOfTrackPublications: trackPublicationsArray.length,
|
|
77
|
+
},
|
|
78
|
+
'participantValue.trackPublications',
|
|
79
|
+
);
|
|
69
80
|
// We need to check if the participant has a microphone track and subscribe to it
|
|
70
81
|
// in case we miss the tracksubscribed event
|
|
71
82
|
if (participantValue) {
|
package/src/worker.ts
CHANGED
|
@@ -1,12 +1,7 @@
|
|
|
1
1
|
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
|
|
2
2
|
//
|
|
3
3
|
// SPDX-License-Identifier: Apache-2.0
|
|
4
|
-
import type {
|
|
5
|
-
JobAssignment,
|
|
6
|
-
JobTermination,
|
|
7
|
-
ParticipantInfo,
|
|
8
|
-
TrackSource,
|
|
9
|
-
} from '@livekit/protocol';
|
|
4
|
+
import type { JobAssignment, JobTermination, TrackSource } from '@livekit/protocol';
|
|
10
5
|
import {
|
|
11
6
|
type AvailabilityRequest,
|
|
12
7
|
JobType,
|
|
@@ -15,7 +10,7 @@ import {
|
|
|
15
10
|
WorkerMessage,
|
|
16
11
|
WorkerStatus,
|
|
17
12
|
} from '@livekit/protocol';
|
|
18
|
-
import { AccessToken, RoomServiceClient } from 'livekit-server-sdk';
|
|
13
|
+
import { AccessToken, ParticipantInfo, RoomServiceClient } from 'livekit-server-sdk';
|
|
19
14
|
import { EventEmitter } from 'node:events';
|
|
20
15
|
import os from 'node:os';
|
|
21
16
|
import { WebSocket } from 'ws';
|