@livekit/agents 1.0.5 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97) hide show
  1. package/dist/index.cjs +3 -0
  2. package/dist/index.cjs.map +1 -1
  3. package/dist/index.d.cts +2 -1
  4. package/dist/index.d.ts +2 -1
  5. package/dist/index.d.ts.map +1 -1
  6. package/dist/index.js +2 -0
  7. package/dist/index.js.map +1 -1
  8. package/dist/inference/api_protos.cjs +104 -0
  9. package/dist/inference/api_protos.cjs.map +1 -0
  10. package/dist/inference/api_protos.d.cts +222 -0
  11. package/dist/inference/api_protos.d.ts +222 -0
  12. package/dist/inference/api_protos.d.ts.map +1 -0
  13. package/dist/inference/api_protos.js +70 -0
  14. package/dist/inference/api_protos.js.map +1 -0
  15. package/dist/inference/index.cjs +56 -0
  16. package/dist/inference/index.cjs.map +1 -0
  17. package/dist/inference/index.d.cts +9 -0
  18. package/dist/inference/index.d.ts +9 -0
  19. package/dist/inference/index.d.ts.map +1 -0
  20. package/dist/inference/index.js +16 -0
  21. package/dist/inference/index.js.map +1 -0
  22. package/dist/inference/llm.cjs +315 -0
  23. package/dist/inference/llm.cjs.map +1 -0
  24. package/dist/inference/llm.d.cts +92 -0
  25. package/dist/inference/llm.d.ts +92 -0
  26. package/dist/inference/llm.d.ts.map +1 -0
  27. package/dist/inference/llm.js +286 -0
  28. package/dist/inference/llm.js.map +1 -0
  29. package/dist/inference/stt.cjs +305 -0
  30. package/dist/inference/stt.cjs.map +1 -0
  31. package/dist/inference/stt.d.cts +79 -0
  32. package/dist/inference/stt.d.ts +79 -0
  33. package/dist/inference/stt.d.ts.map +1 -0
  34. package/dist/inference/stt.js +284 -0
  35. package/dist/inference/stt.js.map +1 -0
  36. package/dist/inference/tts.cjs +317 -0
  37. package/dist/inference/tts.cjs.map +1 -0
  38. package/dist/inference/tts.d.cts +75 -0
  39. package/dist/inference/tts.d.ts +75 -0
  40. package/dist/inference/tts.d.ts.map +1 -0
  41. package/dist/inference/tts.js +299 -0
  42. package/dist/inference/tts.js.map +1 -0
  43. package/dist/inference/utils.cjs +76 -0
  44. package/dist/inference/utils.cjs.map +1 -0
  45. package/dist/inference/utils.d.cts +5 -0
  46. package/dist/inference/utils.d.ts +5 -0
  47. package/dist/inference/utils.d.ts.map +1 -0
  48. package/dist/inference/utils.js +51 -0
  49. package/dist/inference/utils.js.map +1 -0
  50. package/dist/tts/tts.cjs +1 -1
  51. package/dist/tts/tts.cjs.map +1 -1
  52. package/dist/tts/tts.js +1 -1
  53. package/dist/tts/tts.js.map +1 -1
  54. package/dist/utils.cjs +11 -0
  55. package/dist/utils.cjs.map +1 -1
  56. package/dist/utils.d.cts +1 -0
  57. package/dist/utils.d.ts +1 -0
  58. package/dist/utils.d.ts.map +1 -1
  59. package/dist/utils.js +10 -0
  60. package/dist/utils.js.map +1 -1
  61. package/dist/voice/agent.cjs +16 -3
  62. package/dist/voice/agent.cjs.map +1 -1
  63. package/dist/voice/agent.d.cts +4 -3
  64. package/dist/voice/agent.d.ts +4 -3
  65. package/dist/voice/agent.d.ts.map +1 -1
  66. package/dist/voice/agent.js +20 -3
  67. package/dist/voice/agent.js.map +1 -1
  68. package/dist/voice/agent_session.cjs +16 -3
  69. package/dist/voice/agent_session.cjs.map +1 -1
  70. package/dist/voice/agent_session.d.cts +4 -3
  71. package/dist/voice/agent_session.d.ts +4 -3
  72. package/dist/voice/agent_session.d.ts.map +1 -1
  73. package/dist/voice/agent_session.js +20 -3
  74. package/dist/voice/agent_session.js.map +1 -1
  75. package/dist/voice/room_io/_input.cjs +9 -0
  76. package/dist/voice/room_io/_input.cjs.map +1 -1
  77. package/dist/voice/room_io/_input.d.ts.map +1 -1
  78. package/dist/voice/room_io/_input.js +10 -0
  79. package/dist/voice/room_io/_input.js.map +1 -1
  80. package/dist/worker.cjs.map +1 -1
  81. package/dist/worker.d.ts.map +1 -1
  82. package/dist/worker.js +1 -1
  83. package/dist/worker.js.map +1 -1
  84. package/package.json +3 -2
  85. package/src/index.ts +2 -1
  86. package/src/inference/api_protos.ts +82 -0
  87. package/src/inference/index.ts +12 -0
  88. package/src/inference/llm.ts +485 -0
  89. package/src/inference/stt.ts +414 -0
  90. package/src/inference/tts.ts +421 -0
  91. package/src/inference/utils.ts +66 -0
  92. package/src/tts/tts.ts +1 -1
  93. package/src/utils.ts +11 -0
  94. package/src/voice/agent.ts +30 -6
  95. package/src/voice/agent_session.ts +29 -6
  96. package/src/voice/room_io/_input.ts +12 -1
  97. package/src/worker.ts +2 -7
@@ -0,0 +1,421 @@
1
+ // SPDX-FileCopyrightText: 2025 LiveKit, Inc.
2
+ //
3
+ // SPDX-License-Identifier: Apache-2.0
4
+ import type { AudioFrame } from '@livekit/rtc-node';
5
+ import { WebSocket } from 'ws';
6
+ import { APIError, APIStatusError } from '../_exceptions.js';
7
+ import { AudioByteStream } from '../audio.js';
8
+ import { log } from '../log.js';
9
+ import { createStreamChannel } from '../stream/stream_channel.js';
10
+ import { basic as tokenizeBasic } from '../tokenize/index.js';
11
+ import {
12
+ SynthesizeStream as BaseSynthesizeStream,
13
+ TTS as BaseTTS,
14
+ ChunkedStream,
15
+ } from '../tts/index.js';
16
+ import { type APIConnectOptions, DEFAULT_API_CONNECT_OPTIONS } from '../types.js';
17
+ import { shortuuid } from '../utils.js';
18
+ import {
19
+ type TtsClientEvent,
20
+ type TtsServerEvent,
21
+ type TtsSessionCreateEvent,
22
+ ttsClientEventSchema,
23
+ ttsServerEventSchema,
24
+ } from './api_protos.js';
25
+ import { type AnyModels, connectWs, createAccessToken } from './utils.js';
26
+
27
// Model identifiers accepted by the inference gateway. Each public `*Models` type also accepts
// a `:<voice>` suffix ("provider/model:voice_id"), which the TTS constructor splits off.

type _CartesiaModels = 'cartesia' | 'cartesia/sonic' | 'cartesia/sonic-2' | 'cartesia/sonic-turbo';

export type CartesiaModels = _CartesiaModels | `${_CartesiaModels}:${string}`;

type _ElevenlabsModels =
  | 'elevenlabs'
  | 'elevenlabs/eleven_flash_v2'
  | 'elevenlabs/eleven_flash_v2_5'
  | 'elevenlabs/eleven_turbo_v2'
  | 'elevenlabs/eleven_turbo_v2_5'
  | 'elevenlabs/eleven_multilingual_v2';

export type ElevenlabsModels = _ElevenlabsModels | `${_ElevenlabsModels}:${string}`;

export type _RimeModels = 'rime' | 'rime/mist' | 'rime/mistv2' | 'rime/arcana';

export type RimeModels = _RimeModels | `${_RimeModels}:${string}`;

export type _InworldModels = 'inworld' | 'inworld/inworld-tts-1';

export type InworldModels = _InworldModels | `${_InworldModels}:${string}`;

/** Provider-specific extras for Cartesia models. */
export interface CartesiaOptions {
  duration?: number; // max duration of audio in seconds
  speed?: 'slow' | 'normal' | 'fast'; // default: not specified
}

/** Provider-specific extras for ElevenLabs models. */
export interface ElevenlabsOptions {
  inactivity_timeout?: number; // default: 60
  apply_text_normalization?: 'auto' | 'off' | 'on'; // default: "auto"
}

/** Provider-specific extras for Rime models (none declared). */
export interface RimeOptions {}

/** Provider-specific extras for Inworld models (none declared). */
export interface InworldOptions {}

/** Any known model identifier, or an arbitrary string via `AnyModels`. */
export type TTSModels = CartesiaModels | ElevenlabsModels | RimeModels | InworldModels | AnyModels;

/**
 * Maps a model identifier to the shape of its provider-specific `extraKwargs`.
 *
 * Each arm must test `TModel` against a *model* union. Testing against the empty option
 * interfaces (`RimeOptions` / `InworldOptions`) matches every string, which would make the
 * Inworld arm and the `Record<string, unknown>` fallback unreachable.
 */
export type TTSOptions<TModel extends TTSModels> = TModel extends CartesiaModels
  ? CartesiaOptions
  : TModel extends ElevenlabsModels
    ? ElevenlabsOptions
    : TModel extends RimeModels
      ? RimeOptions
      : TModel extends InworldModels
        ? InworldOptions
        : Record<string, unknown>;

type TTSEncoding = 'pcm_s16le';

const DEFAULT_ENCODING: TTSEncoding = 'pcm_s16le';
const DEFAULT_SAMPLE_RATE = 16000;
const DEFAULT_BASE_URL = 'https://agent-gateway.livekit.cloud/v1';
const NUM_CHANNELS = 1;
const DEFAULT_LANGUAGE = 'en';

/** Fully resolved options held by `TTS` and copied into each `SynthesizeStream`. */
export interface InferenceTTSOptions<TModel extends TTSModels> {
  model?: TModel;
  voice?: string;
  language?: string;
  encoding: TTSEncoding;
  sampleRate: number;
  baseURL: string;
  apiKey: string;
  apiSecret: string;
  extraKwargs: TTSOptions<TModel>;
}
94
+
95
/**
 * Text-to-speech client that talks to the LiveKit inference gateway over a WebSocket.
 *
 * Only streaming synthesis is implemented; `synthesize()` always throws.
 */
export class TTS<TModel extends TTSModels> extends BaseTTS {
  private opts: InferenceTTSOptions<TModel>;
  private streams: Set<SynthesizeStream<TModel>> = new Set();

  #logger = log();

  /**
   * @param opts.model - model identifier; may carry a voice as "provider/model:voice_id"
   * @param opts.voice - voice id; wins over a voice embedded in `model`
   * @param opts.language - defaults to `DEFAULT_LANGUAGE`
   * @param opts.baseURL - falls back to `LIVEKIT_INFERENCE_URL`, then `DEFAULT_BASE_URL`
   * @param opts.encoding - defaults to `DEFAULT_ENCODING`
   * @param opts.sampleRate - defaults to `DEFAULT_SAMPLE_RATE`
   * @param opts.apiKey - falls back to `LIVEKIT_INFERENCE_API_KEY`, then `LIVEKIT_API_KEY`
   * @param opts.apiSecret - falls back to `LIVEKIT_INFERENCE_API_SECRET`, then `LIVEKIT_API_SECRET`
   * @param opts.extraKwargs - provider-specific options sent as `extra` on session creation
   * @throws Error when no API key or API secret can be resolved
   */
  constructor(opts: {
    model: TModel;
    voice?: string;
    language?: string;
    baseURL?: string;
    encoding?: TTSEncoding;
    sampleRate?: number;
    apiKey?: string;
    apiSecret?: string;
    extraKwargs?: TTSOptions<TModel>;
  }) {
    const sampleRate = opts?.sampleRate ?? DEFAULT_SAMPLE_RATE;
    super(sampleRate, NUM_CHANNELS, { streaming: true });

    const {
      model,
      voice,
      language = DEFAULT_LANGUAGE,
      baseURL,
      encoding = DEFAULT_ENCODING,
      apiKey,
      apiSecret,
      extraKwargs = {} as TTSOptions<TModel>,
    } = opts || {};

    const lkBaseURL = baseURL || process.env.LIVEKIT_INFERENCE_URL || DEFAULT_BASE_URL;
    const lkApiKey = apiKey || process.env.LIVEKIT_INFERENCE_API_KEY || process.env.LIVEKIT_API_KEY;
    if (!lkApiKey) {
      throw new Error('apiKey is required: pass apiKey or set LIVEKIT_API_KEY');
    }

    const lkApiSecret =
      apiSecret || process.env.LIVEKIT_INFERENCE_API_SECRET || process.env.LIVEKIT_API_SECRET;
    if (!lkApiSecret) {
      throw new Error('apiSecret is required: pass apiSecret or set LIVEKIT_API_SECRET');
    }

    // read voice id from the model if provided: "provider/model:voice_id"
    let nextModel = model;
    let nextVoice = voice;
    if (typeof nextModel === 'string') {
      const idx = nextModel.lastIndexOf(':');
      if (idx !== -1) {
        const voiceFromModel = nextModel.slice(idx + 1);
        if (nextVoice && nextVoice !== voiceFromModel) {
          // Context object goes first so the logger records it as structured fields.
          this.#logger.warn(
            { voice: nextVoice, model: nextModel },
            '`voice` is provided via both argument and model, using the one from the argument',
          );
        } else {
          nextVoice = voiceFromModel;
        }
        // The voice suffix is always stripped from the model, whichever voice won.
        nextModel = nextModel.slice(0, idx) as TModel;
      }
    }

    this.opts = {
      model: nextModel,
      voice: nextVoice,
      language,
      encoding,
      sampleRate,
      baseURL: lkBaseURL,
      apiKey: lkApiKey,
      apiSecret: lkApiSecret,
      extraKwargs,
    };
  }

  get label() {
    return 'inference.TTS';
  }

  /** Merges `opts` into this instance's options and forwards them to every tracked stream. */
  updateOptions(opts: Partial<Pick<InferenceTTSOptions<TModel>, 'model' | 'voice' | 'language'>>) {
    this.opts = { ...this.opts, ...opts };
    for (const stream of this.streams) {
      stream.updateOptions(opts);
    }
  }

  /** Non-streaming synthesis is not supported by this implementation. */
  synthesize(_: string): ChunkedStream {
    throw new Error('ChunkedStream is not implemented');
  }

  /**
   * Creates a streaming synthesis session using a snapshot of the current options.
   *
   * NOTE(review): streams are only removed from `this.streams` by `close()`, so the set grows
   * with every call for the lifetime of this instance — confirm that is acceptable.
   */
  stream(options?: { connOptions?: APIConnectOptions }): SynthesizeStream<TModel> {
    const { connOptions = DEFAULT_API_CONNECT_OPTIONS } = options || {};
    const stream = new SynthesizeStream(this, { ...this.opts }, connOptions);
    this.streams.add(stream);
    return stream;
  }

  /**
   * Opens an authenticated WebSocket to `<baseURL>/tts` and sends the `session.create` event.
   *
   * @param timeout - forwarded to `connectWs` as its connection timeout
   * @returns the connected socket, with the session-create message already sent
   */
  async connectWs(timeout: number): Promise<WebSocket> {
    let baseURL = this.opts.baseURL;
    if (baseURL.startsWith('http://') || baseURL.startsWith('https://')) {
      // http -> ws, https -> wss
      baseURL = baseURL.replace(/^http/, 'ws');
    }

    const token = await createAccessToken(this.opts.apiKey, this.opts.apiSecret);
    const url = `${baseURL}/tts`;
    const headers = { Authorization: `Bearer ${token}` } as Record<string, string>;

    const params = {
      type: 'session.create',
      sample_rate: String(this.opts.sampleRate),
      encoding: this.opts.encoding,
      extra: this.opts.extraKwargs,
    } as TtsSessionCreateEvent;

    // Optional fields are only included when set.
    if (this.opts.voice) params.voice = this.opts.voice;
    if (this.opts.model) params.model = this.opts.model;
    if (this.opts.language) params.language = this.opts.language;

    const socket = await connectWs(url, headers, timeout);
    socket.send(JSON.stringify(params));
    return socket;
  }

  /** Starts closing `ws`. Kept async so callers can continue to await it. */
  async closeWs(ws: WebSocket) {
    ws.close();
  }

  /** Closes every tracked stream, one after another, then forgets them. */
  async close() {
    for (const stream of this.streams) {
      await stream.close();
    }
    this.streams.clear();
  }
}
229
+
230
/**
 * Streaming synthesis session created by `TTS.stream()`.
 *
 * `run()` opens one gateway WebSocket, sends it sentence-tokenized input text, and turns the
 * `output_audio` events it receives into frames placed on `this.queue`.
 */
export class SynthesizeStream<TModel extends TTSModels> extends BaseSynthesizeStream {
  private opts: InferenceTTSOptions<TModel>;
  private tts: TTS<TModel>;
  private connOptions: APIConnectOptions;

  #logger = log();

  constructor(tts: TTS<TModel>, opts: InferenceTTSOptions<TModel>, connOptions: APIConnectOptions) {
    super(tts, connOptions);
    this.opts = opts;
    this.tts = tts;
    this.connOptions = connOptions;
  }

  get label() {
    return 'inference.SynthesizeStream';
  }

  /** Merges `opts` into this stream's own copy of the options. */
  updateOptions(opts: Partial<Pick<InferenceTTSOptions<TModel>, 'model' | 'voice' | 'language'>>) {
    this.opts = { ...this.opts, ...opts };
  }

  /**
   * Drives one synthesis session: connects, then runs the input, sentence, socket-listener and
   * receive tasks concurrently. Failures are logged rather than rethrown, and
   * `resourceCleanup()` always runs at the end.
   */
  protected async run(): Promise<void> {
    let ws: WebSocket | null = null;
    let closing = false;
    let finalReceived = false;
    let lastFrame: AudioFrame | undefined;

    const sendTokenizerStream = new tokenizeBasic.SentenceTokenizer().stream();
    const eventChannel = createStreamChannel<TtsServerEvent>();
    const requestId = shortuuid('tts_request_');

    // Idempotent teardown of everything this run owns.
    // NOTE(review): this removes the socket listeners, so when it runs before the socket's
    // 'close' event (e.g. on 'session.closed') the listener task below can only settle through
    // the abort signal — confirm that is intended.
    const resourceCleanup = () => {
      if (closing) return;
      closing = true;
      sendTokenizerStream.close();
      eventChannel.close();
      ws?.removeAllListeners();
      ws?.close();
    };

    // Validates `event` against the client schema, then sends it if the socket is open.
    const sendClientEvent = async (event: TtsClientEvent) => {
      const validatedEvent = await ttsClientEventSchema.parseAsync(event);
      if (!ws || ws.readyState !== WebSocket.OPEN) {
        this.#logger.warn('Trying to send client TTS event to a closed WebSocket');
        return;
      }
      ws.send(JSON.stringify(validatedEvent));
    };

    // Frames are emitted one step behind so the last one can be flagged `final`.
    const sendLastFrame = (segmentId: string, final: boolean) => {
      if (lastFrame) {
        this.queue.put({ requestId, segmentId, frame: lastFrame, final });
        lastFrame = undefined;
      }
    };

    // Feeds caller input into the sentence tokenizer.
    const createInputTask = async () => {
      for await (const data of this.input) {
        if (this.abortController.signal.aborted) break;
        if (data === SynthesizeStream.FLUSH_SENTINEL) {
          sendTokenizerStream.flush();
          continue;
        }
        sendTokenizerStream.pushText(data);
      }
      sendTokenizerStream.endInput();
    };

    // Sends each tokenized sentence to the gateway, then asks it to flush.
    // The sends are awaited so a validation failure rejects this task (and is handled by the
    // try/catch below) instead of surfacing as an unhandled promise rejection.
    const createSentenceStreamTask = async () => {
      for await (const ev of sendTokenizerStream) {
        if (this.abortController.signal.aborted) break;

        await sendClientEvent({
          type: 'input_transcript',
          transcript: ev.token + ' ',
        });
      }

      await sendClientEvent({ type: 'session.flush' });
    };

    // Bridges socket events into `eventChannel`; settles when the socket closes, errors, or
    // the stream is aborted.
    const createWsListenerTask = async (socket: WebSocket) => {
      return new Promise<void>((resolve, reject) => {
        const signal = this.abortController.signal;
        const onAbort = () => {
          resourceCleanup();
          reject(new Error('WebSocket connection aborted'));
        };

        // An already-aborted signal never dispatches 'abort' again, so handle it up front.
        if (signal.aborted) {
          onAbort();
          return;
        }
        signal.addEventListener('abort', onAbort, { once: true });

        socket.on('message', async (data) => {
          try {
            const eventJson = JSON.parse(data.toString()) as Record<string, unknown>;
            const validatedEvent = ttsServerEventSchema.parse(eventJson);
            // Awaited so that a failed write is caught here; harmless if `write` is synchronous.
            await eventChannel.write(validatedEvent);
          } catch (e) {
            // A throw inside this async handler would otherwise be an unhandled rejection.
            this.#logger.error({ error: e }, 'Failed to process TTS server event');
            resourceCleanup();
            reject(e);
          }
        });

        socket.on('error', (e) => {
          this.#logger.error({ error: e }, 'WebSocket error');
          resourceCleanup();
          reject(e);
        });

        socket.on('close', () => {
          resourceCleanup();

          // A close after the 'done' event is the normal end of a session.
          if (finalReceived) {
            resolve();
            return;
          }

          reject(
            new APIStatusError({
              message: 'Gateway connection closed unexpectedly',
              options: { requestId },
            }),
          );
        });
      });
    };

    // Consumes server events and turns audio payloads into queued frames.
    const createRecvTask = async () => {
      let currentSessionId: string | null = null;

      const bstream = new AudioByteStream(this.opts.sampleRate, NUM_CHANNELS);
      const serverEventStream = eventChannel.stream();
      const reader = serverEventStream.getReader();

      try {
        while (!this.closed && !this.abortController.signal.aborted) {
          const result = await reader.read();
          if (this.abortController.signal.aborted) return;
          if (result.done) return;

          const serverEvent = result.value;
          switch (serverEvent.type) {
            case 'session.created':
              currentSessionId = serverEvent.session_id;
              break;
            case 'output_audio': {
              // Copied into a fresh typed array, presumably so `.buffer` spans exactly the
              // decoded bytes.
              const audioBytes = new Int8Array(Buffer.from(serverEvent.audio, 'base64'));
              for (const frame of bstream.write(audioBytes.buffer)) {
                sendLastFrame(currentSessionId!, false);
                lastFrame = frame;
              }
              break;
            }
            case 'done':
              finalReceived = true;
              for (const frame of bstream.flush()) {
                sendLastFrame(currentSessionId!, false);
                lastFrame = frame;
              }
              sendLastFrame(currentSessionId!, true);
              this.queue.put(SynthesizeStream.END_OF_STREAM);
              break;
            case 'session.closed':
              resourceCleanup();
              break;
            case 'error':
              this.#logger.error(
                { serverEvent },
                'Received error message from LiveKit TTS WebSocket',
              );
              resourceCleanup();
              throw new APIError(`LiveKit TTS returned error: ${serverEvent.message}`);
            default:
              this.#logger.warn({ serverEvent }, 'Unexpected message');
              break;
          }
        }
      } finally {
        reader.releaseLock();
        try {
          await serverEventStream.cancel();
        } catch (e) {
          this.#logger.debug(
            { error: e },
            'Error cancelling serverEventStream (may already be cancelled):',
          );
        }
      }
    };

    try {
      ws = await this.tts.connectWs(this.connOptions.timeoutMs);

      await Promise.all([
        createInputTask(),
        createSentenceStreamTask(),
        createWsListenerTask(ws),
        createRecvTask(),
      ]);
    } catch (e) {
      this.#logger.error({ error: e }, 'Error in SynthesizeStream');
    } finally {
      resourceCleanup();
    }
  }
}
@@ -0,0 +1,66 @@
1
+ // SPDX-FileCopyrightText: 2025 LiveKit, Inc.
2
+ //
3
+ // SPDX-License-Identifier: Apache-2.0
4
+ import { AccessToken } from 'livekit-server-sdk';
5
+ import { WebSocket } from 'ws';
6
+ import { APIConnectionError, APIStatusError } from '../index.js';
7
+
8
+ export type AnyModels = string & NonNullable<unknown>;
9
+
10
/**
 * Builds a signed JWT carrying an inference grant.
 *
 * @param apiKey - API key passed to `AccessToken`
 * @param apiSecret - API secret passed to `AccessToken`
 * @param ttl - token lifetime forwarded to `AccessToken` (presumably seconds — confirm against
 *   livekit-server-sdk); defaults to 600
 * @returns the serialized JWT
 */
export async function createAccessToken(
  apiKey: string,
  apiSecret: string,
  ttl: number = 600,
): Promise<string> {
  const tokenOptions = { identity: 'agent', ttl };
  const accessToken = new AccessToken(apiKey, apiSecret, tokenOptions);
  accessToken.addInferenceGrant({ perform: true });

  const jwt = await accessToken.toJwt();
  return jwt;
}
20
+
21
/**
 * Opens a WebSocket to `url` and resolves with it once the connection is open.
 *
 * The returned promise always settles exactly once: on open, on the first error, on any close
 * that happens before open, or when `timeoutMs` elapses.
 *
 * @param url - WebSocket URL to connect to
 * @param headers - HTTP headers sent with the handshake request
 * @param timeoutMs - how long to wait for the connection to open
 * @returns the open socket
 * @throws APIStatusError (statusCode 429) when the error carries `code === 429`
 * @throws APIConnectionError on timeout, on any other error, or on close before open
 */
export async function connectWs(
  url: string,
  headers: Record<string, string>,
  timeoutMs: number,
): Promise<WebSocket> {
  return new Promise<WebSocket>((resolve, reject) => {
    const socket = new WebSocket(url, { headers });
    let settled = false;

    // Marks the attempt finished; returns false if another handler already settled it.
    // The 'error' listener is deliberately left attached: it guards against an 'error' emitted
    // before the caller installs its own handler (including the one `terminate()` triggers on
    // a socket that is still connecting).
    const settle = (): boolean => {
      if (settled) return false;
      settled = true;
      clearTimeout(timer);
      socket.off('open', onOpen);
      socket.off('close', onClose);
      return true;
    };

    const onOpen = () => {
      if (settle()) resolve(socket);
    };

    const onError = (err: unknown) => {
      if (!settle()) return;
      if (err && typeof err === 'object' && 'code' in err && (err as any).code === 429) {
        reject(
          new APIStatusError({
            message: 'LiveKit gateway quota exceeded',
            options: { statusCode: 429 },
          }),
        );
      } else {
        reject(new APIConnectionError({ message: 'Error connecting to LiveKit WebSocket' }));
      }
    };

    // Any close seen here happened before 'open', so the connection attempt failed whatever
    // the close code is. Leaving the promise pending would hang the caller.
    const onClose = () => {
      if (settle()) {
        reject(
          new APIConnectionError({
            message: 'Connection closed unexpectedly',
          }),
        );
      }
    };

    const timer = setTimeout(() => {
      if (!settle()) return;
      reject(new APIConnectionError({ message: 'Timeout connecting to LiveKit WebSocket' }));
      // Abandon the half-open attempt so the socket does not linger or connect later.
      socket.terminate();
    }, timeoutMs);

    socket.once('open', onOpen);
    socket.once('error', onError);
    socket.once('close', onClose);
  });
}
package/src/tts/tts.ts CHANGED
@@ -443,7 +443,7 @@ export abstract class ChunkedStream implements AsyncIterableIterator<Synthesized
443
443
  for await (const audio of this.queue) {
444
444
  this.output.put(audio);
445
445
  requestId = audio.requestId;
446
- if (!ttfb) {
446
+ if (ttfb === BigInt(-1)) {
447
447
  ttfb = process.hrtime.bigint() - startTime;
448
448
  }
449
449
  audioDuration += audio.frame.samplesPerChannel / audio.frame.sampleRate;
package/src/utils.ts CHANGED
@@ -817,3 +817,14 @@ export async function waitForTrackPublication({
817
817
  room.off(RoomEvent.TrackPublished, onTrackPublished);
818
818
  }
819
819
  }
820
+
821
/**
 * Resolves once `signal` is aborted.
 *
 * Resolves immediately when the signal is already aborted; an aborted signal does not dispatch
 * another 'abort' event, so waiting for one would never finish.
 *
 * @param signal - the abort signal to wait on
 */
export async function waitForAbort(signal: AbortSignal) {
  if (signal.aborted) return;

  await new Promise<void>((resolve) => {
    // `once: true` detaches the listener after it fires.
    signal.addEventListener('abort', () => resolve(), { once: true });
  });
}
@@ -4,6 +4,14 @@
4
4
  import type { AudioFrame } from '@livekit/rtc-node';
5
5
  import { AsyncLocalStorage } from 'node:async_hooks';
6
6
  import { ReadableStream } from 'node:stream/web';
7
+ import {
8
+ LLM as InferenceLLM,
9
+ STT as InferenceSTT,
10
+ TTS as InferenceTTS,
11
+ type LLMModels,
12
+ type STTModels,
13
+ type TTSModels,
14
+ } from '../inference/index.js';
7
15
  import { ReadonlyChatContext } from '../llm/chat_context.js';
8
16
  import type { ChatMessage, FunctionCall, RealtimeModel } from '../llm/index.js';
9
17
  import {
@@ -55,10 +63,10 @@ export interface AgentOptions<UserData> {
55
63
  chatCtx?: ChatContext;
56
64
  tools?: ToolContext<UserData>;
57
65
  turnDetection?: TurnDetectionMode;
58
- stt?: STT;
66
+ stt?: STT | STTModels;
59
67
  vad?: VAD;
60
- llm?: LLM | RealtimeModel;
61
- tts?: TTS;
68
+ llm?: LLM | RealtimeModel | LLMModels;
69
+ tts?: TTS | TTSModels;
62
70
  allowInterruptions?: boolean;
63
71
  minConsecutiveSpeechDelay?: number;
64
72
  }
@@ -101,10 +109,26 @@ export class Agent<UserData = any> {
101
109
  : ChatContext.empty();
102
110
 
103
111
  this.turnDetection = turnDetection;
104
- this._stt = stt;
105
112
  this._vad = vad;
106
- this._llm = llm;
107
- this._tts = tts;
113
+
114
+ if (typeof stt === 'string') {
115
+ this._stt = new InferenceSTT({ model: stt });
116
+ } else {
117
+ this._stt = stt;
118
+ }
119
+
120
+ if (typeof llm === 'string') {
121
+ this._llm = new InferenceLLM({ model: llm });
122
+ } else {
123
+ this._llm = llm;
124
+ }
125
+
126
+ if (typeof tts === 'string') {
127
+ this._tts = new InferenceTTS({ model: tts });
128
+ } else {
129
+ this._tts = tts;
130
+ }
131
+
108
132
  this._agentActivity = undefined;
109
133
  }
110
134
 
@@ -5,6 +5,14 @@ import type { AudioFrame, Room } from '@livekit/rtc-node';
5
5
  import type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';
6
6
  import { EventEmitter } from 'node:events';
7
7
  import type { ReadableStream } from 'node:stream/web';
8
+ import {
9
+ LLM as InferenceLLM,
10
+ STT as InferenceSTT,
11
+ TTS as InferenceTTS,
12
+ type LLMModels,
13
+ type STTModels,
14
+ type TTSModels,
15
+ } from '../inference/index.js';
8
16
  import { getJobContext } from '../job.js';
9
17
  import { ChatContext, ChatMessage } from '../llm/chat_context.js';
10
18
  import type { LLM, RealtimeModel, RealtimeModelError, ToolChoice } from '../llm/index.js';
@@ -77,10 +85,10 @@ export type AgentSessionCallbacks = {
77
85
 
78
86
  export type AgentSessionOptions<UserData = UnknownUserData> = {
79
87
  turnDetection?: TurnDetectionMode;
80
- stt?: STT;
88
+ stt?: STT | STTModels;
81
89
  vad?: VAD;
82
- llm?: LLM | RealtimeModel;
83
- tts?: TTS;
90
+ llm?: LLM | RealtimeModel | LLMModels;
91
+ tts?: TTS | TTSModels;
84
92
  userData?: UserData;
85
93
  voiceOptions?: Partial<VoiceOptions>;
86
94
  };
@@ -128,9 +136,24 @@ export class AgentSession<
128
136
  } = opts;
129
137
 
130
138
  this.vad = vad;
131
- this.stt = stt;
132
- this.llm = llm;
133
- this.tts = tts;
139
+
140
+ if (typeof stt === 'string') {
141
+ this.stt = new InferenceSTT({ model: stt });
142
+ } else {
143
+ this.stt = stt;
144
+ }
145
+
146
+ if (typeof llm === 'string') {
147
+ this.llm = new InferenceLLM({ model: llm });
148
+ } else {
149
+ this.llm = llm;
150
+ }
151
+
152
+ if (typeof tts === 'string') {
153
+ this.tts = new InferenceTTS({ model: tts });
154
+ } else {
155
+ this.tts = tts;
156
+ }
134
157
  this.turnDetection = turnDetection;
135
158
  this._userData = userData;
136
159
 
@@ -1,8 +1,8 @@
1
1
  // SPDX-FileCopyrightText: 2025 LiveKit, Inc.
2
2
  //
3
3
  // SPDX-License-Identifier: Apache-2.0
4
- import type { AudioFrame } from '@livekit/rtc-node';
5
4
  import {
5
+ AudioFrame,
6
6
  AudioStream,
7
7
  type NoiseCancellationOptions,
8
8
  RemoteParticipant,
@@ -66,6 +66,17 @@ export class ParticipantAudioInputStream extends AudioInput {
66
66
  ? participant
67
67
  : this.room.remoteParticipants.get(participantIdentity);
68
68
 
69
+ // Convert Map iterator to array for Pino serialization
70
+ const trackPublicationsArray = Array.from(participantValue?.trackPublications.values() ?? []);
71
+
72
+ this.logger.info(
73
+ {
74
+ participantValue: participantValue?.identity,
75
+ trackPublications: trackPublicationsArray,
76
+ lengthOfTrackPublications: trackPublicationsArray.length,
77
+ },
78
+ 'participantValue.trackPublications',
79
+ );
69
80
  // We need to check if the participant has a microphone track and subscribe to it
70
81
  // in case we miss the tracksubscribed event
71
82
  if (participantValue) {
package/src/worker.ts CHANGED
@@ -1,12 +1,7 @@
1
1
  // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
2
2
  //
3
3
  // SPDX-License-Identifier: Apache-2.0
4
- import type {
5
- JobAssignment,
6
- JobTermination,
7
- ParticipantInfo,
8
- TrackSource,
9
- } from '@livekit/protocol';
4
+ import type { JobAssignment, JobTermination, TrackSource } from '@livekit/protocol';
10
5
  import {
11
6
  type AvailabilityRequest,
12
7
  JobType,
@@ -15,7 +10,7 @@ import {
15
10
  WorkerMessage,
16
11
  WorkerStatus,
17
12
  } from '@livekit/protocol';
18
- import { AccessToken, RoomServiceClient } from 'livekit-server-sdk';
13
+ import { AccessToken, ParticipantInfo, RoomServiceClient } from 'livekit-server-sdk';
19
14
  import { EventEmitter } from 'node:events';
20
15
  import os from 'node:os';
21
16
  import { WebSocket } from 'ws';