@livekit/agents 1.0.6 → 1.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (149)
  1. package/dist/index.cjs +3 -0
  2. package/dist/index.cjs.map +1 -1
  3. package/dist/index.d.cts +2 -1
  4. package/dist/index.d.ts +2 -1
  5. package/dist/index.d.ts.map +1 -1
  6. package/dist/index.js +2 -0
  7. package/dist/index.js.map +1 -1
  8. package/dist/inference/api_protos.cjs +104 -0
  9. package/dist/inference/api_protos.cjs.map +1 -0
  10. package/dist/inference/api_protos.d.cts +222 -0
  11. package/dist/inference/api_protos.d.ts +222 -0
  12. package/dist/inference/api_protos.d.ts.map +1 -0
  13. package/dist/inference/api_protos.js +70 -0
  14. package/dist/inference/api_protos.js.map +1 -0
  15. package/dist/inference/index.cjs +56 -0
  16. package/dist/inference/index.cjs.map +1 -0
  17. package/dist/inference/index.d.cts +8 -0
  18. package/dist/inference/index.d.ts +8 -0
  19. package/dist/inference/index.d.ts.map +1 -0
  20. package/dist/inference/index.js +23 -0
  21. package/dist/inference/index.js.map +1 -0
  22. package/dist/inference/llm.cjs +301 -0
  23. package/dist/inference/llm.cjs.map +1 -0
  24. package/dist/inference/llm.d.cts +107 -0
  25. package/dist/inference/llm.d.ts +107 -0
  26. package/dist/inference/llm.d.ts.map +1 -0
  27. package/dist/inference/llm.js +272 -0
  28. package/dist/inference/llm.js.map +1 -0
  29. package/dist/inference/stt.cjs +313 -0
  30. package/dist/inference/stt.cjs.map +1 -0
  31. package/dist/inference/stt.d.cts +87 -0
  32. package/dist/inference/stt.d.ts +87 -0
  33. package/dist/inference/stt.d.ts.map +1 -0
  34. package/dist/inference/stt.js +292 -0
  35. package/dist/inference/stt.js.map +1 -0
  36. package/dist/inference/tts.cjs +324 -0
  37. package/dist/inference/tts.cjs.map +1 -0
  38. package/dist/inference/tts.d.cts +77 -0
  39. package/dist/inference/tts.d.ts +77 -0
  40. package/dist/inference/tts.d.ts.map +1 -0
  41. package/dist/inference/tts.js +306 -0
  42. package/dist/inference/tts.js.map +1 -0
  43. package/dist/inference/utils.cjs +76 -0
  44. package/dist/inference/utils.cjs.map +1 -0
  45. package/dist/inference/utils.d.cts +5 -0
  46. package/dist/inference/utils.d.ts +5 -0
  47. package/dist/inference/utils.d.ts.map +1 -0
  48. package/dist/inference/utils.js +51 -0
  49. package/dist/inference/utils.js.map +1 -0
  50. package/dist/llm/remote_chat_context.cjs.map +1 -1
  51. package/dist/llm/remote_chat_context.d.cts +2 -0
  52. package/dist/llm/remote_chat_context.d.ts +2 -0
  53. package/dist/llm/remote_chat_context.d.ts.map +1 -1
  54. package/dist/llm/remote_chat_context.js.map +1 -1
  55. package/dist/tts/tts.cjs +1 -1
  56. package/dist/tts/tts.cjs.map +1 -1
  57. package/dist/tts/tts.js +1 -1
  58. package/dist/tts/tts.js.map +1 -1
  59. package/dist/utils.cjs +11 -0
  60. package/dist/utils.cjs.map +1 -1
  61. package/dist/utils.d.cts +1 -0
  62. package/dist/utils.d.ts +1 -0
  63. package/dist/utils.d.ts.map +1 -1
  64. package/dist/utils.js +10 -0
  65. package/dist/utils.js.map +1 -1
  66. package/dist/voice/agent.cjs +16 -3
  67. package/dist/voice/agent.cjs.map +1 -1
  68. package/dist/voice/agent.d.cts +5 -3
  69. package/dist/voice/agent.d.ts +5 -3
  70. package/dist/voice/agent.d.ts.map +1 -1
  71. package/dist/voice/agent.js +20 -3
  72. package/dist/voice/agent.js.map +1 -1
  73. package/dist/voice/agent_activity.cjs +4 -2
  74. package/dist/voice/agent_activity.cjs.map +1 -1
  75. package/dist/voice/agent_activity.d.ts.map +1 -1
  76. package/dist/voice/agent_activity.js +4 -2
  77. package/dist/voice/agent_activity.js.map +1 -1
  78. package/dist/voice/agent_session.cjs +16 -3
  79. package/dist/voice/agent_session.cjs.map +1 -1
  80. package/dist/voice/agent_session.d.cts +4 -3
  81. package/dist/voice/agent_session.d.ts +4 -3
  82. package/dist/voice/agent_session.d.ts.map +1 -1
  83. package/dist/voice/agent_session.js +20 -3
  84. package/dist/voice/agent_session.js.map +1 -1
  85. package/dist/voice/events.cjs +2 -0
  86. package/dist/voice/events.cjs.map +1 -1
  87. package/dist/voice/events.d.cts +4 -1
  88. package/dist/voice/events.d.ts +4 -1
  89. package/dist/voice/events.d.ts.map +1 -1
  90. package/dist/voice/events.js +2 -0
  91. package/dist/voice/events.js.map +1 -1
  92. package/dist/voice/generation.cjs.map +1 -1
  93. package/dist/voice/generation.d.cts +1 -0
  94. package/dist/voice/generation.d.ts +1 -0
  95. package/dist/voice/generation.d.ts.map +1 -1
  96. package/dist/voice/generation.js.map +1 -1
  97. package/dist/voice/room_io/_input.cjs.map +1 -1
  98. package/dist/voice/room_io/_input.d.ts.map +1 -1
  99. package/dist/voice/room_io/_input.js +1 -0
  100. package/dist/voice/room_io/_input.js.map +1 -1
  101. package/dist/voice/room_io/_output.cjs +1 -1
  102. package/dist/voice/room_io/_output.cjs.map +1 -1
  103. package/dist/voice/room_io/_output.d.cts +1 -0
  104. package/dist/voice/room_io/_output.d.ts +1 -0
  105. package/dist/voice/room_io/_output.d.ts.map +1 -1
  106. package/dist/voice/room_io/_output.js +1 -1
  107. package/dist/voice/room_io/_output.js.map +1 -1
  108. package/dist/voice/room_io/room_io.cjs +1 -1
  109. package/dist/voice/room_io/room_io.cjs.map +1 -1
  110. package/dist/voice/room_io/room_io.d.cts +20 -0
  111. package/dist/voice/room_io/room_io.d.ts +20 -0
  112. package/dist/voice/room_io/room_io.d.ts.map +1 -1
  113. package/dist/voice/room_io/room_io.js +1 -1
  114. package/dist/voice/room_io/room_io.js.map +1 -1
  115. package/dist/voice/transcription/synchronizer.cjs +1 -1
  116. package/dist/voice/transcription/synchronizer.cjs.map +1 -1
  117. package/dist/voice/transcription/synchronizer.d.cts +1 -0
  118. package/dist/voice/transcription/synchronizer.d.ts +1 -0
  119. package/dist/voice/transcription/synchronizer.d.ts.map +1 -1
  120. package/dist/voice/transcription/synchronizer.js +1 -1
  121. package/dist/voice/transcription/synchronizer.js.map +1 -1
  122. package/dist/worker.cjs +3 -3
  123. package/dist/worker.cjs.map +1 -1
  124. package/dist/worker.d.cts +3 -0
  125. package/dist/worker.d.ts +3 -0
  126. package/dist/worker.d.ts.map +1 -1
  127. package/dist/worker.js +4 -4
  128. package/dist/worker.js.map +1 -1
  129. package/package.json +3 -2
  130. package/src/index.ts +2 -1
  131. package/src/inference/api_protos.ts +82 -0
  132. package/src/inference/index.ts +32 -0
  133. package/src/inference/llm.ts +464 -0
  134. package/src/inference/stt.ts +444 -0
  135. package/src/inference/tts.ts +432 -0
  136. package/src/inference/utils.ts +66 -0
  137. package/src/llm/remote_chat_context.ts +2 -2
  138. package/src/tts/tts.ts +1 -1
  139. package/src/utils.ts +11 -0
  140. package/src/voice/agent.ts +31 -7
  141. package/src/voice/agent_activity.ts +2 -0
  142. package/src/voice/agent_session.ts +30 -6
  143. package/src/voice/events.ts +6 -0
  144. package/src/voice/generation.ts +1 -1
  145. package/src/voice/room_io/_input.ts +1 -1
  146. package/src/voice/room_io/_output.ts +1 -1
  147. package/src/voice/room_io/room_io.ts +21 -2
  148. package/src/voice/transcription/synchronizer.ts +1 -1
  149. package/src/worker.ts +5 -10
@@ -0,0 +1,432 @@
1
+ // SPDX-FileCopyrightText: 2025 LiveKit, Inc.
2
+ //
3
+ // SPDX-License-Identifier: Apache-2.0
4
+ import type { AudioFrame } from '@livekit/rtc-node';
5
+ import { WebSocket } from 'ws';
6
+ import { APIError, APIStatusError } from '../_exceptions.js';
7
+ import { AudioByteStream } from '../audio.js';
8
+ import { log } from '../log.js';
9
+ import { createStreamChannel } from '../stream/stream_channel.js';
10
+ import { basic as tokenizeBasic } from '../tokenize/index.js';
11
+ import {
12
+ SynthesizeStream as BaseSynthesizeStream,
13
+ TTS as BaseTTS,
14
+ ChunkedStream,
15
+ } from '../tts/index.js';
16
+ import { type APIConnectOptions, DEFAULT_API_CONNECT_OPTIONS } from '../types.js';
17
+ import { shortuuid } from '../utils.js';
18
+ import {
19
+ type TtsClientEvent,
20
+ type TtsServerEvent,
21
+ type TtsSessionCreateEvent,
22
+ ttsClientEventSchema,
23
+ ttsServerEventSchema,
24
+ } from './api_protos.js';
25
+ import { type AnyString, connectWs, createAccessToken } from './utils.js';
26
+
27
/** Cartesia model identifiers accepted by the gateway (bare provider or provider/model). */
export type CartesiaModels =
  | 'cartesia'
  | 'cartesia/sonic'
  | 'cartesia/sonic-2'
  | 'cartesia/sonic-turbo';

/** ElevenLabs model identifiers accepted by the gateway. */
export type ElevenlabsModels =
  | 'elevenlabs'
  | 'elevenlabs/eleven_flash_v2'
  | 'elevenlabs/eleven_flash_v2_5'
  | 'elevenlabs/eleven_turbo_v2'
  | 'elevenlabs/eleven_turbo_v2_5'
  | 'elevenlabs/eleven_multilingual_v2';

/** Rime model identifiers accepted by the gateway. */
export type RimeModels = 'rime' | 'rime/mist' | 'rime/mistv2' | 'rime/arcana';

/** Inworld model identifiers accepted by the gateway. */
export type InworldModels = 'inworld' | 'inworld/inworld-tts-1';

/** Provider-specific options for Cartesia models. */
export interface CartesiaOptions {
  duration?: number; // max duration of audio in seconds
  speed?: 'slow' | 'normal' | 'fast'; // default: not specified
}

/** Provider-specific options for ElevenLabs models. */
export interface ElevenlabsOptions {
  inactivity_timeout?: number; // default: 60
  apply_text_normalization?: 'auto' | 'off' | 'on'; // default: "auto"
}

/** Provider-specific options for Rime models (none defined yet). */
export interface RimeOptions {}

/** Provider-specific options for Inworld models (none defined yet). */
export interface InworldOptions {}

// Closed union of the known model strings (no AnyString escape hatch);
// used to build the template-literal `model:voice` form below.
type _TTSModels = CartesiaModels | ElevenlabsModels | RimeModels | InworldModels;

/** Any known model string, or an arbitrary string (autocomplete preserved via AnyString). */
export type TTSModels = CartesiaModels | ElevenlabsModels | RimeModels | InworldModels | AnyString;

/** A model string optionally suffixed with a voice id: "provider/model:voice_id". */
export type ModelWithVoice = `${_TTSModels}:${string}` | TTSModels;
64
+
65
+ export type TTSOptions<TModel extends TTSModels> = TModel extends CartesiaModels
66
+ ? CartesiaOptions
67
+ : TModel extends ElevenlabsModels
68
+ ? ElevenlabsOptions
69
+ : TModel extends RimeOptions
70
+ ? RimeOptions
71
+ : TModel extends InworldOptions
72
+ ? InworldOptions
73
+ : Record<string, unknown>;
74
+
75
// Only raw 16-bit little-endian PCM is supported by the gateway today.
type TTSEncoding = 'pcm_s16le';

const DEFAULT_ENCODING: TTSEncoding = 'pcm_s16le';
const DEFAULT_SAMPLE_RATE = 16000;
const DEFAULT_BASE_URL = 'https://agent-gateway.livekit.cloud/v1';
const NUM_CHANNELS = 1; // gateway streams mono audio
const DEFAULT_LANGUAGE = 'en';

/**
 * Fully-resolved configuration held by the TTS instance and passed to each
 * SynthesizeStream. Unlike the constructor options, credentials and transport
 * fields here are required (they have been resolved from args/env by then).
 */
export interface InferenceTTSOptions<TModel extends TTSModels> {
  model?: TModel;
  voice?: string;
  language?: string;
  encoding: TTSEncoding;
  sampleRate: number;
  baseURL: string;
  apiKey: string;
  apiSecret: string;
  modelOptions: TTSOptions<TModel>;
}
94
+
95
+ /**
96
+ * Livekit Cloud Inference TTS
97
+ */
98
+ export class TTS<TModel extends TTSModels> extends BaseTTS {
99
+ private opts: InferenceTTSOptions<TModel>;
100
+ private streams: Set<SynthesizeStream<TModel>> = new Set();
101
+
102
+ #logger = log();
103
+
104
+ constructor(opts: {
105
+ model: TModel;
106
+ voice?: string;
107
+ language?: string;
108
+ baseURL?: string;
109
+ encoding?: TTSEncoding;
110
+ sampleRate?: number;
111
+ apiKey?: string;
112
+ apiSecret?: string;
113
+ modelOptions?: TTSOptions<TModel>;
114
+ }) {
115
+ const sampleRate = opts?.sampleRate ?? DEFAULT_SAMPLE_RATE;
116
+ super(sampleRate, 1, { streaming: true });
117
+
118
+ const {
119
+ model,
120
+ voice,
121
+ language = DEFAULT_LANGUAGE,
122
+ baseURL,
123
+ encoding = DEFAULT_ENCODING,
124
+ apiKey,
125
+ apiSecret,
126
+ modelOptions = {} as TTSOptions<TModel>,
127
+ } = opts || {};
128
+
129
+ const lkBaseURL = baseURL || process.env.LIVEKIT_INFERENCE_URL || DEFAULT_BASE_URL;
130
+ const lkApiKey = apiKey || process.env.LIVEKIT_INFERENCE_API_KEY || process.env.LIVEKIT_API_KEY;
131
+ if (!lkApiKey) {
132
+ throw new Error('apiKey is required: pass apiKey or set LIVEKIT_API_KEY');
133
+ }
134
+
135
+ const lkApiSecret =
136
+ apiSecret || process.env.LIVEKIT_INFERENCE_API_SECRET || process.env.LIVEKIT_API_SECRET;
137
+ if (!lkApiSecret) {
138
+ throw new Error('apiSecret is required: pass apiSecret or set LIVEKIT_API_SECRET');
139
+ }
140
+
141
+ // read voice id from the model if provided: "provider/model:voice_id"
142
+ let nextModel = model;
143
+ let nextVoice = voice;
144
+ if (typeof nextModel === 'string') {
145
+ const idx = nextModel.lastIndexOf(':');
146
+ if (idx !== -1) {
147
+ const voiceFromModel = nextModel.slice(idx + 1);
148
+ if (nextVoice && nextVoice !== voiceFromModel) {
149
+ this.#logger.warn(
150
+ '`voice` is provided via both argument and model, using the one from the argument',
151
+ { voice: nextVoice, model: nextModel },
152
+ );
153
+ } else {
154
+ nextVoice = voiceFromModel;
155
+ }
156
+ nextModel = nextModel.slice(0, idx) as TModel;
157
+ }
158
+ }
159
+
160
+ this.opts = {
161
+ model: nextModel,
162
+ voice: nextVoice,
163
+ language,
164
+ encoding,
165
+ sampleRate,
166
+ baseURL: lkBaseURL,
167
+ apiKey: lkApiKey,
168
+ apiSecret: lkApiSecret,
169
+ modelOptions,
170
+ };
171
+ }
172
+
173
+ get label() {
174
+ return 'inference.TTS';
175
+ }
176
+
177
+ static fromModelString(modelString: string): TTS<AnyString> {
178
+ if (modelString.includes(':')) {
179
+ const [model, voice] = modelString.split(':') as [TTSModels, string];
180
+ return new TTS({ model, voice });
181
+ }
182
+ return new TTS({ model: modelString });
183
+ }
184
+
185
+ updateOptions(opts: Partial<Pick<InferenceTTSOptions<TModel>, 'model' | 'voice' | 'language'>>) {
186
+ this.opts = { ...this.opts, ...opts };
187
+ for (const stream of this.streams) {
188
+ stream.updateOptions(opts);
189
+ }
190
+ }
191
+
192
+ synthesize(_: string): ChunkedStream {
193
+ throw new Error('ChunkedStream is not implemented');
194
+ }
195
+
196
+ stream(options?: { connOptions?: APIConnectOptions }): SynthesizeStream<TModel> {
197
+ const { connOptions = DEFAULT_API_CONNECT_OPTIONS } = options || {};
198
+ const stream = new SynthesizeStream(this, { ...this.opts }, connOptions);
199
+ this.streams.add(stream);
200
+ return stream;
201
+ }
202
+
203
+ async connectWs(timeout: number): Promise<WebSocket> {
204
+ let baseURL = this.opts.baseURL;
205
+ if (baseURL.startsWith('http://') || baseURL.startsWith('https://')) {
206
+ baseURL = baseURL.replace('http', 'ws');
207
+ }
208
+
209
+ const token = await createAccessToken(this.opts.apiKey, this.opts.apiSecret);
210
+ const url = `${baseURL}/tts`;
211
+ const headers = { Authorization: `Bearer ${token}` } as Record<string, string>;
212
+
213
+ const params = {
214
+ type: 'session.create',
215
+ sample_rate: String(this.opts.sampleRate),
216
+ encoding: this.opts.encoding,
217
+ extra: this.opts.modelOptions,
218
+ } as TtsSessionCreateEvent;
219
+
220
+ if (this.opts.voice) params.voice = this.opts.voice;
221
+ if (this.opts.model) params.model = this.opts.model;
222
+ if (this.opts.language) params.language = this.opts.language;
223
+
224
+ const socket = await connectWs(url, headers, timeout);
225
+ socket.send(JSON.stringify(params));
226
+ return socket;
227
+ }
228
+
229
+ async closeWs(ws: WebSocket) {
230
+ await ws.close();
231
+ }
232
+
233
+ async close() {
234
+ for (const stream of this.streams) {
235
+ await stream.close();
236
+ }
237
+ this.streams.clear();
238
+ }
239
+ }
240
+
241
+ export class SynthesizeStream<TModel extends TTSModels> extends BaseSynthesizeStream {
242
+ private opts: InferenceTTSOptions<TModel>;
243
+ private tts: TTS<TModel>;
244
+ private connOptions: APIConnectOptions;
245
+
246
+ #logger = log();
247
+
248
+ constructor(tts: TTS<TModel>, opts: InferenceTTSOptions<TModel>, connOptions: APIConnectOptions) {
249
+ super(tts, connOptions);
250
+ this.opts = opts;
251
+ this.tts = tts;
252
+ this.connOptions = connOptions;
253
+ }
254
+
255
+ get label() {
256
+ return 'inference.SynthesizeStream';
257
+ }
258
+
259
+ updateOptions(opts: Partial<Pick<InferenceTTSOptions<TModel>, 'model' | 'voice' | 'language'>>) {
260
+ this.opts = { ...this.opts, ...opts };
261
+ }
262
+
263
+ protected async run(): Promise<void> {
264
+ let ws: WebSocket | null = null;
265
+ let closing = false;
266
+ let finalReceived = false;
267
+ let lastFrame: AudioFrame | undefined;
268
+
269
+ const sendTokenizerStream = new tokenizeBasic.SentenceTokenizer().stream();
270
+ const eventChannel = createStreamChannel<TtsServerEvent>();
271
+ const requestId = shortuuid('tts_request_');
272
+
273
+ const resourceCleanup = () => {
274
+ if (closing) return;
275
+ closing = true;
276
+ sendTokenizerStream.close();
277
+ eventChannel.close();
278
+ ws?.removeAllListeners();
279
+ ws?.close();
280
+ };
281
+
282
+ const sendClientEvent = async (event: TtsClientEvent) => {
283
+ const validatedEvent = await ttsClientEventSchema.parseAsync(event);
284
+ if (!ws || ws.readyState !== WebSocket.OPEN) {
285
+ this.#logger.warn('Trying to send client TTS event to a closed WebSocket');
286
+ return;
287
+ }
288
+ ws.send(JSON.stringify(validatedEvent));
289
+ };
290
+
291
+ const sendLastFrame = (segmentId: string, final: boolean) => {
292
+ if (lastFrame) {
293
+ this.queue.put({ requestId, segmentId, frame: lastFrame, final });
294
+ lastFrame = undefined;
295
+ }
296
+ };
297
+
298
+ const createInputTask = async () => {
299
+ for await (const data of this.input) {
300
+ if (this.abortController.signal.aborted) break;
301
+ if (data === SynthesizeStream.FLUSH_SENTINEL) {
302
+ sendTokenizerStream.flush();
303
+ continue;
304
+ }
305
+ sendTokenizerStream.pushText(data);
306
+ }
307
+ sendTokenizerStream.endInput();
308
+ };
309
+
310
+ const createSentenceStreamTask = async () => {
311
+ for await (const ev of sendTokenizerStream) {
312
+ if (this.abortController.signal.aborted) break;
313
+
314
+ sendClientEvent({
315
+ type: 'input_transcript',
316
+ transcript: ev.token + ' ',
317
+ });
318
+ }
319
+
320
+ sendClientEvent({ type: 'session.flush' });
321
+ };
322
+
323
+ const createWsListenerTask = async (ws: WebSocket) => {
324
+ return new Promise<void>((resolve, reject) => {
325
+ this.abortController.signal.addEventListener('abort', () => {
326
+ resourceCleanup();
327
+ reject(new Error('WebSocket connection aborted'));
328
+ });
329
+
330
+ ws.on('message', async (data) => {
331
+ const eventJson = JSON.parse(data.toString()) as Record<string, unknown>;
332
+ const validatedEvent = ttsServerEventSchema.parse(eventJson);
333
+ eventChannel.write(validatedEvent);
334
+ });
335
+
336
+ ws.on('error', (e) => {
337
+ this.#logger.error({ error: e }, 'WebSocket error');
338
+ resourceCleanup();
339
+ reject(e);
340
+ });
341
+
342
+ ws.on('close', () => {
343
+ resourceCleanup();
344
+
345
+ if (!closing) return this.#logger.error('WebSocket closed unexpectedly');
346
+ if (finalReceived) return resolve();
347
+
348
+ reject(
349
+ new APIStatusError({
350
+ message: 'Gateway connection closed unexpectedly',
351
+ options: { requestId },
352
+ }),
353
+ );
354
+ });
355
+ });
356
+ };
357
+
358
+ const createRecvTask = async () => {
359
+ let currentSessionId: string | null = null;
360
+
361
+ const bstream = new AudioByteStream(this.opts.sampleRate, NUM_CHANNELS);
362
+ const serverEventStream = eventChannel.stream();
363
+ const reader = serverEventStream.getReader();
364
+
365
+ try {
366
+ while (!this.closed && !this.abortController.signal.aborted) {
367
+ const result = await reader.read();
368
+ if (this.abortController.signal.aborted) return;
369
+ if (result.done) return;
370
+
371
+ const serverEvent = result.value;
372
+ switch (serverEvent.type) {
373
+ case 'session.created':
374
+ currentSessionId = serverEvent.session_id;
375
+ break;
376
+ case 'output_audio':
377
+ const base64Data = new Int8Array(Buffer.from(serverEvent.audio, 'base64'));
378
+ for (const frame of bstream.write(base64Data.buffer)) {
379
+ sendLastFrame(currentSessionId!, false);
380
+ lastFrame = frame;
381
+ }
382
+ break;
383
+ case 'done':
384
+ finalReceived = true;
385
+ for (const frame of bstream.flush()) {
386
+ sendLastFrame(currentSessionId!, false);
387
+ lastFrame = frame;
388
+ }
389
+ sendLastFrame(currentSessionId!, true);
390
+ this.queue.put(SynthesizeStream.END_OF_STREAM);
391
+ break;
392
+ case 'session.closed':
393
+ resourceCleanup();
394
+ break;
395
+ case 'error':
396
+ this.#logger.error(
397
+ { serverEvent },
398
+ 'Received error message from LiveKit TTS WebSocket',
399
+ );
400
+ resourceCleanup();
401
+ throw new APIError(`LiveKit TTS returned error: ${serverEvent.message}`);
402
+ default:
403
+ this.#logger.warn('Unexpected message %s', serverEvent);
404
+ break;
405
+ }
406
+ }
407
+ } finally {
408
+ reader.releaseLock();
409
+ try {
410
+ await serverEventStream.cancel();
411
+ } catch (e) {
412
+ this.#logger.debug('Error cancelling serverEventStream (may already be cancelled):', e);
413
+ }
414
+ }
415
+ };
416
+
417
+ try {
418
+ ws = await this.tts.connectWs(this.connOptions.timeoutMs);
419
+
420
+ await Promise.all([
421
+ createInputTask(),
422
+ createSentenceStreamTask(),
423
+ createWsListenerTask(ws),
424
+ createRecvTask(),
425
+ ]);
426
+ } catch (e) {
427
+ this.#logger.error('Error in SynthesizeStream', { error: e });
428
+ } finally {
429
+ resourceCleanup();
430
+ }
431
+ }
432
+ }
@@ -0,0 +1,66 @@
1
+ // SPDX-FileCopyrightText: 2025 LiveKit, Inc.
2
+ //
3
+ // SPDX-License-Identifier: Apache-2.0
4
+ import { AccessToken } from 'livekit-server-sdk';
5
+ import { WebSocket } from 'ws';
6
+ import { APIConnectionError, APIStatusError } from '../index.js';
7
+
8
// The "LiteralUnion" trick: `string & NonNullable<unknown>` accepts any string
// while keeping editor autocomplete for the literal members it is unioned with.
export type AnyString = string & NonNullable<unknown>;
9
+
10
+ export async function createAccessToken(
11
+ apiKey: string,
12
+ apiSecret: string,
13
+ ttl: number = 600,
14
+ ): Promise<string> {
15
+ const token = new AccessToken(apiKey, apiSecret, { identity: 'agent', ttl });
16
+ token.addInferenceGrant({ perform: true });
17
+
18
+ return await token.toJwt();
19
+ }
20
+
21
+ export async function connectWs(
22
+ url: string,
23
+ headers: Record<string, string>,
24
+ timeoutMs: number,
25
+ ): Promise<WebSocket> {
26
+ return new Promise<WebSocket>((resolve, reject) => {
27
+ const socket = new WebSocket(url, { headers: headers });
28
+
29
+ const timeout = setTimeout(() => {
30
+ reject(new APIConnectionError({ message: 'Timeout connecting to LiveKit WebSocket' }));
31
+ }, timeoutMs);
32
+
33
+ const onOpen = () => {
34
+ clearTimeout(timeout);
35
+ resolve(socket);
36
+ };
37
+
38
+ const onError = (err: unknown) => {
39
+ clearTimeout(timeout);
40
+ if (err && typeof err === 'object' && 'code' in err && (err as any).code === 429) {
41
+ reject(
42
+ new APIStatusError({
43
+ message: 'LiveKit gateway quota exceeded',
44
+ options: { statusCode: 429 },
45
+ }),
46
+ );
47
+ } else {
48
+ reject(new APIConnectionError({ message: 'Error connecting to LiveKit WebSocket' }));
49
+ }
50
+ };
51
+
52
+ const onClose = (code: number) => {
53
+ clearTimeout(timeout);
54
+ if (code !== 1000) {
55
+ reject(
56
+ new APIConnectionError({
57
+ message: 'Connection closed unexpectedly',
58
+ }),
59
+ );
60
+ }
61
+ };
62
+ socket.once('open', onOpen);
63
+ socket.once('error', onError);
64
+ socket.once('close', onClose);
65
+ });
66
+ }
@@ -6,9 +6,9 @@ import type { ChatItem } from './chat_context.js';
6
6
 
7
7
  export interface RemoteChatItem {
8
8
  item: ChatItem;
9
- /* @internal */
9
+ /** @internal */
10
10
  _prev?: RemoteChatItem | null;
11
- /* @internal */
11
+ /** @internal */
12
12
  _next?: RemoteChatItem | null;
13
13
  }
14
14
 
package/src/tts/tts.ts CHANGED
@@ -443,7 +443,7 @@ export abstract class ChunkedStream implements AsyncIterableIterator<Synthesized
443
443
  for await (const audio of this.queue) {
444
444
  this.output.put(audio);
445
445
  requestId = audio.requestId;
446
- if (!ttfb) {
446
+ if (ttfb === BigInt(-1)) {
447
447
  ttfb = process.hrtime.bigint() - startTime;
448
448
  }
449
449
  audioDuration += audio.frame.samplesPerChannel / audio.frame.sampleRate;
package/src/utils.ts CHANGED
@@ -817,3 +817,14 @@ export async function waitForTrackPublication({
817
817
  room.off(RoomEvent.TrackPublished, onTrackPublished);
818
818
  }
819
819
  }
820
+
821
+ export async function waitForAbort(signal: AbortSignal) {
822
+ const abortFuture = new Future<void>();
823
+ const handler = () => {
824
+ abortFuture.resolve();
825
+ signal.removeEventListener('abort', handler);
826
+ };
827
+
828
+ signal.addEventListener('abort', handler, { once: true });
829
+ return await abortFuture.await;
830
+ }
@@ -4,6 +4,14 @@
4
4
  import type { AudioFrame } from '@livekit/rtc-node';
5
5
  import { AsyncLocalStorage } from 'node:async_hooks';
6
6
  import { ReadableStream } from 'node:stream/web';
7
+ import {
8
+ LLM as InferenceLLM,
9
+ STT as InferenceSTT,
10
+ TTS as InferenceTTS,
11
+ type LLMModels,
12
+ type STTModelString,
13
+ type TTSModelString,
14
+ } from '../inference/index.js';
7
15
  import { ReadonlyChatContext } from '../llm/chat_context.js';
8
16
  import type { ChatMessage, FunctionCall, RealtimeModel } from '../llm/index.js';
9
17
  import {
@@ -46,7 +54,7 @@ export function isStopResponse(value: unknown): value is StopResponse {
46
54
  }
47
55
 
48
56
  export interface ModelSettings {
49
- /* The tool choice to use when calling the LLM. */
57
+ /** The tool choice to use when calling the LLM. */
50
58
  toolChoice?: ToolChoice;
51
59
  }
52
60
 
@@ -55,10 +63,10 @@ export interface AgentOptions<UserData> {
55
63
  chatCtx?: ChatContext;
56
64
  tools?: ToolContext<UserData>;
57
65
  turnDetection?: TurnDetectionMode;
58
- stt?: STT;
66
+ stt?: STT | STTModelString;
59
67
  vad?: VAD;
60
- llm?: LLM | RealtimeModel;
61
- tts?: TTS;
68
+ llm?: LLM | RealtimeModel | LLMModels;
69
+ tts?: TTS | TTSModelString;
62
70
  allowInterruptions?: boolean;
63
71
  minConsecutiveSpeechDelay?: number;
64
72
  }
@@ -101,10 +109,26 @@ export class Agent<UserData = any> {
101
109
  : ChatContext.empty();
102
110
 
103
111
  this.turnDetection = turnDetection;
104
- this._stt = stt;
105
112
  this._vad = vad;
106
- this._llm = llm;
107
- this._tts = tts;
113
+
114
+ if (typeof stt === 'string') {
115
+ this._stt = InferenceSTT.fromModelString(stt);
116
+ } else {
117
+ this._stt = stt;
118
+ }
119
+
120
+ if (typeof llm === 'string') {
121
+ this._llm = InferenceLLM.fromModelString(llm);
122
+ } else {
123
+ this._llm = llm;
124
+ }
125
+
126
+ if (typeof tts === 'string') {
127
+ this._tts = InferenceTTS.fromModelString(tts);
128
+ } else {
129
+ this._tts = tts;
130
+ }
131
+
108
132
  this._agentActivity = undefined;
109
133
  }
110
134
 
@@ -641,6 +641,7 @@ export class AgentActivity implements RecognitionHooks {
641
641
  createUserInputTranscribedEvent({
642
642
  transcript: ev.alternatives![0].text,
643
643
  isFinal: false,
644
+ language: ev.alternatives![0].language,
644
645
  // TODO(AJS-106): add multi participant support
645
646
  }),
646
647
  );
@@ -657,6 +658,7 @@ export class AgentActivity implements RecognitionHooks {
657
658
  createUserInputTranscribedEvent({
658
659
  transcript: ev.alternatives![0].text,
659
660
  isFinal: true,
661
+ language: ev.alternatives![0].language,
660
662
  // TODO(AJS-106): add multi participant support
661
663
  }),
662
664
  );
@@ -5,6 +5,14 @@ import type { AudioFrame, Room } from '@livekit/rtc-node';
5
5
  import type { TypedEventEmitter as TypedEmitter } from '@livekit/typed-emitter';
6
6
  import { EventEmitter } from 'node:events';
7
7
  import type { ReadableStream } from 'node:stream/web';
8
+ import {
9
+ LLM as InferenceLLM,
10
+ STT as InferenceSTT,
11
+ TTS as InferenceTTS,
12
+ type LLMModels,
13
+ type STTModelString,
14
+ type TTSModelString,
15
+ } from '../inference/index.js';
8
16
  import { getJobContext } from '../job.js';
9
17
  import { ChatContext, ChatMessage } from '../llm/chat_context.js';
10
18
  import type { LLM, RealtimeModel, RealtimeModelError, ToolChoice } from '../llm/index.js';
@@ -77,10 +85,10 @@ export type AgentSessionCallbacks = {
77
85
 
78
86
  export type AgentSessionOptions<UserData = UnknownUserData> = {
79
87
  turnDetection?: TurnDetectionMode;
80
- stt?: STT;
88
+ stt?: STT | STTModelString;
81
89
  vad?: VAD;
82
- llm?: LLM | RealtimeModel;
83
- tts?: TTS;
90
+ llm?: LLM | RealtimeModel | LLMModels;
91
+ tts?: TTS | TTSModelString;
84
92
  userData?: UserData;
85
93
  voiceOptions?: Partial<VoiceOptions>;
86
94
  };
@@ -128,9 +136,25 @@ export class AgentSession<
128
136
  } = opts;
129
137
 
130
138
  this.vad = vad;
131
- this.stt = stt;
132
- this.llm = llm;
133
- this.tts = tts;
139
+
140
+ if (typeof stt === 'string') {
141
+ this.stt = InferenceSTT.fromModelString(stt);
142
+ } else {
143
+ this.stt = stt;
144
+ }
145
+
146
+ if (typeof llm === 'string') {
147
+ this.llm = InferenceLLM.fromModelString(llm);
148
+ } else {
149
+ this.llm = llm;
150
+ }
151
+
152
+ if (typeof tts === 'string') {
153
+ this.tts = InferenceTTS.fromModelString(tts);
154
+ } else {
155
+ this.tts = tts;
156
+ }
157
+
134
158
  this.turnDetection = turnDetection;
135
159
  this._userData = userData;
136
160