@livekit/agents 1.0.5 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. package/dist/index.cjs +3 -0
  2. package/dist/index.cjs.map +1 -1
  3. package/dist/index.d.cts +2 -1
  4. package/dist/index.d.ts +2 -1
  5. package/dist/index.d.ts.map +1 -1
  6. package/dist/index.js +2 -0
  7. package/dist/index.js.map +1 -1
  8. package/dist/inference/api_protos.cjs +104 -0
  9. package/dist/inference/api_protos.cjs.map +1 -0
  10. package/dist/inference/api_protos.d.cts +222 -0
  11. package/dist/inference/api_protos.d.ts +222 -0
  12. package/dist/inference/api_protos.d.ts.map +1 -0
  13. package/dist/inference/api_protos.js +70 -0
  14. package/dist/inference/api_protos.js.map +1 -0
  15. package/dist/inference/index.cjs +56 -0
  16. package/dist/inference/index.cjs.map +1 -0
  17. package/dist/inference/index.d.cts +9 -0
  18. package/dist/inference/index.d.ts +9 -0
  19. package/dist/inference/index.d.ts.map +1 -0
  20. package/dist/inference/index.js +16 -0
  21. package/dist/inference/index.js.map +1 -0
  22. package/dist/inference/llm.cjs +315 -0
  23. package/dist/inference/llm.cjs.map +1 -0
  24. package/dist/inference/llm.d.cts +92 -0
  25. package/dist/inference/llm.d.ts +92 -0
  26. package/dist/inference/llm.d.ts.map +1 -0
  27. package/dist/inference/llm.js +286 -0
  28. package/dist/inference/llm.js.map +1 -0
  29. package/dist/inference/stt.cjs +305 -0
  30. package/dist/inference/stt.cjs.map +1 -0
  31. package/dist/inference/stt.d.cts +79 -0
  32. package/dist/inference/stt.d.ts +79 -0
  33. package/dist/inference/stt.d.ts.map +1 -0
  34. package/dist/inference/stt.js +284 -0
  35. package/dist/inference/stt.js.map +1 -0
  36. package/dist/inference/tts.cjs +317 -0
  37. package/dist/inference/tts.cjs.map +1 -0
  38. package/dist/inference/tts.d.cts +75 -0
  39. package/dist/inference/tts.d.ts +75 -0
  40. package/dist/inference/tts.d.ts.map +1 -0
  41. package/dist/inference/tts.js +299 -0
  42. package/dist/inference/tts.js.map +1 -0
  43. package/dist/inference/utils.cjs +76 -0
  44. package/dist/inference/utils.cjs.map +1 -0
  45. package/dist/inference/utils.d.cts +5 -0
  46. package/dist/inference/utils.d.ts +5 -0
  47. package/dist/inference/utils.d.ts.map +1 -0
  48. package/dist/inference/utils.js +51 -0
  49. package/dist/inference/utils.js.map +1 -0
  50. package/dist/tts/tts.cjs +1 -1
  51. package/dist/tts/tts.cjs.map +1 -1
  52. package/dist/tts/tts.js +1 -1
  53. package/dist/tts/tts.js.map +1 -1
  54. package/dist/utils.cjs +11 -0
  55. package/dist/utils.cjs.map +1 -1
  56. package/dist/utils.d.cts +1 -0
  57. package/dist/utils.d.ts +1 -0
  58. package/dist/utils.d.ts.map +1 -1
  59. package/dist/utils.js +10 -0
  60. package/dist/utils.js.map +1 -1
  61. package/dist/voice/agent.cjs +16 -3
  62. package/dist/voice/agent.cjs.map +1 -1
  63. package/dist/voice/agent.d.cts +4 -3
  64. package/dist/voice/agent.d.ts +4 -3
  65. package/dist/voice/agent.d.ts.map +1 -1
  66. package/dist/voice/agent.js +20 -3
  67. package/dist/voice/agent.js.map +1 -1
  68. package/dist/voice/agent_session.cjs +16 -3
  69. package/dist/voice/agent_session.cjs.map +1 -1
  70. package/dist/voice/agent_session.d.cts +4 -3
  71. package/dist/voice/agent_session.d.ts +4 -3
  72. package/dist/voice/agent_session.d.ts.map +1 -1
  73. package/dist/voice/agent_session.js +20 -3
  74. package/dist/voice/agent_session.js.map +1 -1
  75. package/dist/voice/room_io/_input.cjs +9 -0
  76. package/dist/voice/room_io/_input.cjs.map +1 -1
  77. package/dist/voice/room_io/_input.d.ts.map +1 -1
  78. package/dist/voice/room_io/_input.js +10 -0
  79. package/dist/voice/room_io/_input.js.map +1 -1
  80. package/dist/worker.cjs.map +1 -1
  81. package/dist/worker.d.ts.map +1 -1
  82. package/dist/worker.js +1 -1
  83. package/dist/worker.js.map +1 -1
  84. package/package.json +3 -2
  85. package/src/index.ts +2 -1
  86. package/src/inference/api_protos.ts +82 -0
  87. package/src/inference/index.ts +12 -0
  88. package/src/inference/llm.ts +485 -0
  89. package/src/inference/stt.ts +414 -0
  90. package/src/inference/tts.ts +421 -0
  91. package/src/inference/utils.ts +66 -0
  92. package/src/tts/tts.ts +1 -1
  93. package/src/utils.ts +11 -0
  94. package/src/voice/agent.ts +30 -6
  95. package/src/voice/agent_session.ts +29 -6
  96. package/src/voice/room_io/_input.ts +12 -1
  97. package/src/worker.ts +2 -7
@@ -0,0 +1,414 @@
1
+ // SPDX-FileCopyrightText: 2025 LiveKit, Inc.
2
+ //
3
+ // SPDX-License-Identifier: Apache-2.0
4
+ import { type AudioFrame } from '@livekit/rtc-node';
5
+ import { type RawData, WebSocket } from 'ws';
6
+ import { APIError, APIStatusError } from '../_exceptions.js';
7
+ import { AudioByteStream } from '../audio.js';
8
+ import { log } from '../log.js';
9
+ import {
10
+ STT as BaseSTT,
11
+ SpeechStream as BaseSpeechStream,
12
+ type SpeechData,
13
+ type SpeechEvent,
14
+ SpeechEventType,
15
+ } from '../stt/index.js';
16
+ import { type APIConnectOptions, DEFAULT_API_CONNECT_OPTIONS } from '../types.js';
17
+ import { type AudioBuffer, Event, Task, cancelAndWait, shortuuid, waitForAbort } from '../utils.js';
18
+ import { type AnyModels, connectWs, createAccessToken } from './utils.js';
19
+
20
+ export type DeepgramModels =
21
+ | 'deepgram'
22
+ | 'deepgram/nova-3'
23
+ | 'deepgram/nova-3-general'
24
+ | 'deepgram/nova-3-medical'
25
+ | 'deepgram/nova-2'
26
+ | 'deepgram/nova-2-general'
27
+ | 'deepgram/nova-2-medical'
28
+ | 'deepgram/nova-2-phonecall';
29
+
30
+ export type CartesiaModels = 'cartesia' | 'cartesia/ink-whisper';
31
+
32
+ export type AssemblyaiModels = 'assemblyai' | 'assemblyai/universal-streaming';
33
+
34
+ export interface CartesiaOptions {
35
+ min_volume?: number; // default: not specified
36
+ max_silence_duration_secs?: number; // default: not specified
37
+ }
38
+
39
+ export interface DeepgramOptions {
40
+ filler_words?: boolean; // default: true
41
+ interim_results?: boolean; // default: true
42
+ endpointing?: number; // default: 25 (ms)
43
+ punctuate?: boolean; // default: false
44
+ smart_format?: boolean;
45
+ keywords?: Array<[string, number]>;
46
+ keyterms?: string[];
47
+ profanity_filter?: boolean;
48
+ numerals?: boolean;
49
+ mip_opt_out?: boolean;
50
+ }
51
+
52
+ export interface AssemblyaiOptions {
53
+ format_turns?: boolean; // default: false
54
+ end_of_turn_confidence_threshold?: number; // default: 0.01
55
+ min_end_of_turn_silence_when_confident?: number; // default: 0
56
+ max_turn_silence?: number; // default: not specified
57
+ keyterms_prompt?: string[]; // default: not specified
58
+ }
59
+
60
+ export type STTModels = DeepgramModels | CartesiaModels | AssemblyaiModels | AnyModels;
61
+ export type STTOptions<TModel extends STTModels> = TModel extends DeepgramModels
62
+ ? DeepgramOptions
63
+ : TModel extends CartesiaModels
64
+ ? CartesiaOptions
65
+ : TModel extends AssemblyaiModels
66
+ ? AssemblyaiOptions
67
+ : Record<string, unknown>;
68
+
69
+ export type STTLanguages = 'en' | 'de' | 'es' | 'fr' | 'ja' | 'pt' | 'zh';
70
+ export type STTEncoding = 'pcm_s16le';
71
+
72
+ const DEFAULT_ENCODING: STTEncoding = 'pcm_s16le';
73
+ const DEFAULT_SAMPLE_RATE = 16000;
74
+ const DEFAULT_BASE_URL = 'wss://agent-gateway.livekit.cloud/v1';
75
+ const DEFAULT_CANCEL_TIMEOUT = 5000;
76
+
77
+ export interface InferenceSTTOptions<TModel extends STTModels> {
78
+ model: TModel;
79
+ language?: STTLanguages | string;
80
+ encoding: STTEncoding;
81
+ sampleRate: number;
82
+ baseURL: string;
83
+ apiKey: string;
84
+ apiSecret: string;
85
+ extraKwargs: STTOptions<TModel>;
86
+ }
87
+
88
+ export class STT<TModel extends STTModels> extends BaseSTT {
89
+ private opts: InferenceSTTOptions<TModel>;
90
+ private streams: Set<SpeechStream<TModel>> = new Set();
91
+
92
+ constructor(opts: {
93
+ model: TModel;
94
+ language?: STTLanguages | string;
95
+ baseURL?: string;
96
+ encoding?: STTEncoding;
97
+ sampleRate?: number;
98
+ apiKey?: string;
99
+ apiSecret?: string;
100
+ extraKwargs?: STTOptions<TModel>;
101
+ }) {
102
+ super({ streaming: true, interimResults: true });
103
+
104
+ const {
105
+ model,
106
+ language,
107
+ baseURL,
108
+ encoding = DEFAULT_ENCODING,
109
+ sampleRate = DEFAULT_SAMPLE_RATE,
110
+ apiKey,
111
+ apiSecret,
112
+ extraKwargs = {} as STTOptions<TModel>,
113
+ } = opts || {};
114
+
115
+ const lkBaseURL = baseURL || process.env.LIVEKIT_INFERENCE_URL || DEFAULT_BASE_URL;
116
+ const lkApiKey = apiKey || process.env.LIVEKIT_INFERENCE_API_KEY || process.env.LIVEKIT_API_KEY;
117
+ if (!lkApiKey) {
118
+ throw new Error('apiKey is required: pass apiKey or set LIVEKIT_API_KEY');
119
+ }
120
+
121
+ const lkApiSecret =
122
+ apiSecret || process.env.LIVEKIT_INFERENCE_API_SECRET || process.env.LIVEKIT_API_SECRET;
123
+ if (!lkApiSecret) {
124
+ throw new Error('apiSecret is required: pass apiSecret or set LIVEKIT_API_SECRET');
125
+ }
126
+
127
+ this.opts = {
128
+ model,
129
+ language,
130
+ encoding,
131
+ sampleRate,
132
+ baseURL: lkBaseURL,
133
+ apiKey: lkApiKey,
134
+ apiSecret: lkApiSecret,
135
+ extraKwargs,
136
+ };
137
+ }
138
+
139
+ get label(): string {
140
+ return 'inference.STT';
141
+ }
142
+
143
+ protected async _recognize(_: AudioBuffer): Promise<SpeechEvent> {
144
+ throw new Error('LiveKit STT does not support batch recognition, use stream() instead');
145
+ }
146
+
147
+ updateOptions(opts: Partial<Pick<InferenceSTTOptions<TModel>, 'model' | 'language'>>): void {
148
+ this.opts = { ...this.opts, ...opts };
149
+
150
+ for (const stream of this.streams) {
151
+ stream.updateOptions(opts);
152
+ }
153
+ }
154
+
155
+ stream(options?: {
156
+ language?: STTLanguages | string;
157
+ connOptions?: APIConnectOptions;
158
+ }): SpeechStream<TModel> {
159
+ const { language, connOptions = DEFAULT_API_CONNECT_OPTIONS } = options || {};
160
+ const streamOpts = {
161
+ ...this.opts,
162
+ language: language ?? this.opts.language,
163
+ } as InferenceSTTOptions<TModel>;
164
+
165
+ const stream = new SpeechStream(this, streamOpts, connOptions);
166
+ this.streams.add(stream);
167
+
168
+ return stream;
169
+ }
170
+ }
171
+
172
/**
 * One streaming recognition session against the inference gateway.
 *
 * Audio frames pushed into `this.input` are chunked, base64-encoded, and sent
 * over a WebSocket; transcript messages received back are converted into
 * SpeechEvents on `this.queue`.
 */
export class SpeechStream<TModel extends STTModels> extends BaseSpeechStream {
  private opts: InferenceSTTOptions<TModel>;
  // Fallback request id used when the server message carries no request_id.
  private requestId = shortuuid('stt_request_');
  // True between the synthetic START_OF_SPEECH and END_OF_SPEECH events.
  private speaking = false;
  // Seconds of audio sent since the last RECOGNITION_USAGE event was emitted.
  private speechDuration = 0;
  // Set externally (or in run()) to force a reconnect of the WebSocket loop.
  private reconnectEvent = new Event();

  #logger = log();

  constructor(
    sttImpl: STT<TModel>,
    opts: InferenceSTTOptions<TModel>,
    connOptions: APIConnectOptions,
  ) {
    super(sttImpl, opts.sampleRate, connOptions);
    this.opts = opts;
  }

  get label(): string {
    return 'inference.SpeechStream';
  }

  /**
   * Updates model/language for this stream. Note: the new values only take
   * effect on the next (re)connect, since session.create is sent at connect time.
   */
  updateOptions(opts: Partial<Pick<InferenceSTTOptions<TModel>, 'model' | 'language'>>): void {
    this.opts = { ...this.opts, ...opts };
  }

  /**
   * Main loop: connect, then run concurrent send/recv tasks until the input
   * is exhausted, an error occurs, or a reconnect is requested.
   */
  protected async run(): Promise<void> {
    let ws: WebSocket | null = null;
    // Set just before session.finalize so a subsequent 'close' is treated as
    // a clean shutdown rather than an unexpected disconnect.
    let closingWs = false;

    // Start set so the first loop iteration connects immediately.
    this.reconnectEvent.set();

    // Opens the WebSocket and sends the session.create message describing
    // the audio format, model, language, and provider-specific extras.
    const connect = async () => {
      const params = {
        settings: {
          sample_rate: String(this.opts.sampleRate),
          encoding: this.opts.encoding,
          extra: this.opts.extraKwargs,
        },
      } as Record<string, unknown>;

      if (this.opts.model) {
        params.model = this.opts.model;
      }

      if (this.opts.language) {
        (params.settings as Record<string, unknown>).language = this.opts.language;
      }

      // Accept http(s) base URLs by rewriting the scheme to ws(s).
      let baseURL = this.opts.baseURL;
      if (baseURL.startsWith('http://') || baseURL.startsWith('https://')) {
        baseURL = baseURL.replace('http', 'ws');
      }

      const token = await createAccessToken(this.opts.apiKey, this.opts.apiSecret);
      const url = `${baseURL}/stt`;
      const headers = { Authorization: `Bearer ${token}` } as Record<string, string>;

      const socket = await connectWs(url, headers, 10000);
      const msg = { ...params, type: 'session.create' };
      socket.send(JSON.stringify(msg));

      return socket;
    };

    // Drains this.input: chunks audio into ~50ms frames, base64-encodes each,
    // and sends it; finalizes the session when the input ends.
    const send = async (socket: WebSocket, signal: AbortSignal) => {
      const audioStream = new AudioByteStream(
        this.opts.sampleRate,
        1,
        Math.floor(this.opts.sampleRate / 20), // 50ms
      );

      for await (const ev of this.input) {
        if (signal.aborted) break;
        let frames: AudioFrame[];

        if (ev === SpeechStream.FLUSH_SENTINEL) {
          frames = audioStream.flush();
        } else {
          const frame = ev as AudioFrame;
          frames = audioStream.write(new Int16Array(frame.data).buffer);
        }

        for (const frame of frames) {
          // Accumulate sent-audio duration for usage reporting.
          this.speechDuration += frame.samplesPerChannel / frame.sampleRate;
          const base64 = Buffer.from(frame.data.buffer).toString('base64');
          const msg = { type: 'input_audio', audio: base64 };
          socket.send(JSON.stringify(msg));
        }
      }

      // Input exhausted: mark the shutdown as intentional before asking the
      // server to finalize, so recv() doesn't treat the close as an error.
      closingWs = true;
      socket.send(JSON.stringify({ type: 'session.finalize' }));
    };

    // Reads server messages one at a time and dispatches them by type.
    const recv = async (socket: WebSocket, signal: AbortSignal) => {
      while (!this.closed && !signal.aborted) {
        // Wrap one-shot listeners in a promise so a single message, error,
        // or close settles it; all three listeners are detached on settle.
        const dataPromise = new Promise<string>((resolve, reject) => {
          const messageHandler = (d: RawData) => {
            resolve(d.toString());
            removeListeners();
          };
          const errorHandler = (e: Error) => {
            reject(e);
            removeListeners();
          };
          const closeHandler = (code: number) => {
            if (closingWs) {
              // Expected close after session.finalize — resolve empty to
              // exit the recv loop quietly.
              resolve('');
            } else {
              reject(
                new APIStatusError({
                  message: 'LiveKit STT connection closed unexpectedly',
                  options: { statusCode: code },
                }),
              );
            }
            removeListeners();
          };
          const removeListeners = () => {
            socket.removeListener('message', messageHandler);
            socket.removeListener('error', errorHandler);
            socket.removeListener('close', closeHandler);
          };
          socket.once('message', messageHandler);
          socket.once('error', errorHandler);
          socket.once('close', closeHandler);
        });

        const data = await Promise.race([dataPromise, waitForAbort(signal)]);

        if (!data || signal.aborted) return;

        const json = JSON.parse(data);
        const type = json.type as string | undefined;

        switch (type) {
          case 'session.created':
          case 'session.finalized':
          case 'session.closed':
            // Lifecycle acknowledgements carry no transcript data.
            break;
          case 'interim_transcript':
            this.processTranscript(json, false);
            break;
          case 'final_transcript':
            this.processTranscript(json, true);
            break;
          case 'error':
            this.#logger.error('received error from LiveKit STT: %o', json);
            throw new APIError(`LiveKit STT returned error: ${JSON.stringify(json)}`);
          default:
            this.#logger.warn('received unexpected message from LiveKit STT: %o', json);
            break;
        }
      }
    };

    // Connect/serve loop: each iteration is one WebSocket session. The loop
    // repeats when a reconnect is requested and exits when send+recv finish.
    while (true) {
      try {
        ws = await connect();

        const sendTask = Task.from(async ({ signal }) => {
          await send(ws!, signal);
        });

        const recvTask = Task.from(async ({ signal }) => {
          await recv(ws!, signal);
        });

        const tasks = [sendTask, recvTask];
        const waitReconnectTask = Task.from(async ({ signal }) => {
          await Promise.race([this.reconnectEvent.wait(), waitForAbort(signal)]);
        });

        try {
          await Promise.race([
            Promise.all(tasks.map((task) => task.result)),
            waitReconnectTask.result,
          ]);

          // If the reconnect task is still pending, send+recv completed
          // normally — leave the loop. Otherwise clear the event and loop
          // around to establish a fresh session.
          if (!waitReconnectTask.done) break;
          this.reconnectEvent.clear();
        } finally {
          // Always cancel all three tasks (bounded wait) before closing the
          // socket, whether we exit, reconnect, or an error propagates.
          await cancelAndWait([sendTask, recvTask, waitReconnectTask], DEFAULT_CANCEL_TIMEOUT);
        }
      } finally {
        try {
          if (ws) ws.close();
        } catch {}
      }
    }
  }

  /**
   * Converts one transcript message into SpeechEvents: a START_OF_SPEECH on
   * the first text, INTERIM/FINAL transcript events, and — on finals — a
   * RECOGNITION_USAGE event plus END_OF_SPEECH.
   */
  private processTranscript(data: Record<string, any>, isFinal: boolean) {
    const requestId = data.request_id ?? this.requestId;
    const text = data.transcript ?? '';
    const language = data.language ?? this.opts.language ?? 'en';

    // Empty interim results carry no information; empty finals still matter
    // (they close out the speech segment below).
    if (!text && !isFinal) return;

    // We'll have a more accurate way of detecting when speech started when we have VAD
    if (!this.speaking) {
      this.speaking = true;
      this.queue.put({ type: SpeechEventType.START_OF_SPEECH });
    }

    const speechData: SpeechData = {
      language,
      startTime: data.start ?? 0,
      // NOTE(review): endTime is filled from data.duration — presumably the
      // server reports a duration rather than an absolute end time; confirm
      // against the gateway protocol.
      endTime: data.duration ?? 0,
      confidence: data.confidence ?? 1.0,
      text,
    };

    if (isFinal) {
      if (this.speechDuration > 0) {
        // Report audio consumed since the last final before emitting it.
        this.queue.put({
          type: SpeechEventType.RECOGNITION_USAGE,
          requestId,
          recognitionUsage: { audioDuration: this.speechDuration },
        });
        this.speechDuration = 0;
      }

      this.queue.put({
        type: SpeechEventType.FINAL_TRANSCRIPT,
        requestId,
        alternatives: [speechData],
      });

      if (this.speaking) {
        this.speaking = false;
        this.queue.put({ type: SpeechEventType.END_OF_SPEECH });
      }
    } else {
      this.queue.put({
        type: SpeechEventType.INTERIM_TRANSCRIPT,
        requestId,
        alternatives: [speechData],
      });
    }
  }
}