@livekit/agents 0.2.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. package/.turbo/turbo-build.log +1 -1
  2. package/CHANGELOG.md +40 -0
  3. package/dist/audio.d.ts +1 -4
  4. package/dist/audio.d.ts.map +1 -1
  5. package/dist/audio.js +30 -12
  6. package/dist/audio.js.map +1 -1
  7. package/dist/cli.d.ts +1 -1
  8. package/dist/cli.d.ts.map +1 -1
  9. package/dist/cli.js +41 -17
  10. package/dist/cli.js.map +1 -1
  11. package/dist/generator.d.ts +5 -0
  12. package/dist/generator.d.ts.map +1 -1
  13. package/dist/generator.js +11 -0
  14. package/dist/generator.js.map +1 -1
  15. package/dist/http_server.d.ts +1 -0
  16. package/dist/http_server.d.ts.map +1 -1
  17. package/dist/http_server.js +13 -0
  18. package/dist/http_server.js.map +1 -1
  19. package/dist/index.d.ts +3 -1
  20. package/dist/index.d.ts.map +1 -1
  21. package/dist/index.js +3 -1
  22. package/dist/index.js.map +1 -1
  23. package/dist/ipc/job_main.js +9 -1
  24. package/dist/ipc/job_main.js.map +1 -1
  25. package/dist/ipc/proc_pool.d.ts.map +1 -1
  26. package/dist/ipc/proc_pool.js +1 -0
  27. package/dist/ipc/proc_pool.js.map +1 -1
  28. package/dist/job.d.ts +1 -0
  29. package/dist/job.d.ts.map +1 -1
  30. package/dist/job.js +30 -1
  31. package/dist/job.js.map +1 -1
  32. package/dist/multimodal/agent_playout.d.ts +34 -0
  33. package/dist/multimodal/agent_playout.d.ts.map +1 -0
  34. package/dist/multimodal/agent_playout.js +221 -0
  35. package/dist/multimodal/agent_playout.js.map +1 -0
  36. package/dist/multimodal/index.d.ts +3 -0
  37. package/dist/multimodal/index.d.ts.map +1 -0
  38. package/dist/multimodal/index.js +6 -0
  39. package/dist/multimodal/index.js.map +1 -0
  40. package/dist/multimodal/multimodal_agent.d.ts +47 -0
  41. package/dist/multimodal/multimodal_agent.d.ts.map +1 -0
  42. package/dist/multimodal/multimodal_agent.js +329 -0
  43. package/dist/multimodal/multimodal_agent.js.map +1 -0
  44. package/dist/transcription.d.ts +22 -0
  45. package/dist/transcription.d.ts.map +1 -0
  46. package/dist/transcription.js +112 -0
  47. package/dist/transcription.js.map +1 -0
  48. package/dist/utils.d.ts +29 -1
  49. package/dist/utils.d.ts.map +1 -1
  50. package/dist/utils.js +117 -15
  51. package/dist/utils.js.map +1 -1
  52. package/dist/worker.d.ts +3 -1
  53. package/dist/worker.d.ts.map +1 -1
  54. package/dist/worker.js +49 -9
  55. package/dist/worker.js.map +1 -1
  56. package/package.json +6 -4
  57. package/src/audio.ts +21 -20
  58. package/src/cli.ts +42 -17
  59. package/src/generator.ts +14 -0
  60. package/src/http_server.ts +6 -0
  61. package/src/index.ts +3 -1
  62. package/src/ipc/job_main.ts +9 -2
  63. package/src/ipc/proc_pool.ts +1 -0
  64. package/src/job.ts +37 -1
  65. package/src/multimodal/agent_playout.ts +254 -0
  66. package/src/multimodal/index.ts +5 -0
  67. package/src/multimodal/multimodal_agent.ts +426 -0
  68. package/src/transcription.ts +129 -0
  69. package/src/utils.ts +151 -12
  70. package/src/worker.ts +60 -14
  71. package/tsconfig.json +1 -1
@@ -0,0 +1,426 @@
1
+ // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
2
+ //
3
+ // SPDX-License-Identifier: Apache-2.0
4
+ import type {
5
+ LocalTrackPublication,
6
+ RemoteAudioTrack,
7
+ RemoteParticipant,
8
+ Room,
9
+ } from '@livekit/rtc-node';
10
+ import {
11
+ AudioSource,
12
+ AudioStream,
13
+ LocalAudioTrack,
14
+ RoomEvent,
15
+ TrackPublishOptions,
16
+ TrackSource,
17
+ } from '@livekit/rtc-node';
18
+ import { EventEmitter } from 'events';
19
+ import { AudioByteStream } from '../audio.js';
20
+ import type * as llm from '../llm/index.js';
21
+ import { log } from '../log.js';
22
+ import { BasicTranscriptionForwarder } from '../transcription.js';
23
+ import { findMicroTrackId } from '../utils.js';
24
+ import { AgentPlayout, type PlayoutHandle } from './agent_playout.js';
25
+
26
/**
 * Provider-agnostic surface of a realtime model session.
 *
 * Concrete implementations (e.g. an OpenAI realtime session) extend this and
 * emit the events `MultimodalAgent` listens for ('response_content_added',
 * 'input_speech_committed', 'input_speech_started', 'function_call_*', …).
 *
 * @internal
 * @beta
 */
export abstract class RealtimeSession extends EventEmitter {
  // Conversation handle used by MultimodalAgent to truncate interrupted items.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  abstract defaultConversation: any; // openai.realtime.Conversation
  // Sink for microphone audio frames (appended from the mic-read task).
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  abstract inputAudioBuffer: any; // openai.realtime.InputAudioBuffer
  // Function-calling context; MultimodalAgent keeps this in sync via its setter.
  abstract fncCtx: llm.FunctionContext | undefined;
}
37
+
38
/**
 * Factory and audio-format descriptor for a realtime model backend.
 *
 * The audio fields are used by `MultimodalAgent` to size the local
 * `AudioSource`, the `AgentPlayout`, and the input `AudioByteStream`.
 *
 * @internal
 * @beta
 */
export abstract class RealtimeModel {
  /** Opens a new session against the backend. */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  abstract session(options: any): RealtimeSession; // openai.realtime.ModelOptions
  /** Shuts down the model and releases its resources. */
  abstract close(): Promise<void>;
  /** Sample rate (Hz) of audio exchanged with the model. */
  abstract sampleRate: number;
  /** Channel count of audio exchanged with the model. */
  abstract numChannels: number;
  /** Samples per frame expected by the model on input. */
  abstract inFrameSize: number;
  /** Samples per frame produced by the model on output. */
  abstract outFrameSize: number;
}
51
+
52
/** Lifecycle states the agent advertises to the room (derived in `MultimodalAgent`). */
export type AgentState = 'initializing' | 'thinking' | 'listening' | 'speaking';
/** Participant-attribute key under which the current `AgentState` is published. */
export const AGENT_STATE_ATTRIBUTE = 'lk.agent.state';
54
+
55
/**
 * Bridges a LiveKit room and a realtime speech model: publishes the model's
 * audio as an "assistant_voice" track, streams a linked participant's
 * microphone into the model, and mirrors transcriptions and agent state
 * (`lk.agent.state` attribute) into the room.
 *
 * @beta
 */
export class MultimodalAgent {
  model: RealtimeModel;
  room: Room | null = null;
  // Remote participant whose microphone feeds the model.
  linkedParticipant: RemoteParticipant | null = null;
  subscribedTrack: RemoteAudioTrack | null = null;
  // Cancellable background task that pumps mic frames into the session.
  readMicroTask: { promise: Promise<void>; cancel: () => void } | null = null;

  constructor({
    model,
    fncCtx,
  }: {
    model: RealtimeModel;
    fncCtx?: llm.FunctionContext | undefined;
  }) {
    this.model = model;
    this.#fncCtx = fncCtx;
  }

  #participant: RemoteParticipant | string | null = null;
  #agentPublication: LocalTrackPublication | null = null;
  #localTrackSid: string | null = null;
  #localSource: AudioSource | null = null;
  #agentPlayout: AgentPlayout | null = null;
  // Handle of the response currently being played out, if any.
  #playingHandle: PlayoutHandle | undefined = undefined;
  #logger = log();
  #session: RealtimeSession | null = null;
  #fncCtx: llm.FunctionContext | undefined = undefined;

  // Backing fields for the private getters/setters below; the setters
  // re-derive the published agent state on every change.
  #_started: boolean = false;
  #_pendingFunctionCalls: Set<string> = new Set();
  #_speaking: boolean = false;

  get fncCtx(): llm.FunctionContext | undefined {
    return this.#fncCtx;
  }

  // Propagates a new function context to the live session, if one exists.
  set fncCtx(ctx: llm.FunctionContext | undefined) {
    this.#fncCtx = ctx;
    if (this.#session) {
      this.#session.fncCtx = ctx;
    }
  }

  // NOTE: the getter returns the live Set, so add()/delete() on it do NOT go
  // through this setter — callers invoke #updateState() explicitly after
  // mutating it (see the function_call_* handlers below).
  get #pendingFunctionCalls(): Set<string> {
    return this.#_pendingFunctionCalls;
  }

  set #pendingFunctionCalls(calls: Set<string>) {
    this.#_pendingFunctionCalls = calls;
    this.#updateState();
  }

  get #speaking(): boolean {
    return this.#_speaking;
  }

  set #speaking(isSpeaking: boolean) {
    this.#_speaking = isSpeaking;
    this.#updateState();
  }

  get #started(): boolean {
    return this.#_started;
  }

  set #started(started: boolean) {
    this.#_started = started;
    this.#updateState();
  }

  /**
   * Connects the agent to `room`: publishes the assistant audio track, links
   * a participant (the one given, or the first remote participant found),
   * opens a model session, and wires up all session event handlers.
   *
   * Resolves with the opened `RealtimeSession`; rejects if already started or
   * if the audio track cannot be published.
   */
  start(
    room: Room,
    participant: RemoteParticipant | string | null = null,
  ): Promise<RealtimeSession> {
    // NOTE(review): async Promise-executor anti-pattern — an exception thrown
    // inside this async callback is swallowed rather than rejecting the
    // returned promise; consider refactoring to a plain async method.
    return new Promise(async (resolve, reject) => {
      if (this.#started) {
        // NOTE(review): missing `return` after reject — execution continues
        // and re-runs the full startup sequence; verify and add early return.
        reject(new Error('MultimodalAgent already started'));
      }
      this.#updateState();

      room.on(RoomEvent.ParticipantConnected, (participant: RemoteParticipant) => {
        // NOTE(review): this guard looks inverted — as written, a newly
        // connected participant is only (re-)linked when one is ALREADY
        // linked, and never when no participant is linked yet. Confirm intent.
        if (!this.linkedParticipant) {
          return;
        }

        this.#linkParticipant(participant.identity);
      });

      this.room = room;
      this.#participant = participant;

      // Local audio pipeline sized to the model's output format.
      this.#localSource = new AudioSource(this.model.sampleRate, this.model.numChannels);
      this.#agentPlayout = new AgentPlayout(
        this.#localSource,
        this.model.sampleRate,
        this.model.numChannels,
        this.model.inFrameSize,
        this.model.outFrameSize,
      );
      const track = LocalAudioTrack.createAudioTrack('assistant_voice', this.#localSource);
      const options = new TrackPublishOptions();
      options.source = TrackSource.SOURCE_MICROPHONE;
      this.#agentPublication = (await room.localParticipant?.publishTrack(track, options)) || null;
      if (!this.#agentPublication) {
        this.#logger.error('Failed to publish track');
        reject(new Error('Failed to publish track'));
        return;
      }

      // Wait until at least one subscriber receives the agent's audio.
      await this.#agentPublication.waitForSubscription();

      if (participant) {
        if (typeof participant === 'string') {
          this.#linkParticipant(participant);
        } else {
          this.#linkParticipant(participant.identity);
        }
      } else {
        // No participant specified, try to find the first participant in the room
        for (const participant of room.remoteParticipants.values()) {
          this.#linkParticipant(participant.identity);
          break;
        }
      }

      this.#session = this.model.session({ fncCtx: this.#fncCtx });
      this.#started = true;

      // New response content: forward its transcript and play out its audio.
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      this.#session.on('response_content_added', (message: any) => {
        // openai.realtime.RealtimeContent
        const trFwd = new BasicTranscriptionForwarder(
          this.room!,
          this.room!.localParticipant!.identity,
          this.#getLocalTrackSid()!,
          message.responseId,
        );

        const handle = this.#agentPlayout?.play(
          message.itemId,
          message.contentIndex,
          trFwd,
          message.textStream,
          message.audioStream,
        );
        if (handle) {
          this.#speaking = true;
          handle.on('done', () => {
            // Only clear 'speaking' if no newer playout superseded this one.
            if (this.#playingHandle == handle) {
              this.#speaking = false;
            }
          });
        }
        this.#playingHandle = handle;
      });

      // User speech committed: publish an interim "…" placeholder segment.
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      this.#session.on('input_speech_committed', (ev: any) => {
        // openai.realtime.InputSpeechCommittedEvent
        const participantIdentity = this.linkedParticipant?.identity;
        const trackSid = this.subscribedTrack?.sid;
        if (participantIdentity && trackSid) {
          this.#publishTranscription(participantIdentity, trackSid, '…', false, ev.itemId);
        } else {
          this.#logger.error('Participant or track not set');
        }
      });

      // Final user transcript available: publish it as a final segment.
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      this.#session.on('input_speech_transcription_completed', (ev: any) => {
        // openai.realtime.InputSpeechTranscriptionCompletedEvent
        const transcription = ev.transcript;
        const participantIdentity = this.linkedParticipant?.identity;
        const trackSid = this.subscribedTrack?.sid;
        if (participantIdentity && trackSid) {
          this.#publishTranscription(participantIdentity, trackSid, transcription, true, ev.itemId);
        } else {
          this.#logger.error('Participant or track not set');
        }
      });

      // Barge-in: interrupt current playout and truncate the model item to
      // the audio actually heard so far.
      this.#session.on('input_speech_started', (ev: any) => {
        if (this.#playingHandle && !this.#playingHandle.done) {
          this.#playingHandle.interrupt();

          this.#session!.defaultConversation.item.truncate(
            this.#playingHandle.itemId,
            this.#playingHandle.contentIndex,
            // NOTE(review): hard-codes a 24 kHz sample rate when converting
            // samples to ms — should this use this.model.sampleRate?
            Math.floor((this.#playingHandle.audioSamples / 24000) * 1000),
          );

          this.#playingHandle = undefined;
        }

        const participantIdentity = this.linkedParticipant?.identity;
        const trackSid = this.subscribedTrack?.sid;
        if (participantIdentity && trackSid) {
          this.#publishTranscription(participantIdentity, trackSid, '…', false, ev.itemId);
        }
      });

      // Function-call lifecycle drives the 'thinking' state via #updateState.
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      this.#session.on('function_call_started', (ev: any) => {
        this.#pendingFunctionCalls.add(ev.callId);
        this.#updateState();
      });

      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      this.#session.on('function_call_completed', (ev: any) => {
        this.#pendingFunctionCalls.delete(ev.callId);
        this.#updateState();
      });

      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      this.#session.on('function_call_failed', (ev: any) => {
        this.#pendingFunctionCalls.delete(ev.callId);
        this.#updateState();
      });

      resolve(this.#session);
    });
  }

  // TODO
  // close() {
  //   if (!this.connected || !this.ws) return;
  //   this.logger.debug('stopping assistant');
  //   this.ws.close();
  // }

  // Resolves the identity to a RemoteParticipant and subscribes to its mic,
  // deferring via TrackPublished when it has no publications yet.
  #linkParticipant(participantIdentity: string): void {
    if (!this.room) {
      this.#logger.error('Room is not set');
      return;
    }

    this.linkedParticipant = this.room.remoteParticipants.get(participantIdentity) || null;
    if (!this.linkedParticipant) {
      this.#logger.error(`Participant with identity ${participantIdentity} not found`);
      return;
    }

    if (this.linkedParticipant.trackPublications.size > 0) {
      this.#subscribeToMicrophone();
    } else {
      this.room.on(RoomEvent.TrackPublished, () => {
        this.#subscribeToMicrophone();
      });
    }
  }

  // Finds the linked participant's microphone publication, subscribes to it,
  // and (re)starts a cancellable task that pumps its frames into the session.
  #subscribeToMicrophone(): void {
    const readAudioStreamTask = async (audioStream: AudioStream) => {
      // Re-chunk incoming frames into the model's expected input frame size.
      const bstream = new AudioByteStream(
        this.model.sampleRate,
        this.model.numChannels,
        this.model.inFrameSize,
      );

      for await (const frame of audioStream) {
        const audioData = frame.data;
        for (const frame of bstream.write(audioData.buffer)) {
          this.#session!.inputAudioBuffer.append(frame);
        }
      }
    };

    if (!this.linkedParticipant) {
      this.#logger.error('Participant is not set');
      return;
    }

    for (const publication of this.linkedParticipant.trackPublications.values()) {
      if (publication.source !== TrackSource.SOURCE_MICROPHONE) {
        continue;
      }

      if (!publication.subscribed) {
        publication.setSubscribed(true);
      }

      const track = publication.track;

      if (track && track !== this.subscribedTrack) {
        this.subscribedTrack = track;

        // Cancel any reader attached to a previously subscribed track.
        if (this.readMicroTask) {
          this.readMicroTask.cancel();
        }

        let cancel: () => void;
        this.readMicroTask = {
          promise: new Promise<void>((resolve, reject) => {
            cancel = () => {
              reject(new Error('Task cancelled'));
            };
            readAudioStreamTask(
              new AudioStream(track, this.model.sampleRate, this.model.numChannels),
            )
              .then(resolve)
              .catch(reject);
          }),
          cancel: () => cancel(),
        };
      }
    }
  }

  // Lazily resolves and caches the SID of the agent's published mic track.
  #getLocalTrackSid(): string | null {
    if (!this.#localTrackSid && this.room && this.room.localParticipant) {
      this.#localTrackSid = findMicroTrackId(this.room, this.room.localParticipant?.identity);
    }
    return this.#localTrackSid;
  }

  // Publishes a single transcription segment (zeroed timestamps) to the room.
  #publishTranscription(
    participantIdentity: string,
    trackSid: string,
    text: string,
    isFinal: boolean,
    id: string,
  ): void {
    this.#logger.debug(
      `Publishing transcription ${participantIdentity} ${trackSid} ${text} ${isFinal} ${id}`,
    );
    if (!this.room?.localParticipant) {
      this.#logger.error('Room or local participant not set');
      return;
    }

    this.room.localParticipant.publishTranscription({
      participantIdentity,
      trackSid,
      segments: [
        {
          text,
          final: isFinal,
          id,
          startTime: BigInt(0),
          endTime: BigInt(0),
          language: '',
        },
      ],
    });
  }

  // Derives the agent state with precedence: thinking > speaking > listening,
  // falling back to 'initializing' before start() completes.
  #updateState() {
    let newState: AgentState = 'initializing';
    if (this.#pendingFunctionCalls.size > 0) {
      newState = 'thinking';
    } else if (this.#speaking) {
      newState = 'speaking';
    } else if (this.#started) {
      newState = 'listening';
    }

    this.#setState(newState);
  }

  // Writes the state into the local participant's attributes, skipping
  // no-op updates; only possible while connected to a room.
  #setState(state: AgentState) {
    if (this.room?.isConnected && this.room.localParticipant) {
      const currentState = this.room.localParticipant.attributes[AGENT_STATE_ATTRIBUTE];
      if (currentState !== state) {
        this.room.localParticipant.setAttributes({
          [AGENT_STATE_ATTRIBUTE]: state,
        });
        this.#logger.debug(`${AGENT_STATE_ATTRIBUTE}: ${currentState} ->${state}`);
      }
    }
  }
}
@@ -0,0 +1,129 @@
1
+ // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
2
+ //
3
+ // SPDX-License-Identifier: Apache-2.0
4
+ import type { AudioFrame, Room } from '@livekit/rtc-node';
5
+ import { log } from './log.js';
6
+
7
/**
 * Forwards a transcription for an audio track to interested consumers,
 * pacing the text against the audio that has been pushed.
 */
export interface TranscriptionForwarder {
  /** Begins forwarding; implementations may run a background loop. */
  start(): void;
  /** Feeds an audio frame so the forwarder can track elapsed audio duration. */
  pushAudio(frame: AudioFrame): void;
  /** Appends a chunk of transcript text. */
  pushText(text: string): void;
  /** Signals that no more text will be pushed. */
  markTextComplete(): void;
  /** Signals that no more audio will be pushed. */
  markAudioComplete(): void;
  /**
   * Stops forwarding and flushes a final segment.
   * @param interrupt - true when playback was cut off before completion.
   */
  close(interrupt: boolean): Promise<void>;
  /** Index into the full text up to which the transcript has been revealed. */
  currentCharacterIndex: number;
}
16
+
17
+ export class BasicTranscriptionForwarder implements TranscriptionForwarder {
18
+ #room: Room;
19
+ #participantIdentity: string;
20
+ #trackSid: string;
21
+ #currentText: string = '';
22
+ #totalAudioDuration: number = 0;
23
+ #currentPlayoutTime: number = 0;
24
+ #DEFAULT_CHARS_PER_SECOND = 16;
25
+ #charsPerSecond: number = this.#DEFAULT_CHARS_PER_SECOND;
26
+ #messageId: string;
27
+ #isRunning: boolean = false;
28
+ #logger = log();
29
+ currentCharacterIndex: number = 0;
30
+
31
+ constructor(room: Room, participantIdentity: string, trackSid: string, messageId: string) {
32
+ this.#room = room;
33
+ this.#participantIdentity = participantIdentity;
34
+ this.#trackSid = trackSid;
35
+ this.#messageId = messageId;
36
+ }
37
+
38
+ start(): void {
39
+ if (!this.#isRunning) {
40
+ this.#isRunning = true;
41
+ this.#startPublishingLoop().catch((error) => {
42
+ this.#logger.error('Error in publishing loop:', error);
43
+ this.#isRunning = false;
44
+ });
45
+ }
46
+ }
47
+
48
+ pushAudio(frame: AudioFrame): void {
49
+ this.#totalAudioDuration += frame.samplesPerChannel / frame.sampleRate;
50
+ }
51
+
52
+ pushText(text: string): void {
53
+ this.#currentText += text;
54
+ }
55
+
56
+ #textIsComplete: boolean = false;
57
+ #audioIsComplete: boolean = false;
58
+
59
+ markTextComplete(): void {
60
+ this.#textIsComplete = true;
61
+ this.#adjustTimingIfBothFinished();
62
+ }
63
+
64
+ markAudioComplete(): void {
65
+ this.#audioIsComplete = true;
66
+ this.#adjustTimingIfBothFinished();
67
+ }
68
+
69
+ #adjustTimingIfBothFinished(): void {
70
+ if (this.#textIsComplete && this.#audioIsComplete) {
71
+ const actualDuration = this.#totalAudioDuration;
72
+ if (actualDuration > 0 && this.#currentText.length > 0) {
73
+ this.#charsPerSecond = this.#currentText.length / actualDuration;
74
+ }
75
+ }
76
+ }
77
+
78
+ #computeSleepInterval(): number {
79
+ return Math.min(Math.max(1 / this.#charsPerSecond, 0.0625), 0.5);
80
+ }
81
+
82
+ async #startPublishingLoop(): Promise<void> {
83
+ this.#isRunning = true;
84
+ let sleepInterval = this.#computeSleepInterval();
85
+ let isComplete = false;
86
+ while (this.#isRunning && !isComplete) {
87
+ this.#currentPlayoutTime += sleepInterval;
88
+ this.currentCharacterIndex = Math.floor(this.#currentPlayoutTime * this.#charsPerSecond);
89
+ isComplete = this.#textIsComplete && this.currentCharacterIndex >= this.#currentText.length;
90
+ await this.#publishTranscription(false);
91
+ if (this.#isRunning && !isComplete) {
92
+ sleepInterval = this.#computeSleepInterval();
93
+ await new Promise((resolve) => setTimeout(resolve, sleepInterval * 1000));
94
+ }
95
+ }
96
+
97
+ if (this.#isRunning) {
98
+ this.close(false);
99
+ }
100
+ }
101
+
102
+ async #publishTranscription(final: boolean): Promise<void> {
103
+ const textToPublish = this.#currentText.slice(0, this.currentCharacterIndex);
104
+ await this.#room.localParticipant?.publishTranscription({
105
+ participantIdentity: this.#participantIdentity,
106
+ trackSid: this.#trackSid,
107
+ segments: [
108
+ {
109
+ text: textToPublish,
110
+ final: final,
111
+ id: this.#messageId,
112
+ startTime: BigInt(0),
113
+ endTime: BigInt(0),
114
+ language: '',
115
+ },
116
+ ],
117
+ });
118
+ }
119
+
120
+ async close(interrupt: boolean): Promise<void> {
121
+ this.#isRunning = false;
122
+
123
+ // Publish whatever we had as final
124
+ if (!interrupt) {
125
+ this.currentCharacterIndex = this.#currentText.length;
126
+ }
127
+ await this.#publishTranscription(true);
128
+ }
129
+ }