@livekit/agents-plugin-openai 0.2.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. package/.turbo/turbo-build.log +1 -1
  2. package/CHANGELOG.md +28 -0
  3. package/dist/index.d.ts +1 -1
  4. package/dist/index.d.ts.map +1 -1
  5. package/dist/index.js +1 -1
  6. package/dist/index.js.map +1 -1
  7. package/dist/realtime/api_proto.d.ts +399 -0
  8. package/dist/realtime/api_proto.d.ts.map +1 -0
  9. package/dist/realtime/api_proto.js +9 -0
  10. package/dist/realtime/api_proto.js.map +1 -0
  11. package/dist/realtime/index.d.ts +3 -0
  12. package/dist/realtime/index.d.ts.map +1 -0
  13. package/dist/realtime/index.js +6 -0
  14. package/dist/realtime/index.js.map +1 -0
  15. package/dist/realtime/realtime_model.d.ts +149 -0
  16. package/dist/realtime/realtime_model.d.ts.map +1 -0
  17. package/dist/realtime/realtime_model.js +571 -0
  18. package/dist/realtime/realtime_model.js.map +1 -0
  19. package/package.json +5 -3
  20. package/src/index.ts +1 -2
  21. package/src/realtime/api_proto.ts +565 -0
  22. package/src/realtime/index.ts +5 -0
  23. package/src/realtime/realtime_model.ts +859 -0
  24. package/dist/omni_assistant/agent_playout.d.ts +0 -27
  25. package/dist/omni_assistant/agent_playout.d.ts.map +0 -1
  26. package/dist/omni_assistant/agent_playout.js +0 -111
  27. package/dist/omni_assistant/agent_playout.js.map +0 -1
  28. package/dist/omni_assistant/index.d.ts +0 -61
  29. package/dist/omni_assistant/index.d.ts.map +0 -1
  30. package/dist/omni_assistant/index.js +0 -453
  31. package/dist/omni_assistant/index.js.map +0 -1
  32. package/dist/omni_assistant/proto.d.ts +0 -218
  33. package/dist/omni_assistant/proto.d.ts.map +0 -1
  34. package/dist/omni_assistant/proto.js +0 -68
  35. package/dist/omni_assistant/proto.js.map +0 -1
  36. package/dist/omni_assistant/transcription_forwarder.d.ts +0 -28
  37. package/dist/omni_assistant/transcription_forwarder.d.ts.map +0 -1
  38. package/dist/omni_assistant/transcription_forwarder.js +0 -117
  39. package/dist/omni_assistant/transcription_forwarder.js.map +0 -1
  40. package/src/omni_assistant/agent_playout.ts +0 -127
  41. package/src/omni_assistant/index.ts +0 -547
  42. package/src/omni_assistant/proto.ts +0 -280
  43. package/src/omni_assistant/transcription_forwarder.ts +0 -128
@@ -1,547 +0,0 @@
1
- // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
2
- //
3
- // SPDX-License-Identifier: Apache-2.0
4
- import { AudioByteStream } from '@livekit/agents';
5
- import { findMicroTrackId } from '@livekit/agents';
6
- import { llm, log } from '@livekit/agents';
7
- import type {
8
- AudioFrameEvent,
9
- LocalTrackPublication,
10
- RemoteAudioTrack,
11
- RemoteParticipant,
12
- Room,
13
- } from '@livekit/rtc-node';
14
- import {
15
- AudioSource,
16
- AudioStream,
17
- AudioStreamEvent,
18
- LocalAudioTrack,
19
- RoomEvent,
20
- TrackPublishOptions,
21
- TrackSource,
22
- } from '@livekit/rtc-node';
23
- import { WebSocket } from 'ws';
24
- import { AgentPlayout, type PlayoutHandle } from './agent_playout.js';
25
- import * as proto from './proto.js';
26
- import { BasicTranscriptionForwarder } from './transcription_forwarder.js';
27
-
28
- /** @hidden */
29
- export const defaultSessionConfig: proto.SessionConfig = {
30
- turn_detection: 'server_vad',
31
- input_audio_format: proto.AudioFormat.PCM16,
32
- transcribe_input: true,
33
- vad: {
34
- threshold: 0.5,
35
- prefix_padding_ms: 300,
36
- silence_duration_ms: 200,
37
- },
38
- };
39
-
40
- /** @hidden */
41
- export const defaultConversationConfig: proto.ConversationConfig = {
42
- system_message: 'You are a helpful assistant.',
43
- voice: proto.Voice.ALLOY,
44
- subscribe_to_user_audio: true,
45
- output_audio_format: proto.AudioFormat.PCM16,
46
- tools: [],
47
- tool_choice: proto.ToolChoice.AUTO,
48
- temperature: 0.8,
49
- max_tokens: 2048,
50
- disable_audio: false,
51
- transcribe_input: true,
52
- };
53
-
54
- type ImplOptions = {
55
- apiKey: string;
56
- sessionConfig: proto.SessionConfig;
57
- conversationConfig: proto.ConversationConfig;
58
- functions: llm.FunctionContext;
59
- };
60
-
61
- /** @alpha */
62
- export class OmniAssistant {
63
- options: ImplOptions;
64
- room: Room | null = null;
65
- linkedParticipant: RemoteParticipant | null = null;
66
- subscribedTrack: RemoteAudioTrack | null = null;
67
- readMicroTask: { promise: Promise<void>; cancel: () => void } | null = null;
68
-
69
- constructor({
70
- sessionConfig = defaultSessionConfig,
71
- conversationConfig = defaultConversationConfig,
72
- functions = {},
73
- apiKey = process.env.OPENAI_API_KEY || '',
74
- }: {
75
- sessionConfig?: proto.SessionConfig;
76
- conversationConfig?: proto.ConversationConfig;
77
- functions?: llm.FunctionContext;
78
- apiKey?: string;
79
- }) {
80
- if (!apiKey) {
81
- throw new Error('OpenAI API key is required, whether as an argument or as $OPENAI_API_KEY');
82
- }
83
-
84
- conversationConfig.tools = tools(functions);
85
- this.options = {
86
- apiKey,
87
- sessionConfig,
88
- conversationConfig,
89
- functions,
90
- };
91
- }
92
-
93
- private ws: WebSocket | null = null;
94
- private connected: boolean = false;
95
- private thinking: boolean = false;
96
- private participant: RemoteParticipant | string | null = null;
97
- private agentPublication: LocalTrackPublication | null = null;
98
- private localTrackSid: string | null = null;
99
- private localSource: AudioSource | null = null;
100
- private agentPlayout: AgentPlayout | null = null;
101
- private playingHandle: PlayoutHandle | null = null;
102
- private logger = log();
103
-
104
- get funcCtx(): llm.FunctionContext {
105
- return this.options.functions;
106
- }
107
- set funcCtx(ctx: llm.FunctionContext) {
108
- this.options.functions = ctx;
109
- this.options.conversationConfig.tools = tools(ctx);
110
- this.sendClientCommand({
111
- event: proto.ClientEventType.UPDATE_CONVERSATION_CONFIG,
112
- ...this.options.conversationConfig,
113
- });
114
- }
115
-
116
- start(room: Room, participant: RemoteParticipant | string | null = null): Promise<void> {
117
- return new Promise(async (resolve, reject) => {
118
- if (this.ws !== null) {
119
- this.logger.warn('VoiceAssistant already started');
120
- resolve();
121
- return;
122
- }
123
-
124
- room.on(RoomEvent.ParticipantConnected, (participant: RemoteParticipant) => {
125
- if (!this.linkedParticipant) {
126
- return;
127
- }
128
-
129
- this.linkParticipant(participant.identity);
130
- });
131
- this.room = room;
132
- this.participant = participant;
133
- this.setState(proto.State.INITIALIZING);
134
-
135
- this.localSource = new AudioSource(proto.SAMPLE_RATE, proto.NUM_CHANNELS);
136
- this.agentPlayout = new AgentPlayout(this.localSource);
137
- const track = LocalAudioTrack.createAudioTrack('assistant_voice', this.localSource);
138
- const options = new TrackPublishOptions();
139
- options.source = TrackSource.SOURCE_MICROPHONE;
140
- this.agentPublication = (await room.localParticipant?.publishTrack(track, options)) || null;
141
- if (!this.agentPublication) {
142
- this.logger.error('Failed to publish track');
143
- reject(new Error('Failed to publish track'));
144
- return;
145
- }
146
-
147
- await this.agentPublication.waitForSubscription();
148
-
149
- if (participant) {
150
- if (typeof participant === 'string') {
151
- this.linkParticipant(participant);
152
- } else {
153
- this.linkParticipant(participant.identity);
154
- }
155
- } else {
156
- // No participant specified, try to find the first participant in the room
157
- for (const participant of room.remoteParticipants.values()) {
158
- this.linkParticipant(participant.identity);
159
- break;
160
- }
161
- }
162
-
163
- this.ws = new WebSocket(proto.API_URL, {
164
- headers: {
165
- Authorization: `Bearer ${this.options.apiKey}`,
166
- },
167
- });
168
-
169
- this.ws.onopen = () => {
170
- this.connected = true;
171
- this.sendClientCommand({
172
- event: proto.ClientEventType.UPDATE_SESSION_CONFIG,
173
- ...this.options.sessionConfig,
174
- });
175
- this.sendClientCommand({
176
- event: proto.ClientEventType.UPDATE_CONVERSATION_CONFIG,
177
- ...this.options.conversationConfig,
178
- });
179
- resolve();
180
- };
181
-
182
- this.ws.onerror = (error) => {
183
- reject(error);
184
- };
185
-
186
- this.ws.onclose = () => {
187
- this.connected = false;
188
- this.ws = null;
189
- };
190
-
191
- this.ws.onmessage = (message) => {
192
- this.handleServerEvent(JSON.parse(message.data as string));
193
- };
194
- });
195
- }
196
-
197
- // user-initiated close
198
- close() {
199
- if (!this.connected || !this.ws) return;
200
- this.logger.debug('stopping assistant');
201
- this.ws.close();
202
- }
203
-
204
- addUserMessage(text: string, generate: boolean = true): void {
205
- this.sendClientCommand({
206
- event: proto.ClientEventType.ADD_MESSAGE,
207
- message: {
208
- role: 'user',
209
- content: [
210
- {
211
- type: 'text',
212
- text: text,
213
- },
214
- ],
215
- },
216
- });
217
- if (generate) {
218
- this.sendClientCommand({
219
- event: proto.ClientEventType.GENERATE,
220
- });
221
- }
222
- }
223
-
224
- private setState(state: proto.State) {
225
- // don't override thinking until done
226
- if (this.thinking) return;
227
- if (this.room?.isConnected && this.room.localParticipant) {
228
- const currentState = this.room.localParticipant.attributes['voice_assistant.state'];
229
- if (currentState !== state) {
230
- this.room.localParticipant!.setAttributes({
231
- 'voice_assistant.state': state,
232
- });
233
- this.logger.debug(`voice_assistant.state updated from ${currentState} to ${state}`);
234
- }
235
- }
236
- }
237
-
238
- /// Truncates the data field of the event to the specified maxLength to avoid overwhelming logs
239
- /// with large amounts of base64 audio data.
240
- private loggableEvent(
241
- event: proto.ClientEvent | proto.ServerEvent,
242
- maxLength: number = 30,
243
- ): Record<string, unknown> {
244
- const untypedEvent: Record<string, unknown> = {};
245
- for (const [key, value] of Object.entries(event)) {
246
- if (value !== undefined) {
247
- untypedEvent[key] = value;
248
- }
249
- }
250
-
251
- if (untypedEvent.data && typeof untypedEvent.data === 'string') {
252
- const truncatedData =
253
- untypedEvent.data.slice(0, maxLength) + (untypedEvent.data.length > maxLength ? '…' : '');
254
- return { ...untypedEvent, data: truncatedData };
255
- }
256
- return untypedEvent;
257
- }
258
-
259
- private sendClientCommand(command: proto.ClientEvent): void {
260
- const isAudio = command.event === proto.ClientEventType.ADD_USER_AUDIO;
261
-
262
- if (!this.connected || !this.ws) {
263
- if (!isAudio) this.logger.error('WebSocket is not connected');
264
- return;
265
- }
266
-
267
- if (!isAudio) {
268
- this.logger.debug(`-> ${JSON.stringify(this.loggableEvent(command))}`);
269
- }
270
- this.ws.send(JSON.stringify(command));
271
- }
272
-
273
- private handleServerEvent(event: proto.ServerEvent): void {
274
- this.logger.debug(`<- ${JSON.stringify(this.loggableEvent(event))}`);
275
-
276
- switch (event.event) {
277
- case proto.ServerEventType.START_SESSION:
278
- this.setState(proto.State.LISTENING);
279
- break;
280
- case proto.ServerEventType.ADD_MESSAGE:
281
- break;
282
- case proto.ServerEventType.ADD_CONTENT:
283
- this.handleAddContent(event);
284
- break;
285
- case proto.ServerEventType.MESSAGE_ADDED:
286
- this.handleMessageAdded(event);
287
- break;
288
- case proto.ServerEventType.VAD_SPEECH_STARTED:
289
- this.handleVadSpeechStarted(event);
290
- break;
291
- case proto.ServerEventType.VAD_SPEECH_STOPPED:
292
- break;
293
- case proto.ServerEventType.INPUT_TRANSCRIBED:
294
- this.handleInputTranscribed(event);
295
- break;
296
- case proto.ServerEventType.GENERATION_CANCELED:
297
- this.handleGenerationCanceled();
298
- break;
299
- case proto.ServerEventType.GENERATION_FINISHED:
300
- this.handleGenerationFinished(event);
301
- break;
302
- default:
303
- this.logger.warn(`Unknown server event: ${JSON.stringify(event)}`);
304
- }
305
- }
306
-
307
- private handleAddContent(event: proto.ServerEvent): void {
308
- if (event.event !== proto.ServerEventType.ADD_CONTENT) return;
309
-
310
- const trackSid = this.getLocalTrackSid();
311
- if (!this.room || !this.room.localParticipant || !trackSid || !this.agentPlayout) {
312
- log().error('Room or local participant not set');
313
- return;
314
- }
315
-
316
- if (!this.playingHandle || this.playingHandle.done) {
317
- const trFwd = new BasicTranscriptionForwarder(
318
- this.room,
319
- this.room?.localParticipant?.identity,
320
- trackSid,
321
- event.message_id,
322
- );
323
-
324
- this.setState(proto.State.SPEAKING);
325
- this.playingHandle = this.agentPlayout.play(event.message_id, trFwd);
326
- this.playingHandle.on('complete', () => {
327
- this.setState(proto.State.LISTENING);
328
- });
329
- }
330
- switch (event.type) {
331
- case 'audio':
332
- this.playingHandle?.pushAudio(Buffer.from(event.data, 'base64'));
333
- break;
334
- case 'text':
335
- this.playingHandle?.pushText(event.data);
336
- break;
337
- case 'tool_call':
338
- break;
339
- default:
340
- this.logger.warn(`Unknown content event type: ${event.type}`);
341
- break;
342
- }
343
- }
344
-
345
- private handleMessageAdded(event: proto.ServerEvent): void {
346
- if (event.event !== proto.ServerEventType.MESSAGE_ADDED) return;
347
- for (const toolCall of event.content || []) {
348
- this.options.functions[toolCall.name].execute(toolCall.arguments).then((content) => {
349
- this.thinking = false;
350
- this.sendClientCommand({
351
- event: proto.ClientEventType.ADD_MESSAGE,
352
- message: {
353
- role: 'tool',
354
- tool_call_id: toolCall.tool_call_id,
355
- content: [
356
- {
357
- type: 'text',
358
- text: content,
359
- },
360
- ],
361
- },
362
- });
363
- this.sendClientCommand({
364
- event: proto.ClientEventType.GENERATE,
365
- });
366
- });
367
- break;
368
- }
369
- }
370
-
371
- private handleInputTranscribed(event: proto.ServerEvent): void {
372
- if (event.event !== proto.ServerEventType.INPUT_TRANSCRIBED) return;
373
- const messageId = event.message_id;
374
- const transcription = event.transcript;
375
- if (!messageId || transcription === undefined) {
376
- this.logger.error('Message ID or transcription not set');
377
- return;
378
- }
379
- const participantIdentity = this.linkedParticipant?.identity;
380
- const trackSid = this.subscribedTrack?.sid;
381
- if (participantIdentity && trackSid) {
382
- this.publishTranscription(participantIdentity, trackSid, transcription, true, messageId);
383
- } else {
384
- this.logger.error('Participant or track not set');
385
- }
386
- }
387
-
388
- private handleGenerationCanceled(): void {
389
- if (this.playingHandle && !this.playingHandle.done) {
390
- this.playingHandle.interrupt();
391
- this.sendClientCommand({
392
- event: proto.ClientEventType.TRUNCATE_CONTENT,
393
- message_id: this.playingHandle.messageId,
394
- index: 0, // ignored for now (see OAI docs)
395
- text_chars: this.playingHandle.publishedTextChars(),
396
- audio_samples: this.playingHandle.playedAudioSamples,
397
- });
398
- }
399
- }
400
-
401
- private handleGenerationFinished(event: proto.ServerEvent): void {
402
- if (event.event !== proto.ServerEventType.GENERATION_FINISHED) return;
403
- if (event.reason !== 'interrupt' && event.reason !== 'stop') {
404
- log().warn(`assistant turn finished unexpectedly reason ${event.reason}`);
405
- }
406
-
407
- if (this.playingHandle && !this.playingHandle.interrupted) {
408
- this.playingHandle.endInput();
409
- }
410
- }
411
-
412
- private handleVadSpeechStarted(event: proto.ServerEvent): void {
413
- if (event.event !== proto.ServerEventType.VAD_SPEECH_STARTED) return;
414
- const messageId = event.message_id;
415
- const participantIdentity = this.linkedParticipant?.identity;
416
- const trackSid = this.subscribedTrack?.sid;
417
- if (participantIdentity && trackSid && messageId) {
418
- this.publishTranscription(participantIdentity, trackSid, '', false, messageId);
419
- } else {
420
- this.logger.error('Participant or track or itemId not set');
421
- }
422
- }
423
-
424
- private linkParticipant(participantIdentity: string): void {
425
- if (!this.room) {
426
- this.logger.error('Room is not set');
427
- return;
428
- }
429
-
430
- this.linkedParticipant = this.room.remoteParticipants.get(participantIdentity) || null;
431
- if (!this.linkedParticipant) {
432
- this.logger.error(`Participant with identity ${participantIdentity} not found`);
433
- return;
434
- }
435
-
436
- if (this.linkedParticipant.trackPublications.size > 0) {
437
- this.subscribeToMicrophone();
438
- } else {
439
- this.room.on(RoomEvent.TrackPublished, () => {
440
- this.subscribeToMicrophone();
441
- });
442
- }
443
- }
444
-
445
- private subscribeToMicrophone(): void {
446
- const readAudioStreamTask = async (audioStream: AudioStream) => {
447
- const bstream = new AudioByteStream(
448
- proto.SAMPLE_RATE,
449
- proto.NUM_CHANNELS,
450
- proto.INPUT_PCM_FRAME_SIZE,
451
- );
452
-
453
- audioStream.on(AudioStreamEvent.FrameReceived, (ev: AudioFrameEvent) => {
454
- const audioData = ev.frame.data;
455
- for (const frame of bstream.write(audioData.buffer)) {
456
- this.sendClientCommand({
457
- event: proto.ClientEventType.ADD_USER_AUDIO,
458
- data: Buffer.from(frame.data.buffer).toString('base64'),
459
- });
460
- }
461
- });
462
- };
463
-
464
- if (!this.linkedParticipant) {
465
- this.logger.error('Participant is not set');
466
- return;
467
- }
468
-
469
- for (const publication of this.linkedParticipant.trackPublications.values()) {
470
- if (publication.source !== TrackSource.SOURCE_MICROPHONE) {
471
- continue;
472
- }
473
-
474
- if (!publication.subscribed) {
475
- publication.setSubscribed(true);
476
- }
477
-
478
- const track = publication.track;
479
-
480
- if (track && track !== this.subscribedTrack) {
481
- this.subscribedTrack = track!;
482
- if (this.readMicroTask) {
483
- this.readMicroTask.cancel();
484
- }
485
-
486
- let cancel: () => void;
487
- this.readMicroTask = {
488
- promise: new Promise<void>((resolve, reject) => {
489
- cancel = () => {
490
- // Cleanup logic here
491
- reject(new Error('Task cancelled'));
492
- };
493
- readAudioStreamTask(new AudioStream(track, proto.SAMPLE_RATE, proto.NUM_CHANNELS))
494
- .then(resolve)
495
- .catch(reject);
496
- }),
497
- cancel: () => cancel(),
498
- };
499
- }
500
- }
501
- }
502
-
503
- private getLocalTrackSid(): string | null {
504
- if (!this.localTrackSid && this.room && this.room.localParticipant) {
505
- this.localTrackSid = findMicroTrackId(this.room, this.room.localParticipant?.identity);
506
- }
507
- return this.localTrackSid;
508
- }
509
-
510
- private publishTranscription(
511
- participantIdentity: string,
512
- trackSid: string,
513
- text: string,
514
- isFinal: boolean,
515
- id: string,
516
- ): void {
517
- if (!this.room?.localParticipant) {
518
- log().error('Room or local participant not set');
519
- return;
520
- }
521
-
522
- this.room.localParticipant.publishTranscription({
523
- participantIdentity,
524
- trackSid,
525
- segments: [
526
- {
527
- text,
528
- final: isFinal,
529
- id,
530
- startTime: BigInt(0),
531
- endTime: BigInt(0),
532
- language: '',
533
- },
534
- ],
535
- });
536
- }
537
- }
538
-
539
- const tools = (ctx: llm.FunctionContext): proto.Tool[] =>
540
- Object.entries(ctx).map(([name, func]) => ({
541
- type: 'function',
542
- function: {
543
- name,
544
- description: func.description,
545
- parameters: llm.oaiParams(func.parameters),
546
- },
547
- }));