@livekit/agents-plugin-openai 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. package/.turbo/turbo-build.log +1 -1
  2. package/CHANGELOG.md +17 -0
  3. package/dist/index.d.ts +1 -1
  4. package/dist/index.d.ts.map +1 -1
  5. package/dist/index.js +1 -1
  6. package/dist/index.js.map +1 -1
  7. package/dist/realtime/api_proto.d.ts +400 -0
  8. package/dist/realtime/api_proto.d.ts.map +1 -0
  9. package/dist/realtime/api_proto.js +9 -0
  10. package/dist/realtime/api_proto.js.map +1 -0
  11. package/dist/realtime/index.d.ts +3 -0
  12. package/dist/realtime/index.d.ts.map +1 -0
  13. package/dist/realtime/index.js +6 -0
  14. package/dist/realtime/index.js.map +1 -0
  15. package/dist/realtime/realtime_model.d.ts +148 -0
  16. package/dist/realtime/realtime_model.d.ts.map +1 -0
  17. package/dist/realtime/realtime_model.js +555 -0
  18. package/dist/realtime/realtime_model.js.map +1 -0
  19. package/package.json +5 -3
  20. package/src/index.ts +1 -2
  21. package/src/realtime/api_proto.ts +568 -0
  22. package/src/realtime/index.ts +5 -0
  23. package/src/realtime/realtime_model.ts +842 -0
  24. package/dist/omni_assistant/agent_playout.d.ts +0 -27
  25. package/dist/omni_assistant/agent_playout.d.ts.map +0 -1
  26. package/dist/omni_assistant/agent_playout.js +0 -111
  27. package/dist/omni_assistant/agent_playout.js.map +0 -1
  28. package/dist/omni_assistant/index.d.ts +0 -61
  29. package/dist/omni_assistant/index.d.ts.map +0 -1
  30. package/dist/omni_assistant/index.js +0 -453
  31. package/dist/omni_assistant/index.js.map +0 -1
  32. package/dist/omni_assistant/proto.d.ts +0 -218
  33. package/dist/omni_assistant/proto.d.ts.map +0 -1
  34. package/dist/omni_assistant/proto.js +0 -68
  35. package/dist/omni_assistant/proto.js.map +0 -1
  36. package/dist/omni_assistant/transcription_forwarder.d.ts +0 -28
  37. package/dist/omni_assistant/transcription_forwarder.d.ts.map +0 -1
  38. package/dist/omni_assistant/transcription_forwarder.js +0 -117
  39. package/dist/omni_assistant/transcription_forwarder.js.map +0 -1
  40. package/src/omni_assistant/agent_playout.ts +0 -127
  41. package/src/omni_assistant/index.ts +0 -547
  42. package/src/omni_assistant/proto.ts +0 -280
  43. package/src/omni_assistant/transcription_forwarder.ts +0 -128
@@ -0,0 +1,842 @@
1
+ // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
2
+ //
3
+ // SPDX-License-Identifier: Apache-2.0
4
+ import { AsyncIterableQueue, Future, Queue } from '@livekit/agents';
5
+ import { llm, log, multimodal } from '@livekit/agents';
6
+ import { AudioFrame } from '@livekit/rtc-node';
7
+ import { once } from 'events';
8
+ import { WebSocket } from 'ws';
9
+ import * as api_proto from './api_proto.js';
10
+
11
+ interface ModelOptions {
12
+ modalities: ['text', 'audio'] | ['text'];
13
+ instructions?: string;
14
+ voice: api_proto.Voice;
15
+ inputAudioFormat: api_proto.AudioFormat;
16
+ outputAudioFormat: api_proto.AudioFormat;
17
+ inputAudioTranscription?: api_proto.InputAudioTranscription;
18
+ turnDetection: api_proto.TurnDetectionType;
19
+ temperature: number;
20
+ maxResponseOutputTokens?: number;
21
+ model: api_proto.Model;
22
+ apiKey: string;
23
+ baseURL: string;
24
+ }
25
+
26
+ export interface RealtimeResponse {
27
+ id: string;
28
+ status: api_proto.ResponseStatus;
29
+ output: RealtimeOutput[];
30
+ doneFut: Future;
31
+ }
32
+
33
+ export interface RealtimeOutput {
34
+ responseId: string;
35
+ itemId: string;
36
+ outputIndex: number;
37
+ role: api_proto.Role;
38
+ type: 'message' | 'function_call';
39
+ content: RealtimeContent[];
40
+ doneFut: Future;
41
+ }
42
+
43
+ export interface RealtimeContent {
44
+ responseId: string;
45
+ itemId: string;
46
+ outputIndex: number;
47
+ contentIndex: number;
48
+ text: string;
49
+ audio: AudioFrame[];
50
+ textStream: AsyncIterableQueue<string>;
51
+ audioStream: AsyncIterableQueue<AudioFrame>;
52
+ toolCalls: RealtimeToolCall[];
53
+ }
54
+
55
+ export interface RealtimeToolCall {
56
+ name: string;
57
+ arguments: string;
58
+ toolCallID: string;
59
+ }
60
+
61
+ export interface InputSpeechTranscriptionCompleted {
62
+ itemId: string;
63
+ transcript: string;
64
+ }
65
+
66
+ export interface InputSpeechTranscriptionFailed {
67
+ itemId: string;
68
+ message: string;
69
+ }
70
+
71
+ export interface InputSpeechStarted {
72
+ itemId: string;
73
+ }
74
+
75
+ export interface InputSpeechCommitted {
76
+ itemId: string;
77
+ }
78
+
79
+ class InputAudioBuffer {
80
+ #session: RealtimeSession;
81
+
82
+ constructor(session: RealtimeSession) {
83
+ this.#session = session;
84
+ }
85
+
86
+ append(frame: AudioFrame) {
87
+ this.#session.queueMsg({
88
+ type: 'input_audio_buffer.append',
89
+ audio: Buffer.from(frame.data.buffer).toString('base64'),
90
+ });
91
+ }
92
+
93
+ clear() {
94
+ this.#session.queueMsg({
95
+ type: 'input_audio_buffer.clear',
96
+ });
97
+ }
98
+
99
+ commit() {
100
+ this.#session.queueMsg({
101
+ type: 'input_audio_buffer.commit',
102
+ });
103
+ }
104
+ }
105
+
106
+ class ConversationItem {
107
+ #session: RealtimeSession;
108
+
109
+ constructor(session: RealtimeSession) {
110
+ this.#session = session;
111
+ }
112
+
113
+ truncate(itemId: string, contentIndex: number, audioEnd: number) {
114
+ this.#session.queueMsg({
115
+ type: 'conversation.item.truncate',
116
+ item_id: itemId,
117
+ content_index: contentIndex,
118
+ audio_end_ms: audioEnd,
119
+ });
120
+ }
121
+
122
+ delete(itemId: string) {
123
+ this.#session.queueMsg({
124
+ type: 'conversation.item.delete',
125
+ item_id: itemId,
126
+ });
127
+ }
128
+
129
+ create(item: api_proto.ConversationItemCreateContent, previousItemId?: string): void {
130
+ this.#session.queueMsg({
131
+ type: 'conversation.item.create',
132
+ item,
133
+ previous_item_id: previousItemId,
134
+ });
135
+ }
136
+ }
137
+
138
+ class Conversation {
139
+ #session: RealtimeSession;
140
+
141
+ constructor(session: RealtimeSession) {
142
+ this.#session = session;
143
+ }
144
+
145
+ get item(): ConversationItem {
146
+ return new ConversationItem(this.#session);
147
+ }
148
+ }
149
+
150
+ class Response {
151
+ #session: RealtimeSession;
152
+
153
+ constructor(session: RealtimeSession) {
154
+ this.#session = session;
155
+ }
156
+
157
+ create() {
158
+ this.#session.queueMsg({
159
+ type: 'response.create',
160
+ });
161
+ }
162
+
163
+ cancel() {
164
+ this.#session.queueMsg({
165
+ type: 'response.cancel',
166
+ });
167
+ }
168
+ }
169
+
170
+ interface ContentPtr {
171
+ response_id: string;
172
+ output_index: number;
173
+ content_index: number;
174
+ }
175
+
176
+ export class RealtimeModel extends multimodal.RealtimeModel {
177
+ sampleRate = api_proto.SAMPLE_RATE;
178
+ numChannels = api_proto.NUM_CHANNELS;
179
+ inFrameSize = api_proto.IN_FRAME_SIZE;
180
+ outFrameSize = api_proto.OUT_FRAME_SIZE;
181
+
182
+ #defaultOpts: ModelOptions;
183
+ #sessions: RealtimeSession[] = [];
184
+
185
+ constructor({
186
+ modalities = ['text', 'audio'],
187
+ instructions = undefined,
188
+ voice = 'alloy',
189
+ inputAudioFormat = 'pcm16',
190
+ outputAudioFormat = 'pcm16',
191
+ inputAudioTranscription = { model: 'whisper-1' },
192
+ turnDetection = { type: 'server_vad' },
193
+ temperature = 0.8,
194
+ maxResponseOutputTokens = undefined,
195
+ model = 'gpt-4o-realtime-preview-2024-10-01',
196
+ apiKey = process.env.OPENAI_API_KEY || '',
197
+ baseURL = api_proto.API_URL,
198
+ }: {
199
+ modalities?: ['text', 'audio'] | ['text'];
200
+ instructions?: string;
201
+ voice?: api_proto.Voice;
202
+ inputAudioFormat?: api_proto.AudioFormat;
203
+ outputAudioFormat?: api_proto.AudioFormat;
204
+ inputAudioTranscription?: api_proto.InputAudioTranscription;
205
+ turnDetection?: api_proto.TurnDetectionType;
206
+ temperature?: number;
207
+ maxResponseOutputTokens?: number;
208
+ model?: api_proto.Model;
209
+ apiKey?: string;
210
+ baseURL?: string;
211
+ }) {
212
+ super();
213
+
214
+ if (apiKey === '') {
215
+ throw new Error(
216
+ 'OpenAI API key is required, either using the argument or by setting the OPENAI_API_KEY environmental variable',
217
+ );
218
+ }
219
+
220
+ this.#defaultOpts = {
221
+ modalities,
222
+ instructions,
223
+ voice,
224
+ inputAudioFormat,
225
+ outputAudioFormat,
226
+ inputAudioTranscription,
227
+ turnDetection,
228
+ temperature,
229
+ maxResponseOutputTokens,
230
+ model,
231
+ apiKey,
232
+ baseURL,
233
+ };
234
+ }
235
+
236
+ get sessions(): RealtimeSession[] {
237
+ return this.#sessions;
238
+ }
239
+
240
+ session({
241
+ fncCtx,
242
+ modalities = this.#defaultOpts.modalities,
243
+ instructions = this.#defaultOpts.instructions,
244
+ voice = this.#defaultOpts.voice,
245
+ inputAudioFormat = this.#defaultOpts.inputAudioFormat,
246
+ outputAudioFormat = this.#defaultOpts.outputAudioFormat,
247
+ inputAudioTranscription = this.#defaultOpts.inputAudioTranscription,
248
+ turnDetection = this.#defaultOpts.turnDetection,
249
+ temperature = this.#defaultOpts.temperature,
250
+ maxResponseOutputTokens = this.#defaultOpts.maxResponseOutputTokens,
251
+ }: {
252
+ fncCtx?: llm.FunctionContext;
253
+ modalities?: ['text', 'audio'] | ['text'];
254
+ instructions?: string;
255
+ voice?: api_proto.Voice;
256
+ inputAudioFormat?: api_proto.AudioFormat;
257
+ outputAudioFormat?: api_proto.AudioFormat;
258
+ inputAudioTranscription?: api_proto.InputAudioTranscription;
259
+ turnDetection?: api_proto.TurnDetectionType;
260
+ temperature?: number;
261
+ maxResponseOutputTokens?: number;
262
+ }): RealtimeSession {
263
+ const opts: ModelOptions = {
264
+ modalities,
265
+ instructions,
266
+ voice,
267
+ inputAudioFormat,
268
+ outputAudioFormat,
269
+ inputAudioTranscription,
270
+ turnDetection,
271
+ temperature,
272
+ maxResponseOutputTokens,
273
+ model: this.#defaultOpts.model,
274
+ apiKey: this.#defaultOpts.apiKey,
275
+ baseURL: this.#defaultOpts.baseURL,
276
+ };
277
+
278
+ const newSession = new RealtimeSession(opts, fncCtx);
279
+ this.#sessions.push(newSession);
280
+ return newSession;
281
+ }
282
+
283
+ async close(): Promise<void> {
284
+ // TODO: Implement close method
285
+ throw new Error('Not implemented');
286
+ }
287
+ }
288
+
289
+ export class RealtimeSession extends multimodal.RealtimeSession {
290
+ #fncCtx: llm.FunctionContext | undefined = undefined;
291
+ #opts: ModelOptions;
292
+ #pendingResponses: { [id: string]: RealtimeResponse } = {};
293
+ #sessionId = 'not-connected';
294
+ #ws: WebSocket | null = null;
295
+ #logger = log();
296
+ #task: Promise<void>;
297
+ #closing = true;
298
+ #sendQueue = new Queue<api_proto.ClientEvent>();
299
+
300
+ constructor(opts: ModelOptions, fncCtx?: llm.FunctionContext | undefined) {
301
+ super();
302
+
303
+ this.#opts = opts;
304
+ this.#fncCtx = fncCtx;
305
+
306
+ this.#task = this.#start();
307
+
308
+ this.sessionUpdate({
309
+ modalities: this.#opts.modalities,
310
+ instructions: this.#opts.instructions,
311
+ voice: this.#opts.voice,
312
+ inputAudioFormat: this.#opts.inputAudioFormat,
313
+ outputAudioFormat: this.#opts.outputAudioFormat,
314
+ inputAudioTranscription: this.#opts.inputAudioTranscription,
315
+ turnDetection: this.#opts.turnDetection,
316
+ temperature: this.#opts.temperature,
317
+ maxResponseOutputTokens: this.#opts.maxResponseOutputTokens,
318
+ toolChoice: 'auto',
319
+ });
320
+ }
321
+
322
+ get fncCtx(): llm.FunctionContext | undefined {
323
+ return this.#fncCtx;
324
+ }
325
+
326
+ set fncCtx(ctx: llm.FunctionContext | undefined) {
327
+ this.#fncCtx = ctx;
328
+ }
329
+
330
+ get defaultConversation(): Conversation {
331
+ return new Conversation(this);
332
+ }
333
+
334
+ get inputAudioBuffer(): InputAudioBuffer {
335
+ return new InputAudioBuffer(this);
336
+ }
337
+
338
+ get response(): Response {
339
+ return new Response(this);
340
+ }
341
+
342
+ queueMsg(command: api_proto.ClientEvent): void {
343
+ this.#sendQueue.put(command);
344
+ }
345
+
346
+ /// Truncates the data field of the event to the specified maxLength to avoid overwhelming logs
347
+ /// with large amounts of base64 audio data.
348
+ #loggableEvent(
349
+ event: api_proto.ClientEvent | api_proto.ServerEvent,
350
+ maxLength: number = 30,
351
+ ): Record<string, unknown> {
352
+ const untypedEvent: Record<string, unknown> = {};
353
+ for (const [key, value] of Object.entries(event)) {
354
+ if (value !== undefined) {
355
+ untypedEvent[key] = value;
356
+ }
357
+ }
358
+
359
+ if (untypedEvent.audio && typeof untypedEvent.audio === 'string') {
360
+ const truncatedData =
361
+ untypedEvent.audio.slice(0, maxLength) + (untypedEvent.audio.length > maxLength ? '…' : '');
362
+ return { ...untypedEvent, audio: truncatedData };
363
+ }
364
+ if (
365
+ untypedEvent.delta &&
366
+ typeof untypedEvent.delta === 'string' &&
367
+ event.type === 'response.audio.delta'
368
+ ) {
369
+ const truncatedDelta =
370
+ untypedEvent.delta.slice(0, maxLength) + (untypedEvent.delta.length > maxLength ? '…' : '');
371
+ return { ...untypedEvent, delta: truncatedDelta };
372
+ }
373
+ return untypedEvent;
374
+ }
375
+
376
+ sessionUpdate({
377
+ modalities = this.#opts.modalities,
378
+ instructions = this.#opts.instructions,
379
+ voice = this.#opts.voice,
380
+ inputAudioFormat = this.#opts.inputAudioFormat,
381
+ outputAudioFormat = this.#opts.outputAudioFormat,
382
+ inputAudioTranscription = this.#opts.inputAudioTranscription,
383
+ turnDetection = this.#opts.turnDetection,
384
+ temperature = this.#opts.temperature,
385
+ maxResponseOutputTokens = this.#opts.maxResponseOutputTokens,
386
+ toolChoice = 'auto',
387
+ }: {
388
+ modalities: ['text', 'audio'] | ['text'];
389
+ instructions?: string;
390
+ voice?: api_proto.Voice;
391
+ inputAudioFormat?: api_proto.AudioFormat;
392
+ outputAudioFormat?: api_proto.AudioFormat;
393
+ inputAudioTranscription?: api_proto.InputAudioTranscription;
394
+ turnDetection?: api_proto.TurnDetectionType;
395
+ temperature?: number;
396
+ maxResponseOutputTokens?: number;
397
+ toolChoice?: api_proto.ToolChoice;
398
+ }) {
399
+ this.#opts = {
400
+ modalities,
401
+ instructions,
402
+ voice,
403
+ inputAudioFormat,
404
+ outputAudioFormat,
405
+ inputAudioTranscription,
406
+ turnDetection,
407
+ temperature,
408
+ maxResponseOutputTokens,
409
+ model: this.#opts.model,
410
+ apiKey: this.#opts.apiKey,
411
+ baseURL: this.#opts.baseURL,
412
+ };
413
+
414
+ const tools = this.#fncCtx
415
+ ? Object.entries(this.#fncCtx).map(([name, func]) => ({
416
+ type: 'function' as const,
417
+ name,
418
+ description: func.description,
419
+ parameters: llm.oaiParams(func.parameters),
420
+ }))
421
+ : [];
422
+
423
+ this.queueMsg({
424
+ type: 'session.update',
425
+ session: {
426
+ modalities: this.#opts.modalities,
427
+ instructions: this.#opts.instructions,
428
+ voice: this.#opts.voice,
429
+ input_audio_format: this.#opts.inputAudioFormat,
430
+ output_audio_format: this.#opts.outputAudioFormat,
431
+ input_audio_transcription: this.#opts.inputAudioTranscription,
432
+ turn_detection: this.#opts.turnDetection,
433
+ temperature: this.#opts.temperature,
434
+ max_response_output_tokens: this.#opts.maxResponseOutputTokens,
435
+ tools,
436
+ tool_choice: toolChoice,
437
+ },
438
+ });
439
+ }
440
+
441
+ #start(): Promise<void> {
442
+ return new Promise(async (resolve, reject) => {
443
+ this.#ws = new WebSocket(`${this.#opts.baseURL}?model=${this.#opts.model}`, {
444
+ headers: {
445
+ Authorization: `Bearer ${this.#opts.apiKey}`,
446
+ 'OpenAI-Beta': 'realtime=v1',
447
+ },
448
+ });
449
+
450
+ this.#ws.onerror = (error) => {
451
+ reject(error.message);
452
+ };
453
+
454
+ await once(this.#ws, 'open');
455
+ this.#closing = false;
456
+
457
+ this.#ws.onmessage = (message) => {
458
+ const event: api_proto.ServerEvent = JSON.parse(message.data as string);
459
+ this.#logger.debug(`<- ${JSON.stringify(this.#loggableEvent(event))}`);
460
+ switch (event.type) {
461
+ case 'error':
462
+ this.#handleError(event);
463
+ break;
464
+ case 'session.created':
465
+ this.#handleSessionCreated(event);
466
+ break;
467
+ case 'session.updated':
468
+ this.#handleSessionUpdated(event);
469
+ break;
470
+ case 'conversation.created':
471
+ this.#handleConversationCreated(event);
472
+ break;
473
+ case 'input_audio_buffer.committed':
474
+ this.#handleInputAudioBufferCommitted(event);
475
+ break;
476
+ case 'input_audio_buffer.cleared':
477
+ this.#handleInputAudioBufferCleared(event);
478
+ break;
479
+ case 'input_audio_buffer.speech_started':
480
+ this.#handleInputAudioBufferSpeechStarted(event);
481
+ break;
482
+ case 'input_audio_buffer.speech_stopped':
483
+ this.#handleInputAudioBufferSpeechStopped(event);
484
+ break;
485
+ case 'conversation.item.created':
486
+ this.#handleConversationItemCreated(event);
487
+ break;
488
+ case 'conversation.item.input_audio_transcription.completed':
489
+ this.#handleConversationItemInputAudioTranscriptionCompleted(event);
490
+ break;
491
+ case 'conversation.item.input_audio_transcription.failed':
492
+ this.#handleConversationItemInputAudioTranscriptionFailed(event);
493
+ break;
494
+ case 'conversation.item.truncated':
495
+ this.#handleConversationItemTruncated(event);
496
+ break;
497
+ case 'conversation.item.deleted':
498
+ this.#handleConversationItemDeleted(event);
499
+ break;
500
+ case 'response.created':
501
+ this.#handleResponseCreated(event);
502
+ break;
503
+ case 'response.done':
504
+ this.#handleResponseDone(event);
505
+ break;
506
+ case 'response.output_item.added':
507
+ this.#handleResponseOutputItemAdded(event);
508
+ break;
509
+ case 'response.output_item.done':
510
+ this.#handleResponseOutputItemDone(event);
511
+ break;
512
+ case 'response.content_part.added':
513
+ this.#handleResponseContentPartAdded(event);
514
+ break;
515
+ case 'response.content_part.done':
516
+ this.#handleResponseContentPartDone(event);
517
+ break;
518
+ case 'response.text.delta':
519
+ this.#handleResponseTextDelta(event);
520
+ break;
521
+ case 'response.text.done':
522
+ this.#handleResponseTextDone(event);
523
+ break;
524
+ case 'response.audio_transcript.delta':
525
+ this.#handleResponseAudioTranscriptDelta(event);
526
+ break;
527
+ case 'response.audio_transcript.done':
528
+ this.#handleResponseAudioTranscriptDone(event);
529
+ break;
530
+ case 'response.audio.delta':
531
+ this.#handleResponseAudioDelta(event);
532
+ break;
533
+ case 'response.audio.done':
534
+ this.#handleResponseAudioDone(event);
535
+ break;
536
+ case 'response.function_call_arguments.delta':
537
+ this.#handleResponseFunctionCallArgumentsDelta(event);
538
+ break;
539
+ case 'response.function_call_arguments.done':
540
+ this.#handleResponseFunctionCallArgumentsDone(event);
541
+ break;
542
+ case 'rate_limits.updated':
543
+ this.#handleRateLimitsUpdated(event);
544
+ break;
545
+ }
546
+ };
547
+
548
+ const sendTask = async () => {
549
+ while (this.#ws && !this.#closing && this.#ws.readyState === WebSocket.OPEN) {
550
+ try {
551
+ const event = await this.#sendQueue.get();
552
+ if (event.type !== 'input_audio_buffer.append') {
553
+ this.#logger.debug(`-> ${JSON.stringify(this.#loggableEvent(event))}`);
554
+ }
555
+ this.#ws.send(JSON.stringify(event));
556
+ } catch (error) {
557
+ this.#logger.error('Error sending event:', error);
558
+ }
559
+ }
560
+ };
561
+
562
+ sendTask();
563
+
564
+ this.#ws.onclose = () => {
565
+ if (!this.#closing) {
566
+ reject('OpenAI Realtime connection closed unexpectedly');
567
+ }
568
+ this.#ws = null;
569
+ resolve();
570
+ };
571
+ });
572
+ }
573
+
574
+ async close(): Promise<void> {
575
+ // TODO: Implement close method
576
+ throw new Error('Not implemented');
577
+ }
578
+
579
+ #getContent(ptr: ContentPtr): RealtimeContent {
580
+ const response = this.#pendingResponses[ptr.response_id];
581
+ const output = response.output[ptr.output_index];
582
+ const content = output.content[ptr.content_index];
583
+ return content;
584
+ }
585
+
586
+ #handleError(event: api_proto.ErrorEvent): void {
587
+ this.#logger.error(`OpenAI Realtime error ${JSON.stringify(event.error)}`);
588
+ }
589
+
590
+ #handleSessionCreated(event: api_proto.SessionCreatedEvent): void {
591
+ this.#sessionId = event.session.id;
592
+ }
593
+
594
+ // eslint-disable-next-line @typescript-eslint/no-unused-vars
595
+ #handleSessionUpdated(event: api_proto.SessionUpdatedEvent): void {}
596
+
597
+ // eslint-disable-next-line @typescript-eslint/no-unused-vars
598
+ #handleConversationCreated(event: api_proto.ConversationCreatedEvent): void {}
599
+
600
+ #handleInputAudioBufferCommitted(event: api_proto.InputAudioBufferCommittedEvent): void {
601
+ this.emit('input_speech_committed', {
602
+ itemId: event.item_id,
603
+ } as InputSpeechCommitted);
604
+ }
605
+
606
+ // eslint-disable-next-line @typescript-eslint/no-unused-vars
607
+ #handleInputAudioBufferCleared(event: api_proto.InputAudioBufferClearedEvent): void {}
608
+
609
+ #handleInputAudioBufferSpeechStarted(
610
+ // eslint-disable-next-line @typescript-eslint/no-unused-vars
611
+ event: api_proto.InputAudioBufferSpeechStartedEvent,
612
+ ): void {
613
+ this.emit('input_speech_started', {
614
+ itemId: event.item_id,
615
+ } as InputSpeechStarted);
616
+ }
617
+
618
+ #handleInputAudioBufferSpeechStopped(
619
+ // eslint-disable-next-line @typescript-eslint/no-unused-vars
620
+ event: api_proto.InputAudioBufferSpeechStoppedEvent,
621
+ ): void {
622
+ this.emit('input_speech_stopped');
623
+ }
624
+
625
+ // eslint-disable-next-line @typescript-eslint/no-unused-vars
626
+ #handleConversationItemCreated(event: api_proto.ConversationItemCreatedEvent): void {}
627
+
628
+ #handleConversationItemInputAudioTranscriptionCompleted(
629
+ event: api_proto.ConversationItemInputAudioTranscriptionCompletedEvent,
630
+ ): void {
631
+ const transcript = event.transcript;
632
+ this.emit('input_speech_transcription_completed', {
633
+ itemId: event.item_id,
634
+ transcript: transcript,
635
+ } as InputSpeechTranscriptionCompleted);
636
+ }
637
+
638
+ #handleConversationItemInputAudioTranscriptionFailed(
639
+ event: api_proto.ConversationItemInputAudioTranscriptionFailedEvent,
640
+ ): void {
641
+ const error = event.error;
642
+ this.#logger.error(`OpenAI Realtime failed to transcribe input audio: ${error.message}`);
643
+ this.emit('input_speech_transcription_failed', {
644
+ itemId: event.item_id,
645
+ message: error.message,
646
+ } as InputSpeechTranscriptionFailed);
647
+ }
648
+
649
+ // eslint-disable-next-line @typescript-eslint/no-unused-vars
650
+ #handleConversationItemTruncated(event: api_proto.ConversationItemTruncatedEvent): void {}
651
+
652
+ // eslint-disable-next-line @typescript-eslint/no-unused-vars
653
+ #handleConversationItemDeleted(event: api_proto.ConversationItemDeletedEvent): void {}
654
+
655
+ #handleResponseCreated(responseCreated: api_proto.ResponseCreatedEvent): void {
656
+ const response = responseCreated.response;
657
+ const doneFut = new Future();
658
+ const newResponse: RealtimeResponse = {
659
+ id: response.id,
660
+ status: response.status,
661
+ output: [],
662
+ doneFut: doneFut,
663
+ };
664
+ this.#pendingResponses[newResponse.id] = newResponse;
665
+ this.emit('response_created', newResponse);
666
+ }
667
+
668
+ #handleResponseDone(event: api_proto.ResponseDoneEvent): void {
669
+ const responseData = event.response;
670
+ const responseId = responseData.id;
671
+ const response = this.#pendingResponses[responseId];
672
+ response.doneFut.resolve();
673
+ this.emit('response_done', response);
674
+ }
675
+
676
+ #handleResponseOutputItemAdded(event: api_proto.ResponseOutputItemAddedEvent): void {
677
+ const responseId = event.response_id;
678
+ const response = this.#pendingResponses[responseId];
679
+ const itemData = event.item;
680
+
681
+ if (itemData.type !== 'message' && itemData.type !== 'function_call') {
682
+ throw new Error(`Unexpected item type: ${itemData.type}`);
683
+ }
684
+
685
+ let role: api_proto.Role;
686
+ if (itemData.type === 'function_call') {
687
+ role = 'assistant'; // function_call doesn't have a role field, defaulting it to assistant
688
+ } else {
689
+ role = itemData.role;
690
+ }
691
+
692
+ const newOutput: RealtimeOutput = {
693
+ responseId: responseId,
694
+ itemId: itemData.id,
695
+ outputIndex: event.output_index,
696
+ type: itemData.type,
697
+ role: role,
698
+ content: [],
699
+ doneFut: new Future(),
700
+ };
701
+ response.output.push(newOutput);
702
+ this.emit('response_output_added', newOutput);
703
+ }
704
+
705
+ #handleResponseOutputItemDone(event: api_proto.ResponseOutputItemDoneEvent): void {
706
+ const responseId = event.response_id;
707
+ const response = this.#pendingResponses[responseId];
708
+ const outputIndex = event.output_index;
709
+ const output = response.output[outputIndex];
710
+
711
+ if (output.type === 'function_call') {
712
+ if (!this.#fncCtx) {
713
+ this.#logger.error('function call received but no fncCtx is available');
714
+ return;
715
+ }
716
+
717
+ // parse the arguments and call the function inside the fnc_ctx
718
+ const item = event.item;
719
+ if (item.type !== 'function_call') {
720
+ throw new Error('Expected function_call item');
721
+ }
722
+
723
+ this.emit('function_call_started', {
724
+ callId: item.call_id,
725
+ });
726
+
727
+ const parsedArgs = JSON.parse(item.arguments);
728
+
729
+ this.#logger.debug(
730
+ `[Function Call ${item.call_id}] Executing ${item.name} with arguments ${parsedArgs}`,
731
+ );
732
+
733
+ this.#fncCtx[item.name].execute(parsedArgs).then(
734
+ (content) => {
735
+ this.#logger.debug(`[Function Call ${item.call_id}] ${item.name} returned ${content}`);
736
+ this.emit('function_call_completed', {
737
+ callId: item.call_id,
738
+ });
739
+ this.defaultConversation.item.create(
740
+ {
741
+ type: 'function_call_output',
742
+ call_id: item.call_id,
743
+ output: content,
744
+ },
745
+ output.itemId,
746
+ );
747
+ this.response.create();
748
+ },
749
+ (error) => {
750
+ this.#logger.error(`[Function Call ${item.call_id}] ${item.name} failed with ${error}`);
751
+ // TODO: send it back up as failed?
752
+ this.emit('function_call_failed', {
753
+ callId: item.call_id,
754
+ });
755
+ },
756
+ );
757
+ }
758
+
759
+ output.doneFut.resolve();
760
+ this.emit('response_output_done', output);
761
+ }
762
+
763
+ #handleResponseContentPartAdded(event: api_proto.ResponseContentPartAddedEvent): void {
764
+ const responseId = event.response_id;
765
+ const response = this.#pendingResponses[responseId];
766
+ const outputIndex = event.output_index;
767
+ const output = response.output[outputIndex];
768
+
769
+ const textStream = new AsyncIterableQueue<string>();
770
+ const audioStream = new AsyncIterableQueue<AudioFrame>();
771
+
772
+ const newContent: RealtimeContent = {
773
+ responseId: responseId,
774
+ itemId: event.item_id,
775
+ outputIndex: outputIndex,
776
+ contentIndex: event.content_index,
777
+ text: '',
778
+ audio: [],
779
+ textStream: textStream,
780
+ audioStream: audioStream,
781
+ toolCalls: [],
782
+ };
783
+ output.content.push(newContent);
784
+ this.emit('response_content_added', newContent);
785
+ }
786
+
787
+ #handleResponseContentPartDone(event: api_proto.ResponseContentPartDoneEvent): void {
788
+ const content = this.#getContent(event);
789
+ this.emit('response_content_done', content);
790
+ }
791
+
792
+ // eslint-disable-next-line @typescript-eslint/no-unused-vars
793
+ #handleResponseTextDelta(event: api_proto.ResponseTextDeltaEvent): void {}
794
+
795
+ // eslint-disable-next-line @typescript-eslint/no-unused-vars
796
+ #handleResponseTextDone(event: api_proto.ResponseTextDoneEvent): void {}
797
+
798
+ #handleResponseAudioTranscriptDelta(event: api_proto.ResponseAudioTranscriptDeltaEvent): void {
799
+ const content = this.#getContent(event);
800
+ const transcript = event.delta;
801
+ content.text += transcript;
802
+
803
+ content.textStream.put(transcript);
804
+ }
805
+
806
+ #handleResponseAudioTranscriptDone(event: api_proto.ResponseAudioTranscriptDoneEvent): void {
807
+ const content = this.#getContent(event);
808
+ content.textStream.close();
809
+ }
810
+
811
+ #handleResponseAudioDelta(event: api_proto.ResponseAudioDeltaEvent): void {
812
+ const content = this.#getContent(event);
813
+ const data = Buffer.from(event.delta, 'base64');
814
+ const audio = new AudioFrame(
815
+ new Int16Array(data.buffer),
816
+ api_proto.SAMPLE_RATE,
817
+ api_proto.NUM_CHANNELS,
818
+ data.length / 2,
819
+ );
820
+ content.audio.push(audio);
821
+
822
+ content.audioStream.put(audio);
823
+ }
824
+
825
+ #handleResponseAudioDone(event: api_proto.ResponseAudioDoneEvent): void {
826
+ const content = this.#getContent(event);
827
+ content.audioStream.close();
828
+ }
829
+
830
+ #handleResponseFunctionCallArgumentsDelta(
831
+ // eslint-disable-next-line @typescript-eslint/no-unused-vars
832
+ event: api_proto.ResponseFunctionCallArgumentsDeltaEvent,
833
+ ): void {}
834
+
835
+ #handleResponseFunctionCallArgumentsDone(
836
+ // eslint-disable-next-line @typescript-eslint/no-unused-vars
837
+ event: api_proto.ResponseFunctionCallArgumentsDoneEvent,
838
+ ): void {}
839
+
840
+ // eslint-disable-next-line @typescript-eslint/no-unused-vars
841
+ #handleRateLimitsUpdated(event: api_proto.RateLimitsUpdatedEvent): void {}
842
+ }