@mastra/voice-openai-realtime 0.0.4 → 0.1.0-alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/index.ts CHANGED
@@ -1,33 +1,35 @@
  import type { ToolsInput } from '@mastra/core/agent';
  import { MastraVoice } from '@mastra/core/voice';
- import { RealtimeClient } from 'openai-realtime-api';
- import type { Realtime } from 'openai-realtime-api';
  import { isReadableStream, transformTools } from './utils';
-
- /**
- * Available event types that can be listened to
- */
- type VoiceEventType =
- | 'speak' // Emitted when starting to speak
- | 'writing' // Emitted while speaking with audio data
- | 'error'; // Emitted when an error occurs
+ import { WebSocket } from 'ws';
+ import { EventEmitter } from 'events';
+ import type { Realtime, RealtimeServerEvents } from 'openai-realtime-api';
+ import { PassThrough } from 'stream';
 
  /**
  * Event callback function type
  */
  type EventCallback = (...args: any[]) => void;
 
+ type StreamWithId = PassThrough & { id: string };
+
  /**
  * Map of event types to their callback arrays
  */
  type EventMap = {
- [K in VoiceEventType]: EventCallback[];
+ transcribing: [{ text: string }];
+ writing: [{ text: string }];
+ speaking: [{ audio: string }];
+ speaker: [StreamWithId];
+ error: [Error];
  } & {
  [key: string]: EventCallback[];
  };
 
  /** Default voice for text-to-speech. 'alloy' provides a neutral, balanced voice suitable for most use cases */
- const DEFAULT_VOICE = 'alloy';
+ const DEFAULT_VOICE: Realtime.Voice = 'alloy';
+
+ const DEFAULT_URL = 'wss://api.openai.com/v1/realtime';
 
  /**
  * Default model for real-time voice interactions.
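
The typed EventMap above replaces the old VoiceEventType union with concrete payload shapes. As an orientation aid only (this snippet is not part of the diff), subscribing to the new events might look like this, assuming the on() API inherited from MastraVoice:

```typescript
import { OpenAIRealtimeVoice } from '@mastra/voice-openai-realtime';

// Sketch only: event names and payload shapes follow the EventMap above.
const voice = new OpenAIRealtimeVoice();
voice.on('writing', ({ text }) => process.stdout.write(text)); // assistant transcript deltas
voice.on('transcribing', ({ text }) => console.log('[user]', text)); // transcription of user audio
voice.on('speaking', ({ audio }) => {
  // audio chunk for the current response
});
voice.on('error', err => console.error(err));
```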
@@ -62,6 +64,13 @@ type TTools = ToolsInput;
  */
  const VOICES = ['alloy', 'ash', 'ballad', 'coral', 'echo', 'sage', 'shimmer', 'verse'];
 
+ type RealtimeClientServerEventMap = {
+ [K in RealtimeServerEvents.EventType]: [RealtimeServerEvents.EventMap[K]];
+ } & {
+ ['conversation.item.input_audio_transcription.delta']: [{ delta: string; response_id: string }];
+ ['conversation.item.input_audio_transcription.done']: [{ response_id: string }];
+ };
+
  /**
  * OpenAIRealtimeVoice provides real-time voice interaction capabilities using OpenAI's
  * WebSocket-based API. It supports:
@@ -94,10 +103,13 @@ const VOICES = ['alloy', 'ash', 'ballad', 'coral', 'echo', 'sage', 'shimmer', 'v
  * ```
  */
  export class OpenAIRealtimeVoice extends MastraVoice {
- private client: RealtimeClient;
+ private ws: WebSocket;
  private state: 'close' | 'open';
+ private client: EventEmitter<RealtimeClientServerEventMap>;
  private events: EventMap;
- tools?: TTools;
+ private instructions?: string;
+ private tools?: TTools;
+ private debug: boolean;
 
  /**
  * Creates a new instance of OpenAIRealtimeVoice.
@@ -107,13 +119,8 @@ export class OpenAIRealtimeVoice extends MastraVoice {
  * @param options.chatModel.model - The model ID to use (defaults to GPT-4 Mini Realtime)
  * @param options.chatModel.apiKey - OpenAI API key. Falls back to process.env.OPENAI_API_KEY
  * @param options.chatModel.tools - Tools configuration for the model
- * @param options.chatModel.options - Additional options for the realtime client
- * @param options.chatModel.options.sessionConfig - Session configuration overrides
- * @param options.chatModel.options.url - Custom WebSocket URL
- * @param options.chatModel.options.dangerouslyAllowAPIKeyInBrowser - Whether to allow API key in browser
- * @param options.chatModel.options.debug - Enable debug logging
- * @param options.chatModel.options.tools - Additional tools configuration
  * @param options.speaker - Voice ID to use (defaults to 'alloy')
+ * @param options.debug - Enable debug mode
  *
  * @example
  * ```typescript
@@ -129,40 +136,37 @@ export class OpenAIRealtimeVoice extends MastraVoice {
  constructor({
  chatModel,
  speaker,
+ debug = false,
  }: {
  chatModel?: {
  model?: string;
  apiKey?: string;
  tools?: TTools;
- options?: {
- sessionConfig?: Realtime.SessionConfig;
- url?: string;
- dangerouslyAllowAPIKeyInBrowser?: boolean;
- debug?: boolean;
- tools?: TTools;
- };
+ instructions?: string;
+ url?: string;
  };
  speaker?: Realtime.Voice;
+ debug?: boolean;
  } = {}) {
  super();
- this.client = new RealtimeClient({
- apiKey: chatModel?.apiKey || process.env.OPENAI_API_KEY,
- model: chatModel?.model || DEFAULT_MODEL,
- ...chatModel?.options,
- sessionConfig: {
- voice: speaker || DEFAULT_VOICE,
- turn_detection: DEFAULT_VAD_CONFIG,
- ...chatModel?.options?.sessionConfig,
+
+ const url = `${chatModel?.url || DEFAULT_URL}?model=${chatModel?.model || DEFAULT_MODEL}`;
+ const apiKey = chatModel?.apiKey || process.env.OPENAI_API_KEY;
+ this.ws = new WebSocket(url, undefined, {
+ headers: {
+ Authorization: 'Bearer ' + apiKey,
+ 'OpenAI-Beta': 'realtime=v1',
  },
  });
 
+ this.client = new EventEmitter();
  this.state = 'close';
  this.events = {} as EventMap;
+ this.tools = chatModel?.tools;
+ this.instructions = chatModel?.instructions;
+ this.speaker = speaker || DEFAULT_VOICE;
+ this.debug = debug;
  this.setupEventListeners();
-
- if (chatModel?.tools) {
- this.addTools(chatModel.tools);
- }
  }
 
  /**
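
To summarize the constructor rework (a sketch only, assuming the options shown in this hunk): the nested chatModel.options block is gone, url and instructions move directly onto chatModel, and debug becomes a top-level flag.

```typescript
import { OpenAIRealtimeVoice } from '@mastra/voice-openai-realtime';

const voice = new OpenAIRealtimeVoice({
  chatModel: {
    apiKey: process.env.OPENAI_API_KEY,
    model: 'gpt-4o-mini-realtime-preview', // illustrative id; DEFAULT_MODEL is used if omitted
    instructions: 'You are a concise voice assistant.',
    url: 'wss://api.openai.com/v1/realtime', // DEFAULT_URL if omitted
  },
  speaker: 'alloy',
  debug: true, // logs each server event received on the socket
});
```

Note that tools passed via chatModel.tools are no longer registered in the constructor; they are sent with session.update during connect().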
@@ -190,8 +194,8 @@ export class OpenAIRealtimeVoice extends MastraVoice {
  * ```
  */
  close() {
- if (!this.client) return;
- this.client.disconnect();
+ if (!this.ws) return;
+ this.ws.close();
  this.state = 'close';
  }
 
@@ -212,10 +216,10 @@ export class OpenAIRealtimeVoice extends MastraVoice {
  * ```
  */
  addTools(tools?: TTools) {
- const transformedTools = transformTools(tools);
- for (const tool of transformedTools) {
- this.client.addTool(tool.openaiTool, tool.execute);
- }
+ const openaiTools = transformTools(tools);
+ this.updateConfig({
+ tools: openaiTools.map(t => t.openaiTool),
+ });
  }
 
  /**
@@ -254,7 +258,7 @@ export class OpenAIRealtimeVoice extends MastraVoice {
  throw new Error('Input text is empty');
  }
 
- this.client.realtime.send('response.create', {
+ this.sendEvent('response.create', {
  response: {
  instructions: `Repeat the following text: ${input}`,
  voice: options?.speaker ? options.speaker : undefined,
@@ -280,8 +284,8 @@ export class OpenAIRealtimeVoice extends MastraVoice {
  * });
  * ```
  */
- updateConfig(sessionConfig: Realtime.SessionConfig): void {
- this.client.updateSession(sessionConfig);
+ updateConfig(sessionConfig: unknown): void {
+ this.sendEvent('session.update', { session: sessionConfig });
  }
 
  /**
@@ -319,7 +323,7 @@ export class OpenAIRealtimeVoice extends MastraVoice {
  const int16Array = new Int16Array(buffer.buffer, buffer.byteOffset ?? 0, (buffer.byteLength ?? 0) / 2);
  const base64Audio = this.int16ArrayToBase64(int16Array);
 
- this.client.realtime.send('conversation.item.create', {
+ this.sendEvent('conversation.item.create', {
  item: {
  type: 'message',
  role: 'user',
@@ -327,7 +331,7 @@ export class OpenAIRealtimeVoice extends MastraVoice {
  },
  });
 
- this.client.realtime.send('response.create', {
+ this.sendEvent('response.create', {
  response: {
  modalities: ['text'],
  instructions: `ONLY repeat the input and DO NOT say anything else`,
@@ -338,6 +342,18 @@ export class OpenAIRealtimeVoice extends MastraVoice {
  }
  }
 
+ waitForOpen() {
+ return new Promise(resolve => {
+ this.ws.on('open', resolve);
+ });
+ }
+
+ waitForSessionCreated() {
+ return new Promise(resolve => {
+ this.client.on('session.created', resolve);
+ });
+ }
+
  /**
  * Establishes a connection to the OpenAI realtime service.
  * Must be called before using speak, listen, or relay functions.
@@ -351,8 +367,18 @@ export class OpenAIRealtimeVoice extends MastraVoice {
  * ```
  */
  async connect() {
- await this.client.connect();
- await this.client.waitForSessionCreated();
+ await this.waitForOpen();
+ await this.waitForSessionCreated();
+
+ const openaiTools = transformTools(this.tools);
+ this.updateConfig({
+ instructions: this.instructions,
+ tools: openaiTools.map(t => t.openaiTool),
+ input_audio_transcription: {
+ model: 'whisper-1',
+ },
+ voice: this.speaker,
+ });
  this.state = 'open';
  }
 
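connect() now resolves the socket's open event and the server's session.created event before pushing instructions, tools, input transcription, and voice in a single session.update. Continuing the constructor sketch above (illustration, not package code):

```typescript
await voice.connect(); // waits for 'open' + 'session.created', then sends session.update
// state is now 'open'; speak(), send(), and answer() are usable,
// and user audio is transcribed with whisper-1 per the config above.
```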
@@ -374,7 +400,7 @@ export class OpenAIRealtimeVoice extends MastraVoice {
  * await voice.relay(micStream);
  * ```
  */
- async send(audioData: NodeJS.ReadableStream | Int16Array): Promise<void> {
+ async send(audioData: NodeJS.ReadableStream | Int16Array, eventId?: string): Promise<void> {
  if (!this.state || this.state !== 'open') {
  console.warn('Cannot relay audio when not open. Call open() first.');
  return;
@@ -385,15 +411,14 @@ export class OpenAIRealtimeVoice extends MastraVoice {
  stream.on('data', chunk => {
  try {
  const buffer = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk);
- const int16Array = new Int16Array(buffer.buffer, buffer.byteOffset, buffer.byteLength / 2);
- this.client.appendInputAudio(int16Array);
+ this.sendEvent('input_audio_buffer.append', { audio: buffer.toString('base64'), event_id: eventId });
  } catch (err) {
  this.emit('error', err);
  }
  });
  } else if (audioData instanceof Int16Array) {
  try {
- this.client.appendInputAudio(audioData);
+ this.sendEvent('input_audio_buffer.append', { audio: audioData, event_id: eventId });
  } catch (err) {
  this.emit('error', err);
  }
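
send() now base64-encodes each stream chunk into an input_audio_buffer.append event instead of going through the old client.appendInputAudio(). Continuing the sketch, with speech.pcm standing in for any raw PCM16 audio source:

```typescript
import { createReadStream } from 'fs';

// 'speech.pcm' is a placeholder; any NodeJS.ReadableStream of 16-bit PCM works.
const micStream = createReadStream('speech.pcm');
await voice.send(micStream); // each chunk becomes an 'input_audio_buffer.append' event
```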
@@ -421,7 +446,7 @@ export class OpenAIRealtimeVoice extends MastraVoice {
  * });
  */
  async answer({ options }: { options?: Realtime.ResponseConfig }) {
- this.client.realtime.send('response.create', { response: options ?? {} });
+ this.sendEvent('response.create', { response: options ?? {} });
  }
 
  /**
@@ -496,35 +521,115 @@ export class OpenAIRealtimeVoice extends MastraVoice {
  }
 
  private setupEventListeners(): void {
- this.client.on('error', error => {
- this.emit('error', error);
+ const speakerStreams = new Map<string, StreamWithId>();
+
+ this.ws.on('message', message => {
+ const data = JSON.parse(message.toString());
+ this.client.emit(data.type, data);
+
+ if (this.debug) {
+ const { delta, ...fields } = data;
+ console.log(data.type, fields, delta?.length < 100 ? delta : '');
+ }
  });
 
- this.client.on('conversation.created', conversation => {
- this.emit('openAIRealtime:conversation.created', conversation);
+ this.client.on('session.created', ev => {
+ this.emit('session.created', ev);
+ });
+ this.client.on('session.updated', ev => {
+ this.emit('session.updated', ev);
  });
+ this.client.on('response.created', ev => {
+ this.emit('response.created', ev);
 
- this.client.on('conversation.interrupted', () => {
- this.emit('openAIRealtime:conversation.interrupted');
+ const speakerStream = new PassThrough() as StreamWithId;
+
+ speakerStream.id = ev.response.id;
+
+ speakerStreams.set(ev.response.id, speakerStream);
+ this.emit('speaker', speakerStream);
+ });
+ this.client.on('conversation.item.input_audio_transcription.delta', ev => {
+ this.emit('transcribing', { text: ev.delta, response_id: ev.response_id, role: 'user' });
+ });
+ this.client.on('conversation.item.input_audio_transcription.done', ev => {
+ this.emit('transcribing', { text: '\n', response_id: ev.response_id, role: 'user' });
  });
+ this.client.on('response.audio.delta', ev => {
+ const audio = Buffer.from(ev.delta, 'base64');
+ this.emit('speaking', { audio, response_id: ev.response_id });
 
- this.client.on('conversation.updated', ({ delta }) => {
- if (delta?.audio) {
- this.emit('speaking', { audio: delta.audio });
- }
+ const stream = speakerStreams.get(ev.response_id);
+ stream?.write(audio);
  });
+ this.client.on('response.audio.done', ev => {
+ this.emit('speaking.done', { response_id: ev.response_id });
 
- this.client.on('conversation.item.appended', item => {
- this.emit('openAIRealtime:conversation.item.appended', item);
+ const stream = speakerStreams.get(ev.response_id);
+ stream?.end();
+ });
+ this.client.on('response.audio_transcript.delta', ev => {
+ this.emit('writing', { text: ev.delta, response_id: ev.response_id });
+ });
+ this.client.on('response.audio_transcript.done', ev => {
+ this.emit('writing', { text: '\n', response_id: ev.response_id });
  });
+ this.client.on('response.text.delta', ev => {
+ this.emit('writing', { text: ev.delta, response_id: ev.response_id });
+ });
+ this.client.on('response.text.done', ev => {
+ this.emit('writing', { text: '\n', response_id: ev.response_id });
+ });
+ this.client.on('response.done', ev => {
+ this.handleFunctionCalls(ev);
+ this.emit('response.done', ev);
+ speakerStreams.delete(ev.response.id);
+ });
+ }
 
- this.client.on('conversation.item.completed', ({ item, delta }) => {
- if (item.formatted.transcript) {
- this.emit('writing', { text: item.formatted.transcript, role: item.role });
+ private async handleFunctionCalls(ev: any) {
+ for (const output of ev.response?.output ?? []) {
+ if (output.type === 'function_call') {
+ await this.handleFunctionCall(output);
  }
+ }
+ }
 
- this.emit('openAIRealtime:conversation.item.completed', { item, delta });
- });
+ private async handleFunctionCall(output: any) {
+ try {
+ const context = JSON.parse(output.arguments);
+ const tool = this.tools?.[output.name];
+ if (!tool) {
+ console.warn(`Tool "${output.name}" not found`);
+ return;
+ }
+ const result = await tool?.execute?.(
+ { context },
+ {
+ toolCallId: 'unknown',
+ messages: [],
+ },
+ );
+ this.sendEvent('conversation.item.create', {
+ item: {
+ type: 'function_call_output',
+ call_id: output.call_id,
+ output: JSON.stringify(result),
+ },
+ });
+ } catch (e) {
+ const err = e as Error;
+ console.warn(`Error calling tool "${output.name}":`, err.message);
+ this.sendEvent('conversation.item.create', {
+ item: {
+ type: 'function_call_output',
+ call_id: output.call_id,
+ output: JSON.stringify({ error: err.message }),
+ },
+ });
+ } finally {
+ this.sendEvent('response.create', {});
+ }
  }
 
  private int16ArrayToBase64(int16Array: Int16Array): string {
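
Each response.created now allocates a PassThrough stream keyed by response id; response.audio.delta chunks are written to it and it ends on response.audio.done. A consumption sketch, where the speaker package is an assumed sink and 24kHz mono PCM16 is assumed as the API's output format:

```typescript
import Speaker from 'speaker'; // assumed dependency, not part of this package

voice.on('speaker', stream => {
  // stream.id carries the OpenAI response id; the stream ends on response.audio.done
  stream.pipe(new Speaker({ channels: 1, bitDepth: 16, sampleRate: 24000 }));
});
```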
@@ -540,4 +645,13 @@ export class OpenAIRealtimeVoice extends MastraVoice {
  }
  return btoa(binary);
  }
+
+ private sendEvent(type: string, data: any) {
+ this.ws.send(
+ JSON.stringify({
+ type: type,
+ ...data,
+ }),
+ );
+ }
  }
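
All outbound traffic now funnels through sendEvent(), which folds the event type into the JSON payload. For reference (an illustration, not package code), the frame it writes:

```typescript
// sendEvent('session.update', { session: { voice: 'alloy' } }) is equivalent to:
declare const ws: import('ws').WebSocket; // a connected realtime socket
ws.send(JSON.stringify({ type: 'session.update', session: { voice: 'alloy' } }));
```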
package/src/utils.ts CHANGED
@@ -37,6 +37,7 @@ export const transformTools = (tools?: TTools) => {
  continue;
  }
  const openaiTool = {
+ type: 'function',
  name,
  description: tool.description || `Tool: ${name}`,
  parameters,
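
With the added type field, each transformed tool now matches the function-tool shape that the realtime session.update event expects. Roughly, for a hypothetical getWeather tool:

```typescript
// Illustrative transformTools() output for one tool (names are hypothetical):
const openaiTool = {
  type: 'function',
  name: 'getWeather',
  description: 'Tool: getWeather',
  parameters: { type: 'object', properties: { city: { type: 'string' } } },
};
```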