@mastra/voice-openai-realtime 0.0.5-alpha.0 → 0.1.0-alpha.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
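
For orientation, the main API change in this release is that OpenAIRealtimeVoice now opens its own WebSocket to the OpenAI realtime endpoint instead of wrapping RealtimeClient from openai-realtime-api; the former chatModel.options block is replaced by flat chatModel.instructions / chatModel.url fields plus a top-level debug flag. A minimal usage sketch against the new constructor signature shown in the diff below (the sketch is illustrative only and is not part of the published package):

  import { OpenAIRealtimeVoice } from '@mastra/voice-openai-realtime';

  const voice = new OpenAIRealtimeVoice({
    chatModel: {
      apiKey: process.env.OPENAI_API_KEY,                 // falls back to OPENAI_API_KEY when omitted
      model: 'gpt-4o-mini-realtime-preview-2024-12-17',   // DEFAULT_MODEL
      instructions: 'You are a helpful voice assistant.', // sent via session.update during connect()
    },
    speaker: 'alloy',  // DEFAULT_VOICE
    debug: true,       // logs every server event received over the socket
  });

  await voice.connect();              // waits for the socket to open and for session.created
  await voice.speak('Hello there!');  // issues a response.create event
  voice.close();                      // closes the underlying WebSocket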
package/src/index.ts CHANGED
@@ -1,54 +1,57 @@
+ import { EventEmitter } from 'events';
+ import { PassThrough } from 'stream';
  import type { ToolsInput } from '@mastra/core/agent';
  import { MastraVoice } from '@mastra/core/voice';
- import { RealtimeClient } from 'openai-realtime-api';
- import type { Realtime } from 'openai-realtime-api';
+ import type { Realtime, RealtimeServerEvents } from 'openai-realtime-api';
+ import { WebSocket } from 'ws';
  import { isReadableStream, transformTools } from './utils';

- /**
- * Available event types that can be listened to
- */
- type VoiceEventType =
- | 'speak' // Emitted when starting to speak
- | 'writing' // Emitted while speaking with audio data
- | 'error'; // Emitted when an error occurs
-
  /**
  * Event callback function type
  */
  type EventCallback = (...args: any[]) => void;

+ type StreamWithId = PassThrough & { id: string };
+
  /**
  * Map of event types to their callback arrays
  */
  type EventMap = {
- [K in VoiceEventType]: EventCallback[];
+ transcribing: [{ text: string }];
+ writing: [{ text: string }];
+ speaking: [{ audio: string }];
+ speaker: [StreamWithId];
+ error: [Error];
  } & {
  [key: string]: EventCallback[];
  };

  /** Default voice for text-to-speech. 'alloy' provides a neutral, balanced voice suitable for most use cases */
- const DEFAULT_VOICE = 'alloy';
+ const DEFAULT_VOICE: Realtime.Voice = 'alloy';
+
+ const DEFAULT_URL = 'wss://api.openai.com/v1/realtime';

  /**
  * Default model for real-time voice interactions.
  * This model is optimized for low-latency responses while maintaining high quality output.
  */
  const DEFAULT_MODEL = 'gpt-4o-mini-realtime-preview-2024-12-17';
- /**
- * Default Voice Activity Detection (VAD) configuration.
- * These settings control how the system detects speech segments.
- *
- * @property {string} type - Uses server-side VAD for better accuracy
- * @property {number} threshold - Speech detection sensitivity (0.5 = balanced)
- * @property {number} prefix_padding_ms - Includes 1 second of audio before speech
- * @property {number} silence_duration_ms - Waits 1 second of silence before ending turn
- */
- const DEFAULT_VAD_CONFIG = {
- type: 'server_vad',
- threshold: 0.5,
- prefix_padding_ms: 1000,
- silence_duration_ms: 1000,
- } as Realtime.TurnDetection;
+
+ // /**
+ // * Default Voice Activity Detection (VAD) configuration.
+ // * These settings control how the system detects speech segments.
+ // *
+ // * @property {string} type - Uses server-side VAD for better accuracy
+ // * @property {number} threshold - Speech detection sensitivity (0.5 = balanced)
+ // * @property {number} prefix_padding_ms - Includes 1 second of audio before speech
+ // * @property {number} silence_duration_ms - Waits 1 second of silence before ending turn
+ // */
+ // const DEFAULT_VAD_CONFIG = {
+ // type: 'server_vad',
+ // threshold: 0.5,
+ // prefix_padding_ms: 1000,
+ // silence_duration_ms: 1000,
+ // } as Realtime.TurnDetection;

  type TTools = ToolsInput;

@@ -62,6 +65,13 @@ type TTools = ToolsInput;
  */
  const VOICES = ['alloy', 'ash', 'ballad', 'coral', 'echo', 'sage', 'shimmer', 'verse'];

+ type RealtimeClientServerEventMap = {
+ [K in RealtimeServerEvents.EventType]: [RealtimeServerEvents.EventMap[K]];
+ } & {
+ ['conversation.item.input_audio_transcription.delta']: [{ delta: string; response_id: string }];
+ ['conversation.item.input_audio_transcription.done']: [{ response_id: string }];
+ };
+
  /**
  * OpenAIRealtimeVoice provides real-time voice interaction capabilities using OpenAI's
  * WebSocket-based API. It supports:
@@ -94,10 +104,13 @@ const VOICES = ['alloy', 'ash', 'ballad', 'coral', 'echo', 'sage', 'shimmer', 'v
  * ```
  */
  export class OpenAIRealtimeVoice extends MastraVoice {
- private client: RealtimeClient;
+ private ws: WebSocket;
  private state: 'close' | 'open';
+ private client: EventEmitter<RealtimeClientServerEventMap>;
  private events: EventMap;
- tools?: TTools;
+ private instructions?: string;
+ private tools?: TTools;
+ private debug: boolean;

  /**
  * Creates a new instance of OpenAIRealtimeVoice.
@@ -107,13 +120,8 @@ export class OpenAIRealtimeVoice extends MastraVoice {
  * @param options.chatModel.model - The model ID to use (defaults to GPT-4 Mini Realtime)
  * @param options.chatModel.apiKey - OpenAI API key. Falls back to process.env.OPENAI_API_KEY
  * @param options.chatModel.tools - Tools configuration for the model
- * @param options.chatModel.options - Additional options for the realtime client
- * @param options.chatModel.options.sessionConfig - Session configuration overrides
- * @param options.chatModel.options.url - Custom WebSocket URL
- * @param options.chatModel.options.dangerouslyAllowAPIKeyInBrowser - Whether to allow API key in browser
- * @param options.chatModel.options.debug - Enable debug logging
- * @param options.chatModel.options.tools - Additional tools configuration
  * @param options.speaker - Voice ID to use (defaults to 'alloy')
+ * @param options.debug - Enable debug mode
  *
  * @example
  * ```typescript
@@ -129,40 +137,37 @@ export class OpenAIRealtimeVoice extends MastraVoice {
  constructor({
  chatModel,
  speaker,
+ debug = false,
  }: {
  chatModel?: {
  model?: string;
  apiKey?: string;
  tools?: TTools;
- options?: {
- sessionConfig?: Realtime.SessionConfig;
- url?: string;
- dangerouslyAllowAPIKeyInBrowser?: boolean;
- debug?: boolean;
- tools?: TTools;
- };
+ instructions?: string;
+ url?: string;
  };
  speaker?: Realtime.Voice;
+ debug?: boolean;
  } = {}) {
  super();
- this.client = new RealtimeClient({
- apiKey: chatModel?.apiKey || process.env.OPENAI_API_KEY,
- model: chatModel?.model || DEFAULT_MODEL,
- ...chatModel?.options,
- sessionConfig: {
- voice: speaker || DEFAULT_VOICE,
- turn_detection: DEFAULT_VAD_CONFIG,
- ...chatModel?.options?.sessionConfig,
+
+ const url = `${chatModel?.url || DEFAULT_URL}?model=${chatModel?.model || DEFAULT_MODEL}`;
+ const apiKey = chatModel?.apiKey || process.env.OPENAI_API_KEY;
+ this.ws = new WebSocket(url, undefined, {
+ headers: {
+ Authorization: 'Bearer ' + apiKey,
+ 'OpenAI-Beta': 'realtime=v1',
  },
  });

+ this.client = new EventEmitter();
  this.state = 'close';
  this.events = {} as EventMap;
+ this.tools = chatModel?.tools;
+ this.instructions = chatModel?.instructions;
+ this.speaker = speaker || DEFAULT_VOICE;
+ this.debug = debug;
  this.setupEventListeners();
-
- if (chatModel?.tools) {
- this.addTools(chatModel.tools);
- }
  }

  /**
@@ -190,8 +195,8 @@ export class OpenAIRealtimeVoice extends MastraVoice {
  * ```
  */
  close() {
- if (!this.client) return;
- this.client.disconnect();
+ if (!this.ws) return;
+ this.ws.close();
  this.state = 'close';
  }

@@ -212,10 +217,10 @@ export class OpenAIRealtimeVoice extends MastraVoice {
  * ```
  */
  addTools(tools?: TTools) {
- const transformedTools = transformTools(tools);
- for (const tool of transformedTools) {
- this.client.addTool(tool.openaiTool, tool.execute);
- }
+ const openaiTools = transformTools(tools);
+ this.updateConfig({
+ tools: openaiTools.map(t => t.openaiTool),
+ });
  }

  /**
@@ -254,7 +259,7 @@ export class OpenAIRealtimeVoice extends MastraVoice {
  throw new Error('Input text is empty');
  }

- this.client.realtime.send('response.create', {
+ this.sendEvent('response.create', {
  response: {
  instructions: `Repeat the following text: ${input}`,
  voice: options?.speaker ? options.speaker : undefined,
@@ -280,8 +285,8 @@ export class OpenAIRealtimeVoice extends MastraVoice {
  * });
  * ```
  */
- updateConfig(sessionConfig: Realtime.SessionConfig): void {
- this.client.updateSession(sessionConfig);
+ updateConfig(sessionConfig: unknown): void {
+ this.sendEvent('session.update', { session: sessionConfig });
  }

  /**
@@ -319,7 +324,7 @@ export class OpenAIRealtimeVoice extends MastraVoice {
  const int16Array = new Int16Array(buffer.buffer, buffer.byteOffset ?? 0, (buffer.byteLength ?? 0) / 2);
  const base64Audio = this.int16ArrayToBase64(int16Array);

- this.client.realtime.send('conversation.item.create', {
+ this.sendEvent('conversation.item.create', {
  item: {
  type: 'message',
  role: 'user',
@@ -327,7 +332,7 @@ export class OpenAIRealtimeVoice extends MastraVoice {
  },
  });

- this.client.realtime.send('response.create', {
+ this.sendEvent('response.create', {
  response: {
  modalities: ['text'],
  instructions: `ONLY repeat the input and DO NOT say anything else`,
@@ -338,6 +343,18 @@ export class OpenAIRealtimeVoice extends MastraVoice {
  }
  }

+ waitForOpen() {
+ return new Promise(resolve => {
+ this.ws.on('open', resolve);
+ });
+ }
+
+ waitForSessionCreated() {
+ return new Promise(resolve => {
+ this.client.on('session.created', resolve);
+ });
+ }
+
  /**
  * Establishes a connection to the OpenAI realtime service.
  * Must be called before using speak, listen, or relay functions.
@@ -351,8 +368,18 @@ export class OpenAIRealtimeVoice extends MastraVoice {
  * ```
  */
  async connect() {
- await this.client.connect();
- await this.client.waitForSessionCreated();
+ await this.waitForOpen();
+ await this.waitForSessionCreated();
+
+ const openaiTools = transformTools(this.tools);
+ this.updateConfig({
+ instructions: this.instructions,
+ tools: openaiTools.map(t => t.openaiTool),
+ input_audio_transcription: {
+ model: 'whisper-1',
+ },
+ voice: this.speaker,
+ });
  this.state = 'open';
  }

@@ -374,7 +401,7 @@ export class OpenAIRealtimeVoice extends MastraVoice {
  * await voice.relay(micStream);
  * ```
  */
- async send(audioData: NodeJS.ReadableStream | Int16Array): Promise<void> {
+ async send(audioData: NodeJS.ReadableStream | Int16Array, eventId?: string): Promise<void> {
  if (!this.state || this.state !== 'open') {
  console.warn('Cannot relay audio when not open. Call open() first.');
  return;
@@ -385,15 +412,14 @@ export class OpenAIRealtimeVoice extends MastraVoice {
  stream.on('data', chunk => {
  try {
  const buffer = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk);
- const int16Array = new Int16Array(buffer.buffer, buffer.byteOffset, buffer.byteLength / 2);
- this.client.appendInputAudio(int16Array);
+ this.sendEvent('input_audio_buffer.append', { audio: buffer.toString('base64'), event_id: eventId });
  } catch (err) {
  this.emit('error', err);
  }
  });
  } else if (audioData instanceof Int16Array) {
  try {
- this.client.appendInputAudio(audioData);
+ this.sendEvent('input_audio_buffer.append', { audio: audioData, event_id: eventId });
  } catch (err) {
  this.emit('error', err);
  }
@@ -421,7 +447,7 @@ export class OpenAIRealtimeVoice extends MastraVoice {
  * });
  */
  async answer({ options }: { options?: Realtime.ResponseConfig }) {
- this.client.realtime.send('response.create', { response: options ?? {} });
+ this.sendEvent('response.create', { response: options ?? {} });
  }

  /**
@@ -496,35 +522,115 @@ export class OpenAIRealtimeVoice extends MastraVoice {
  }

  private setupEventListeners(): void {
- this.client.on('error', error => {
- this.emit('error', error);
+ const speakerStreams = new Map<string, StreamWithId>();
+
+ this.ws.on('message', message => {
+ const data = JSON.parse(message.toString());
+ this.client.emit(data.type, data);
+
+ if (this.debug) {
+ const { delta, ...fields } = data;
+ console.log(data.type, fields, delta?.length < 100 ? delta : '');
+ }
  });

- this.client.on('conversation.created', conversation => {
- this.emit('openAIRealtime:conversation.created', conversation);
+ this.client.on('session.created', ev => {
+ this.emit('session.created', ev);
+ });
+ this.client.on('session.updated', ev => {
+ this.emit('session.updated', ev);
  });
+ this.client.on('response.created', ev => {
+ this.emit('response.created', ev);
+
+ const speakerStream = new PassThrough() as StreamWithId;
+
+ speakerStream.id = ev.response.id;

- this.client.on('conversation.interrupted', () => {
- this.emit('openAIRealtime:conversation.interrupted');
+ speakerStreams.set(ev.response.id, speakerStream);
+ this.emit('speaker', speakerStream);
  });
+ this.client.on('conversation.item.input_audio_transcription.delta', ev => {
+ this.emit('transcribing', { text: ev.delta, response_id: ev.response_id, role: 'user' });
+ });
+ this.client.on('conversation.item.input_audio_transcription.done', ev => {
+ this.emit('transcribing', { text: '\n', response_id: ev.response_id, role: 'user' });
+ });
+ this.client.on('response.audio.delta', ev => {
+ const audio = Buffer.from(ev.delta, 'base64');
+ this.emit('speaking', { audio, response_id: ev.response_id });

- this.client.on('conversation.updated', ({ delta }) => {
- if (delta?.audio) {
- this.emit('speaking', { audio: delta.audio });
- }
+ const stream = speakerStreams.get(ev.response_id);
+ stream?.write(audio);
  });
+ this.client.on('response.audio.done', ev => {
+ this.emit('speaking.done', { response_id: ev.response_id });

- this.client.on('conversation.item.appended', item => {
- this.emit('openAIRealtime:conversation.item.appended', item);
+ const stream = speakerStreams.get(ev.response_id);
+ stream?.end();
  });
+ this.client.on('response.audio_transcript.delta', ev => {
+ this.emit('writing', { text: ev.delta, response_id: ev.response_id });
+ });
+ this.client.on('response.audio_transcript.done', ev => {
+ this.emit('writing', { text: '\n', response_id: ev.response_id });
+ });
+ this.client.on('response.text.delta', ev => {
+ this.emit('writing', { text: ev.delta, response_id: ev.response_id });
+ });
+ this.client.on('response.text.done', ev => {
+ this.emit('writing', { text: '\n', response_id: ev.response_id });
+ });
+ this.client.on('response.done', async ev => {
+ await this.handleFunctionCalls(ev);
+ this.emit('response.done', ev);
+ speakerStreams.delete(ev.response.id);
+ });
+ }

- this.client.on('conversation.item.completed', ({ item, delta }) => {
- if (item.formatted.transcript) {
- this.emit('writing', { text: item.formatted.transcript, role: item.role });
+ private async handleFunctionCalls(ev: any) {
+ for (const output of ev.response?.output ?? []) {
+ if (output.type === 'function_call') {
+ await this.handleFunctionCall(output);
  }
+ }
+ }

- this.emit('openAIRealtime:conversation.item.completed', { item, delta });
- });
+ private async handleFunctionCall(output: any) {
+ try {
+ const context = JSON.parse(output.arguments);
+ const tool = this.tools?.[output.name];
+ if (!tool) {
+ console.warn(`Tool "${output.name}" not found`);
+ return;
+ }
+ const result = await tool?.execute?.(
+ { context },
+ {
+ toolCallId: 'unknown',
+ messages: [],
+ },
+ );
+ this.sendEvent('conversation.item.create', {
+ item: {
+ type: 'function_call_output',
+ call_id: output.call_id,
+ output: JSON.stringify(result),
+ },
+ });
+ } catch (e) {
+ const err = e as Error;
+ console.warn(`Error calling tool "${output.name}":`, err.message);
+ this.sendEvent('conversation.item.create', {
+ item: {
+ type: 'function_call_output',
+ call_id: output.call_id,
+ output: JSON.stringify({ error: err.message }),
+ },
+ });
+ } finally {
+ this.sendEvent('response.create', {});
+ }
  }

  private int16ArrayToBase64(int16Array: Int16Array): string {
@@ -540,4 +646,13 @@ export class OpenAIRealtimeVoice extends MastraVoice {
  }
  return btoa(binary);
  }
+
+ private sendEvent(type: string, data: any) {
+ this.ws.send(
+ JSON.stringify({
+ type: type,
+ ...data,
+ }),
+ );
+ }
  }
package/src/utils.ts CHANGED
@@ -37,6 +37,7 @@ export const transformTools = (tools?: TTools) => {
  continue;
  }
  const openaiTool = {
+ type: 'function',
  name,
  description: tool.description || `Tool: ${name}`,
  parameters,
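
As a companion sketch (illustrative only, based on the handlers added in setupEventListeners above): audio for each response is exposed both as 'speaking' events and as a per-response PassThrough stream published via the new 'speaker' event, assistant transcripts arrive as 'writing' deltas, and whisper-1 transcriptions of input audio arrive as 'transcribing' deltas. Assuming the class's existing on() subscription method (unchanged in this diff) and a playback helper of your own:

  voice.on('speaker', stream => {
    // stream is a PassThrough tagged with the response id; PCM16 audio decoded from
    // response.audio.delta is written to it, and it ends on response.audio.done.
    playPcm16Stream(stream, stream.id); // playPcm16Stream is a hypothetical helper, not part of the package
  });

  voice.on('writing', ({ text }) => {
    process.stdout.write(text); // assistant transcript deltas; '\n' is emitted on the *.done events
  });

  voice.on('transcribing', ({ text }) => {
    process.stdout.write(`[user] ${text}`); // transcription of relayed input audio
  });

  // Relay microphone audio; stream chunks are base64-encoded into
  // input_audio_buffer.append events (micStream is a hypothetical PCM16 source).
  await voice.send(micStream);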