@mastra/voice-openai-realtime 0.0.5-alpha.0 → 0.1.0-alpha.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
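
For orientation, the main API change in this release is that OpenAIRealtimeVoice now opens its own WebSocket to the OpenAI realtime endpoint instead of wrapping RealtimeClient from openai-realtime-api; the former chatModel.options block is replaced by flat chatModel.instructions / chatModel.url fields plus a top-level debug flag. A minimal usage sketch against the new constructor signature shown in the diff below (the sketch is illustrative only and is not part of the published package):

  import { OpenAIRealtimeVoice } from '@mastra/voice-openai-realtime';

  const voice = new OpenAIRealtimeVoice({
    chatModel: {
      apiKey: process.env.OPENAI_API_KEY,                 // falls back to OPENAI_API_KEY when omitted
      model: 'gpt-4o-mini-realtime-preview-2024-12-17',   // DEFAULT_MODEL
      instructions: 'You are a helpful voice assistant.', // sent via session.update during connect()
    },
    speaker: 'alloy',  // DEFAULT_VOICE
    debug: true,       // logs every server event received over the socket
  });

  await voice.connect();              // waits for the socket to open and for session.created
  await voice.speak('Hello there!');  // issues a response.create event
  voice.close();                      // closes the underlying WebSocket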
package/src/index.ts CHANGED
@@ -1,54 +1,57 @@
+ import { EventEmitter } from 'events';
+ import { PassThrough } from 'stream';
  import type { ToolsInput } from '@mastra/core/agent';
  import { MastraVoice } from '@mastra/core/voice';
- import { RealtimeClient } from 'openai-realtime-api';
- import type { Realtime } from 'openai-realtime-api';
+ import type { Realtime, RealtimeServerEvents } from 'openai-realtime-api';
+ import { WebSocket } from 'ws';
  import { isReadableStream, transformTools } from './utils';

- /**
- * Available event types that can be listened to
- */
- type VoiceEventType =
- | 'speak' // Emitted when starting to speak
- | 'writing' // Emitted while speaking with audio data
- | 'error'; // Emitted when an error occurs
-
  /**
  * Event callback function type
  */
  type EventCallback = (...args: any[]) => void;

+ type StreamWithId = PassThrough & { id: string };
+
  /**
  * Map of event types to their callback arrays
  */
  type EventMap = {
- [K in VoiceEventType]: EventCallback[];
+ transcribing: [{ text: string }];
+ writing: [{ text: string }];
+ speaking: [{ audio: string }];
+ speaker: [StreamWithId];
+ error: [Error];
  } & {
  [key: string]: EventCallback[];
  };

  /** Default voice for text-to-speech. 'alloy' provides a neutral, balanced voice suitable for most use cases */
- const DEFAULT_VOICE = 'alloy';
+ const DEFAULT_VOICE: Realtime.Voice = 'alloy';
+
+ const DEFAULT_URL = 'wss://api.openai.com/v1/realtime';

  /**
  * Default model for real-time voice interactions.
  * This model is optimized for low-latency responses while maintaining high quality output.
  */
  const DEFAULT_MODEL = 'gpt-4o-mini-realtime-preview-2024-12-17';
- /**
- * Default Voice Activity Detection (VAD) configuration.
- * These settings control how the system detects speech segments.
- *
- * @property {string} type - Uses server-side VAD for better accuracy
- * @property {number} threshold - Speech detection sensitivity (0.5 = balanced)
- * @property {number} prefix_padding_ms - Includes 1 second of audio before speech
- * @property {number} silence_duration_ms - Waits 1 second of silence before ending turn
- */
- const DEFAULT_VAD_CONFIG = {
- type: 'server_vad',
- threshold: 0.5,
- prefix_padding_ms: 1000,
- silence_duration_ms: 1000,
- } as Realtime.TurnDetection;
+
+ // /**
+ // * Default Voice Activity Detection (VAD) configuration.
+ // * These settings control how the system detects speech segments.
+ // *
+ // * @property {string} type - Uses server-side VAD for better accuracy
+ // * @property {number} threshold - Speech detection sensitivity (0.5 = balanced)
+ // * @property {number} prefix_padding_ms - Includes 1 second of audio before speech
+ // * @property {number} silence_duration_ms - Waits 1 second of silence before ending turn
+ // */
+ // const DEFAULT_VAD_CONFIG = {
+ // type: 'server_vad',
+ // threshold: 0.5,
+ // prefix_padding_ms: 1000,
+ // silence_duration_ms: 1000,
+ // } as Realtime.TurnDetection;

  type TTools = ToolsInput;

@@ -62,6 +65,13 @@ type TTools = ToolsInput;
  */
  const VOICES = ['alloy', 'ash', 'ballad', 'coral', 'echo', 'sage', 'shimmer', 'verse'];

+ type RealtimeClientServerEventMap = {
+ [K in RealtimeServerEvents.EventType]: [RealtimeServerEvents.EventMap[K]];
+ } & {
+ ['conversation.item.input_audio_transcription.delta']: [{ delta: string; response_id: string }];
+ ['conversation.item.input_audio_transcription.done']: [{ response_id: string }];
+ };
+
  /**
  * OpenAIRealtimeVoice provides real-time voice interaction capabilities using OpenAI's
  * WebSocket-based API. It supports:
@@ -94,10 +104,13 @@ const VOICES = ['alloy', 'ash', 'ballad', 'coral', 'echo', 'sage', 'shimmer', 'v
  * ```
  */
  export class OpenAIRealtimeVoice extends MastraVoice {
- private client: RealtimeClient;
+ private ws: WebSocket;
  private state: 'close' | 'open';
+ private client: EventEmitter<RealtimeClientServerEventMap>;
  private events: EventMap;
- tools?: TTools;
+ private instructions?: string;
+ private tools?: TTools;
+ private debug: boolean;

  /**
  * Creates a new instance of OpenAIRealtimeVoice.
@@ -107,13 +120,8 @@ export class OpenAIRealtimeVoice extends MastraVoice {
  * @param options.chatModel.model - The model ID to use (defaults to GPT-4 Mini Realtime)
  * @param options.chatModel.apiKey - OpenAI API key. Falls back to process.env.OPENAI_API_KEY
  * @param options.chatModel.tools - Tools configuration for the model
- * @param options.chatModel.options - Additional options for the realtime client
- * @param options.chatModel.options.sessionConfig - Session configuration overrides
- * @param options.chatModel.options.url - Custom WebSocket URL
- * @param options.chatModel.options.dangerouslyAllowAPIKeyInBrowser - Whether to allow API key in browser
- * @param options.chatModel.options.debug - Enable debug logging
- * @param options.chatModel.options.tools - Additional tools configuration
  * @param options.speaker - Voice ID to use (defaults to 'alloy')
+ * @param options.debug - Enable debug mode
  *
  * @example
  * ```typescript
@@ -129,40 +137,37 @@ export class OpenAIRealtimeVoice extends MastraVoice {
  constructor({
  chatModel,
  speaker,
+ debug = false,
  }: {
  chatModel?: {
  model?: string;
  apiKey?: string;
  tools?: TTools;
- options?: {
- sessionConfig?: Realtime.SessionConfig;
- url?: string;
- dangerouslyAllowAPIKeyInBrowser?: boolean;
- debug?: boolean;
- tools?: TTools;
- };
+ instructions?: string;
+ url?: string;
  };
  speaker?: Realtime.Voice;
+ debug?: boolean;
  } = {}) {
  super();
- this.client = new RealtimeClient({
- apiKey: chatModel?.apiKey || process.env.OPENAI_API_KEY,
- model: chatModel?.model || DEFAULT_MODEL,
- ...chatModel?.options,
- sessionConfig: {
- voice: speaker || DEFAULT_VOICE,
- turn_detection: DEFAULT_VAD_CONFIG,
- ...chatModel?.options?.sessionConfig,
+
+ const url = `${chatModel?.url || DEFAULT_URL}?model=${chatModel?.model || DEFAULT_MODEL}`;
+ const apiKey = chatModel?.apiKey || process.env.OPENAI_API_KEY;
+ this.ws = new WebSocket(url, undefined, {
+ headers: {
+ Authorization: 'Bearer ' + apiKey,
+ 'OpenAI-Beta': 'realtime=v1',
  },
  });

+ this.client = new EventEmitter();
  this.state = 'close';
  this.events = {} as EventMap;
+ this.tools = chatModel?.tools;
+ this.instructions = chatModel?.instructions;
+ this.speaker = speaker || DEFAULT_VOICE;
+ this.debug = debug;
  this.setupEventListeners();
-
- if (chatModel?.tools) {
- this.addTools(chatModel.tools);
- }
  }

  /**
@@ -190,8 +195,8 @@ export class OpenAIRealtimeVoice extends MastraVoice {
  * ```
  */
  close() {
- if (!this.client) return;
- this.client.disconnect();
+ if (!this.ws) return;
+ this.ws.close();
  this.state = 'close';
  }

@@ -212,10 +217,10 @@ export class OpenAIRealtimeVoice extends MastraVoice {
  * ```
  */
  addTools(tools?: TTools) {
- const transformedTools = transformTools(tools);
- for (const tool of transformedTools) {
- this.client.addTool(tool.openaiTool, tool.execute);
- }
+ const openaiTools = transformTools(tools);
+ this.updateConfig({
+ tools: openaiTools.map(t => t.openaiTool),
+ });
  }

  /**
@@ -254,7 +259,7 @@ export class OpenAIRealtimeVoice extends MastraVoice {
  throw new Error('Input text is empty');
  }

- this.client.realtime.send('response.create', {
+ this.sendEvent('response.create', {
  response: {
  instructions: `Repeat the following text: ${input}`,
  voice: options?.speaker ? options.speaker : undefined,
@@ -280,8 +285,8 @@ export class OpenAIRealtimeVoice extends MastraVoice {
  * });
  * ```
  */
- updateConfig(sessionConfig: Realtime.SessionConfig): void {
- this.client.updateSession(sessionConfig);
+ updateConfig(sessionConfig: unknown): void {
+ this.sendEvent('session.update', { session: sessionConfig });
  }

  /**
@@ -319,7 +324,7 @@ export class OpenAIRealtimeVoice extends MastraVoice {
  const int16Array = new Int16Array(buffer.buffer, buffer.byteOffset ?? 0, (buffer.byteLength ?? 0) / 2);
  const base64Audio = this.int16ArrayToBase64(int16Array);

- this.client.realtime.send('conversation.item.create', {
+ this.sendEvent('conversation.item.create', {
  item: {
  type: 'message',
  role: 'user',
@@ -327,7 +332,7 @@ export class OpenAIRealtimeVoice extends MastraVoice {
  },
  });

- this.client.realtime.send('response.create', {
+ this.sendEvent('response.create', {
  response: {
  modalities: ['text'],
  instructions: `ONLY repeat the input and DO NOT say anything else`,
@@ -338,6 +343,18 @@ export class OpenAIRealtimeVoice extends MastraVoice {
  }
  }

+ waitForOpen() {
+ return new Promise(resolve => {
+ this.ws.on('open', resolve);
+ });
+ }
+
+ waitForSessionCreated() {
+ return new Promise(resolve => {
+ this.client.on('session.created', resolve);
+ });
+ }
+
  /**
  * Establishes a connection to the OpenAI realtime service.
  * Must be called before using speak, listen, or relay functions.
@@ -351,8 +368,18 @@ export class OpenAIRealtimeVoice extends MastraVoice {
  * ```
  */
  async connect() {
- await this.client.connect();
- await this.client.waitForSessionCreated();
+ await this.waitForOpen();
+ await this.waitForSessionCreated();
+
+ const openaiTools = transformTools(this.tools);
+ this.updateConfig({
+ instructions: this.instructions,
+ tools: openaiTools.map(t => t.openaiTool),
+ input_audio_transcription: {
+ model: 'whisper-1',
+ },
+ voice: this.speaker,
+ });
  this.state = 'open';
  }

@@ -374,7 +401,7 @@ export class OpenAIRealtimeVoice extends MastraVoice {
  * await voice.relay(micStream);
  * ```
  */
- async send(audioData: NodeJS.ReadableStream | Int16Array): Promise<void> {
+ async send(audioData: NodeJS.ReadableStream | Int16Array, eventId?: string): Promise<void> {
  if (!this.state || this.state !== 'open') {
  console.warn('Cannot relay audio when not open. Call open() first.');
  return;
@@ -385,15 +412,14 @@ export class OpenAIRealtimeVoice extends MastraVoice {
  stream.on('data', chunk => {
  try {
  const buffer = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk);
- const int16Array = new Int16Array(buffer.buffer, buffer.byteOffset, buffer.byteLength / 2);
- this.client.appendInputAudio(int16Array);
+ this.sendEvent('input_audio_buffer.append', { audio: buffer.toString('base64'), event_id: eventId });
  } catch (err) {
  this.emit('error', err);
  }
  });
  } else if (audioData instanceof Int16Array) {
  try {
- this.client.appendInputAudio(audioData);
+ this.sendEvent('input_audio_buffer.append', { audio: audioData, event_id: eventId });
  } catch (err) {
  this.emit('error', err);
  }
@@ -421,7 +447,7 @@ export class OpenAIRealtimeVoice extends MastraVoice {
  * });
  */
  async answer({ options }: { options?: Realtime.ResponseConfig }) {
- this.client.realtime.send('response.create', { response: options ?? {} });
+ this.sendEvent('response.create', { response: options ?? {} });
  }

  /**
@@ -496,35 +522,115 @@ export class OpenAIRealtimeVoice extends MastraVoice {
  }

  private setupEventListeners(): void {
- this.client.on('error', error => {
- this.emit('error', error);
+ const speakerStreams = new Map<string, StreamWithId>();
+
+ this.ws.on('message', message => {
+ const data = JSON.parse(message.toString());
+ this.client.emit(data.type, data);
+
+ if (this.debug) {
+ const { delta, ...fields } = data;
+ console.log(data.type, fields, delta?.length < 100 ? delta : '');
+ }
  });

- this.client.on('conversation.created', conversation => {
- this.emit('openAIRealtime:conversation.created', conversation);
+ this.client.on('session.created', ev => {
+ this.emit('session.created', ev);
+ });
+ this.client.on('session.updated', ev => {
+ this.emit('session.updated', ev);
  });
+ this.client.on('response.created', ev => {
+ this.emit('response.created', ev);
+
+ const speakerStream = new PassThrough() as StreamWithId;
+
+ speakerStream.id = ev.response.id;

- this.client.on('conversation.interrupted', () => {
- this.emit('openAIRealtime:conversation.interrupted');
+ speakerStreams.set(ev.response.id, speakerStream);
+ this.emit('speaker', speakerStream);
  });
+ this.client.on('conversation.item.input_audio_transcription.delta', ev => {
+ this.emit('transcribing', { text: ev.delta, response_id: ev.response_id, role: 'user' });
+ });
+ this.client.on('conversation.item.input_audio_transcription.done', ev => {
+ this.emit('transcribing', { text: '\n', response_id: ev.response_id, role: 'user' });
+ });
+ this.client.on('response.audio.delta', ev => {
+ const audio = Buffer.from(ev.delta, 'base64');
+ this.emit('speaking', { audio, response_id: ev.response_id });

- this.client.on('conversation.updated', ({ delta }) => {
- if (delta?.audio) {
- this.emit('speaking', { audio: delta.audio });
- }
+ const stream = speakerStreams.get(ev.response_id);
+ stream?.write(audio);
  });
+ this.client.on('response.audio.done', ev => {
+ this.emit('speaking.done', { response_id: ev.response_id });

- this.client.on('conversation.item.appended', item => {
- this.emit('openAIRealtime:conversation.item.appended', item);
+ const stream = speakerStreams.get(ev.response_id);
+ stream?.end();
  });
+ this.client.on('response.audio_transcript.delta', ev => {
+ this.emit('writing', { text: ev.delta, response_id: ev.response_id });
+ });
+ this.client.on('response.audio_transcript.done', ev => {
+ this.emit('writing', { text: '\n', response_id: ev.response_id });
+ });
+ this.client.on('response.text.delta', ev => {
+ this.emit('writing', { text: ev.delta, response_id: ev.response_id });
+ });
+ this.client.on('response.text.done', ev => {
+ this.emit('writing', { text: '\n', response_id: ev.response_id });
+ });
+ this.client.on('response.done', async ev => {
+ await this.handleFunctionCalls(ev);
+ this.emit('response.done', ev);
+ speakerStreams.delete(ev.response.id);
+ });
+ }

- this.client.on('conversation.item.completed', ({ item, delta }) => {
- if (item.formatted.transcript) {
- this.emit('writing', { text: item.formatted.transcript, role: item.role });
+ private async handleFunctionCalls(ev: any) {
+ for (const output of ev.response?.output ?? []) {
+ if (output.type === 'function_call') {
+ await this.handleFunctionCall(output);
  }
+ }
+ }

- this.emit('openAIRealtime:conversation.item.completed', { item, delta });
- });
+ private async handleFunctionCall(output: any) {
+ try {
+ const context = JSON.parse(output.arguments);
+ const tool = this.tools?.[output.name];
+ if (!tool) {
+ console.warn(`Tool "${output.name}" not found`);
+ return;
+ }
+ const result = await tool?.execute?.(
+ { context },
+ {
+ toolCallId: 'unknown',
+ messages: [],
+ },
+ );
+ this.sendEvent('conversation.item.create', {
+ item: {
+ type: 'function_call_output',
+ call_id: output.call_id,
+ output: JSON.stringify(result),
+ },
+ });
+ } catch (e) {
+ const err = e as Error;
+ console.warn(`Error calling tool "${output.name}":`, err.message);
+ this.sendEvent('conversation.item.create', {
+ item: {
+ type: 'function_call_output',
+ call_id: output.call_id,
+ output: JSON.stringify({ error: err.message }),
+ },
+ });
+ } finally {
+ this.sendEvent('response.create', {});
+ }
  }

  private int16ArrayToBase64(int16Array: Int16Array): string {
@@ -540,4 +646,13 @@ export class OpenAIRealtimeVoice extends MastraVoice {
  }
  return btoa(binary);
  }
+
+ private sendEvent(type: string, data: any) {
+ this.ws.send(
+ JSON.stringify({
+ type: type,
+ ...data,
+ }),
+ );
+ }
  }
package/src/utils.ts CHANGED
@@ -37,6 +37,7 @@ export const transformTools = (tools?: TTools) => {
  continue;
  }
  const openaiTool = {
+ type: 'function',
  name,
  description: tool.description || `Tool: ${name}`,
  parameters,
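
As a companion sketch (illustrative only, based on the handlers added in setupEventListeners above): audio for each response is exposed both as 'speaking' events and as a per-response PassThrough stream published via the new 'speaker' event, assistant transcripts arrive as 'writing' deltas, and whisper-1 transcriptions of input audio arrive as 'transcribing' deltas. Assuming the class's existing on() subscription method (unchanged in this diff) and a playback helper of your own:

  voice.on('speaker', stream => {
    // stream is a PassThrough tagged with the response id; PCM16 audio decoded from
    // response.audio.delta is written to it, and it ends on response.audio.done.
    playPcm16Stream(stream, stream.id); // playPcm16Stream is a hypothetical helper, not part of the package
  });

  voice.on('writing', ({ text }) => {
    process.stdout.write(text); // assistant transcript deltas; '\n' is emitted on the *.done events
  });

  voice.on('transcribing', ({ text }) => {
    process.stdout.write(`[user] ${text}`); // transcription of relayed input audio
  });

  // Relay microphone audio; stream chunks are base64-encoded into
  // input_audio_buffer.append events (micStream is a hypothetical PCM16 source).
  await voice.send(micStream);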