@mastra/voice-openai-realtime 0.0.5-alpha.0 → 0.1.0-alpha.2

This diff compares the contents of two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registry.
package/dist/index.js CHANGED
@@ -1,6 +1,7 @@
+ import { EventEmitter } from 'events';
+ import { PassThrough, Readable } from 'stream';
  import { MastraVoice } from '@mastra/core/voice';
- import { RealtimeClient } from 'openai-realtime-api';
- import { Readable } from 'stream';
+ import { WebSocket } from 'ws';
  import { zodToJsonSchema } from 'zod-to-json-schema';
 
  // src/index.ts
@@ -27,6 +28,7 @@ var transformTools = (tools) => {
  continue;
  }
  const openaiTool = {
+ type: "function",
  name,
  description: tool.description || `Tool: ${name}`,
  parameters
@@ -64,19 +66,17 @@ var isReadableStream = (obj) => {
 
  // src/index.ts
  var DEFAULT_VOICE = "alloy";
+ var DEFAULT_URL = "wss://api.openai.com/v1/realtime";
  var DEFAULT_MODEL = "gpt-4o-mini-realtime-preview-2024-12-17";
- var DEFAULT_VAD_CONFIG = {
- type: "server_vad",
- threshold: 0.5,
- prefix_padding_ms: 1e3,
- silence_duration_ms: 1e3
- };
  var VOICES = ["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse"];
  var OpenAIRealtimeVoice = class extends MastraVoice {
- client;
+ ws;
  state;
+ client;
  events;
+ instructions;
  tools;
+ debug;
  /**
  * Creates a new instance of OpenAIRealtimeVoice.
  *
@@ -85,13 +85,8 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
  * @param options.chatModel.model - The model ID to use (defaults to GPT-4 Mini Realtime)
  * @param options.chatModel.apiKey - OpenAI API key. Falls back to process.env.OPENAI_API_KEY
  * @param options.chatModel.tools - Tools configuration for the model
- * @param options.chatModel.options - Additional options for the realtime client
- * @param options.chatModel.options.sessionConfig - Session configuration overrides
- * @param options.chatModel.options.url - Custom WebSocket URL
- * @param options.chatModel.options.dangerouslyAllowAPIKeyInBrowser - Whether to allow API key in browser
- * @param options.chatModel.options.debug - Enable debug logging
- * @param options.chatModel.options.tools - Additional tools configuration
  * @param options.speaker - Voice ID to use (defaults to 'alloy')
+ * @param options.debug - Enable debug mode
  *
  * @example
  * ```typescript
@@ -106,25 +101,26 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
  */
  constructor({
  chatModel,
- speaker
+ speaker,
+ debug = false
  } = {}) {
  super();
- this.client = new RealtimeClient({
- apiKey: chatModel?.apiKey || process.env.OPENAI_API_KEY,
- model: chatModel?.model || DEFAULT_MODEL,
- ...chatModel?.options,
- sessionConfig: {
- voice: speaker || DEFAULT_VOICE,
- turn_detection: DEFAULT_VAD_CONFIG,
- ...chatModel?.options?.sessionConfig
+ const url = `${chatModel?.url || DEFAULT_URL}?model=${chatModel?.model || DEFAULT_MODEL}`;
+ const apiKey = chatModel?.apiKey || process.env.OPENAI_API_KEY;
+ this.ws = new WebSocket(url, void 0, {
+ headers: {
+ Authorization: "Bearer " + apiKey,
+ "OpenAI-Beta": "realtime=v1"
  }
  });
+ this.client = new EventEmitter();
  this.state = "close";
  this.events = {};
+ this.tools = chatModel?.tools;
+ this.instructions = chatModel?.instructions;
+ this.speaker = speaker || DEFAULT_VOICE;
+ this.debug = debug;
  this.setupEventListeners();
- if (chatModel?.tools) {
- this.addTools(chatModel.tools);
- }
  }
  /**
  * Returns a list of available voice speakers.
@@ -150,8 +146,8 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
  * ```
  */
  close() {
- if (!this.client) return;
- this.client.disconnect();
+ if (!this.ws) return;
+ this.ws.close();
  this.state = "close";
  }
  /**
@@ -171,10 +167,10 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
  * ```
  */
  addTools(tools) {
- const transformedTools = transformTools(tools);
- for (const tool of transformedTools) {
- this.client.addTool(tool.openaiTool, tool.execute);
- }
+ const openaiTools = transformTools(tools);
+ this.updateConfig({
+ tools: openaiTools.map((t) => t.openaiTool)
+ });
  }
  /**
  * Emits a speaking event using the configured voice model.
@@ -210,7 +206,7 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
  if (input.trim().length === 0) {
  throw new Error("Input text is empty");
  }
- this.client.realtime.send("response.create", {
+ this.sendEvent("response.create", {
  response: {
  instructions: `Repeat the following text: ${input}`,
  voice: options?.speaker ? options.speaker : void 0
@@ -236,7 +232,7 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
  * ```
  */
  updateConfig(sessionConfig) {
- this.client.updateSession(sessionConfig);
+ this.sendEvent("session.update", { session: sessionConfig });
  }
  /**
  * Processes audio input for speech recognition.
@@ -271,14 +267,14 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
  const buffer = Buffer.concat(chunks);
  const int16Array = new Int16Array(buffer.buffer, buffer.byteOffset ?? 0, (buffer.byteLength ?? 0) / 2);
  const base64Audio = this.int16ArrayToBase64(int16Array);
- this.client.realtime.send("conversation.item.create", {
+ this.sendEvent("conversation.item.create", {
  item: {
  type: "message",
  role: "user",
  content: [{ type: "input_audio", audio: base64Audio }]
  }
  });
- this.client.realtime.send("response.create", {
+ this.sendEvent("response.create", {
  response: {
  modalities: ["text"],
  instructions: `ONLY repeat the input and DO NOT say anything else`
@@ -288,6 +284,16 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
  this.emit("error", new Error("Unsupported audio data format"));
  }
  }
+ waitForOpen() {
+ return new Promise((resolve) => {
+ this.ws.on("open", resolve);
+ });
+ }
+ waitForSessionCreated() {
+ return new Promise((resolve) => {
+ this.client.on("session.created", resolve);
+ });
+ }
  /**
  * Establishes a connection to the OpenAI realtime service.
  * Must be called before using speak, listen, or relay functions.
@@ -301,8 +307,17 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
  * ```
  */
  async connect() {
- await this.client.connect();
- await this.client.waitForSessionCreated();
+ await this.waitForOpen();
+ await this.waitForSessionCreated();
+ const openaiTools = transformTools(this.tools);
+ this.updateConfig({
+ instructions: this.instructions,
+ tools: openaiTools.map((t) => t.openaiTool),
+ input_audio_transcription: {
+ model: "whisper-1"
+ },
+ voice: this.speaker
+ });
  this.state = "open";
  }
  /**
@@ -323,7 +338,7 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
  * await voice.relay(micStream);
  * ```
  */
- async send(audioData) {
+ async send(audioData, eventId) {
  if (!this.state || this.state !== "open") {
  console.warn("Cannot relay audio when not open. Call open() first.");
  return;
@@ -333,15 +348,14 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
  stream.on("data", (chunk) => {
  try {
  const buffer = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk);
- const int16Array = new Int16Array(buffer.buffer, buffer.byteOffset, buffer.byteLength / 2);
- this.client.appendInputAudio(int16Array);
+ this.sendEvent("input_audio_buffer.append", { audio: buffer.toString("base64"), event_id: eventId });
  } catch (err) {
  this.emit("error", err);
  }
  });
  } else if (audioData instanceof Int16Array) {
  try {
- this.client.appendInputAudio(audioData);
+ this.sendEvent("input_audio_buffer.append", { audio: audioData, event_id: eventId });
  } catch (err) {
  this.emit("error", err);
  }
@@ -368,7 +382,7 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
  * });
  */
  async answer({ options }) {
- this.client.realtime.send("response.create", { response: options ?? {} });
+ this.sendEvent("response.create", { response: options ?? {} });
  }
  /**
  * Registers an event listener for voice events.
@@ -437,30 +451,106 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
  }
  }
  setupEventListeners() {
- this.client.on("error", (error) => {
- this.emit("error", error);
+ const speakerStreams = /* @__PURE__ */ new Map();
+ this.ws.on("message", (message) => {
+ const data = JSON.parse(message.toString());
+ this.client.emit(data.type, data);
+ if (this.debug) {
+ const { delta, ...fields } = data;
+ console.log(data.type, fields, delta?.length < 100 ? delta : "");
+ }
  });
- this.client.on("conversation.created", (conversation) => {
- this.emit("openAIRealtime:conversation.created", conversation);
+ this.client.on("session.created", (ev) => {
+ this.emit("session.created", ev);
  });
- this.client.on("conversation.interrupted", () => {
- this.emit("openAIRealtime:conversation.interrupted");
+ this.client.on("session.updated", (ev) => {
+ this.emit("session.updated", ev);
  });
- this.client.on("conversation.updated", ({ delta }) => {
- if (delta?.audio) {
- this.emit("speaking", { audio: delta.audio });
- }
+ this.client.on("response.created", (ev) => {
+ this.emit("response.created", ev);
+ const speakerStream = new PassThrough();
+ speakerStream.id = ev.response.id;
+ speakerStreams.set(ev.response.id, speakerStream);
+ this.emit("speaker", speakerStream);
  });
- this.client.on("conversation.item.appended", (item) => {
- this.emit("openAIRealtime:conversation.item.appended", item);
+ this.client.on("conversation.item.input_audio_transcription.delta", (ev) => {
+ this.emit("transcribing", { text: ev.delta, response_id: ev.response_id, role: "user" });
  });
- this.client.on("conversation.item.completed", ({ item, delta }) => {
- if (item.formatted.transcript) {
- this.emit("writing", { text: item.formatted.transcript, role: item.role });
- }
- this.emit("openAIRealtime:conversation.item.completed", { item, delta });
+ this.client.on("conversation.item.input_audio_transcription.done", (ev) => {
+ this.emit("transcribing", { text: "\n", response_id: ev.response_id, role: "user" });
+ });
+ this.client.on("response.audio.delta", (ev) => {
+ const audio = Buffer.from(ev.delta, "base64");
+ this.emit("speaking", { audio, response_id: ev.response_id });
+ const stream = speakerStreams.get(ev.response_id);
+ stream?.write(audio);
+ });
+ this.client.on("response.audio.done", (ev) => {
+ this.emit("speaking.done", { response_id: ev.response_id });
+ const stream = speakerStreams.get(ev.response_id);
+ stream?.end();
+ });
+ this.client.on("response.audio_transcript.delta", (ev) => {
+ this.emit("writing", { text: ev.delta, response_id: ev.response_id });
+ });
+ this.client.on("response.audio_transcript.done", (ev) => {
+ this.emit("writing", { text: "\n", response_id: ev.response_id });
+ });
+ this.client.on("response.text.delta", (ev) => {
+ this.emit("writing", { text: ev.delta, response_id: ev.response_id });
+ });
+ this.client.on("response.text.done", (ev) => {
+ this.emit("writing", { text: "\n", response_id: ev.response_id });
+ });
+ this.client.on("response.done", async (ev) => {
+ await this.handleFunctionCalls(ev);
+ this.emit("response.done", ev);
+ speakerStreams.delete(ev.response.id);
  });
  }
+ async handleFunctionCalls(ev) {
+ for (const output of ev.response?.output ?? []) {
+ if (output.type === "function_call") {
+ await this.handleFunctionCall(output);
+ }
+ }
+ }
+ async handleFunctionCall(output) {
+ try {
+ const context = JSON.parse(output.arguments);
+ const tool = this.tools?.[output.name];
+ if (!tool) {
+ console.warn(`Tool "${output.name}" not found`);
+ return;
+ }
+ const result = await tool?.execute?.(
+ { context },
+ {
+ toolCallId: "unknown",
+ messages: []
+ }
+ );
+ this.sendEvent("conversation.item.create", {
+ item: {
+ type: "function_call_output",
+ call_id: output.call_id,
+ output: JSON.stringify(result)
+ }
+ });
+ } catch (e) {
+ const err = e;
+ console.warn(`Error calling tool "${output.name}":`, err.message);
+ this.sendEvent("conversation.item.create", {
+ item: {
+ type: "function_call_output",
+ call_id: output.call_id,
+ output: JSON.stringify({ error: err.message })
+ }
+ });
+ } finally {
+ this.sendEvent("response.create", {});
+ }
+ }
  int16ArrayToBase64(int16Array) {
  const buffer = new ArrayBuffer(int16Array.length * 2);
  const view = new DataView(buffer);
@@ -474,6 +564,14 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
  }
  return btoa(binary);
  }
+ sendEvent(type, data) {
+ this.ws.send(
+ JSON.stringify({
+ type,
+ ...data
+ })
+ );
+ }
  };
 
  export { OpenAIRealtimeVoice };
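
For orientation, here is a minimal usage sketch of the class as it stands after this diff. The constructor options (`chatModel`, `speaker`, `debug`), the `connect()` handshake, and the `speaker`/`writing` events are taken from the code above; the audio-sink wiring is illustrative only.

```typescript
// Sketch of the post-diff API surface; not part of the package diff itself.
import { OpenAIRealtimeVoice } from '@mastra/voice-openai-realtime';

const voice = new OpenAIRealtimeVoice({
  chatModel: { apiKey: process.env.OPENAI_API_KEY },
  speaker: 'alloy',
  debug: true, // new option in this release: logs each raw realtime event
});

// connect() now resolves once the WebSocket opens and "session.created"
// arrives, then pushes instructions/tools/voice via a "session.update".
await voice.connect();

// One PassThrough stream is emitted per response; pipe it to an audio sink.
voice.on('speaker', (stream) => stream.pipe(process.stdout /* placeholder sink */));
voice.on('writing', ({ text }) => process.stdout.write(text));

await voice.speak('Hello from the realtime API');
```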
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@mastra/voice-openai-realtime",
- "version": "0.0.5-alpha.0",
+ "version": "0.1.0-alpha.2",
  "description": "Mastra OpenAI Realtime API integration",
  "type": "module",
  "main": "dist/index.js",
@@ -20,16 +20,18 @@
  },
  "dependencies": {
  "openai-realtime-api": "^1.0.7",
+ "ws": "^8.18.1",
  "zod-to-json-schema": "^3.24.1",
- "@mastra/core": "^0.6.5-alpha.0"
+ "@mastra/core": "^0.7.0-alpha.2"
  },
  "devDependencies": {
  "@microsoft/api-extractor": "^7.49.2",
  "@types/node": "^22.13.1",
- "eslint": "^9.20.1",
+ "@types/ws": "^8.18.0",
+ "eslint": "^9.23.0",
  "tsup": "^8.3.6",
  "typescript": "^5.7.3",
- "vitest": "^2.1.8",
+ "vitest": "^2.1.9",
  "@internal/lint": "0.0.1"
  },
  "scripts": {