@mastra/voice-openai-realtime 0.0.1-alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs ADDED
@@ -0,0 +1,481 @@
+ 'use strict';
+
+ var voice = require('@mastra/core/voice');
+ var openaiRealtimeApi = require('openai-realtime-api');
+ var stream = require('stream');
+ var zodToJsonSchema = require('zod-to-json-schema');
+
+ // src/index.ts
+ var transformTools = (tools) => {
+   const openaiTools = [];
+   for (const [name, tool] of Object.entries(tools || {})) {
+     let parameters;
+     if ("inputSchema" in tool && tool.inputSchema) {
+       if (typeof tool.inputSchema === "object" && tool.inputSchema._def && tool.inputSchema._def.typeName === "ZodObject") {
+         parameters = zodToJsonSchema.zodToJsonSchema(tool.inputSchema);
+         delete parameters.$schema;
+       } else {
+         parameters = tool.inputSchema;
+       }
+     } else if ("parameters" in tool) {
+       if (typeof tool.parameters === "object" && tool.parameters._def && tool.parameters._def.typeName === "ZodObject") {
+         parameters = zodToJsonSchema.zodToJsonSchema(tool.parameters);
+         delete parameters.$schema;
+       } else {
+         parameters = tool.parameters;
+       }
+     } else {
+       console.warn(`Tool ${name} has neither inputSchema nor parameters, skipping`);
+       continue;
+     }
+     const openaiTool = {
+       name,
+       description: tool.description || `Tool: ${name}`,
+       parameters
+     };
+     if (tool.execute) {
+       const executeAdapter = async (args) => {
+         try {
+           if (!tool.execute) {
+             throw new Error(`Tool ${name} has no execute function`);
+           }
+           if ("inputSchema" in tool) {
+             return await tool.execute({ context: args });
+           } else {
+             const options = {
+               toolCallId: "unknown",
+               messages: []
+             };
+             return await tool.execute(args, options);
+           }
+         } catch (error) {
+           console.error(`Error executing tool ${name}:`, error);
+           throw error;
+         }
+       };
+       openaiTools.push({ openaiTool, execute: executeAdapter });
+     } else {
+       console.warn(`Tool ${name} has no execute function, skipping`);
+     }
+   }
+   return openaiTools;
+ };
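Editor's note: a minimal sketch (not part of the published file) of what transformTools accepts and returns. The `weather` tool and its fields are hypothetical; it uses the Zod `parameters` + `execute(args, options)` shape, one of the two shapes the loop above handles:

const { z } = require('zod');

// Hypothetical tool: Zod `parameters` schema plus `execute`.
const weather = {
  description: 'Get the current temperature for a city',
  parameters: z.object({ city: z.string() }),
  execute: async ({ city }) => ({ city, tempF: 72 }),
};

const [entry] = transformTools({ weather });
// entry.openaiTool   => { name: 'weather', description: '...', parameters: <JSON Schema, $schema removed> }
// entry.execute(args) => forwards to weather.execute(args, { toolCallId: 'unknown', messages: [] })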
+ var isReadableStream = (obj) => {
+   return obj && obj instanceof stream.Readable && typeof obj.read === "function" && typeof obj.pipe === "function" && obj.readable === true;
+ };
+
+ // src/index.ts
+ var DEFAULT_VOICE = "alloy";
+ var DEFAULT_MODEL = "gpt-4o-mini-realtime-preview-2024-12-17";
+ var DEFAULT_VAD_CONFIG = {
+   type: "server_vad",
+   threshold: 0.5,
+   prefix_padding_ms: 1e3,
+   silence_duration_ms: 1e3
+ };
+ var VOICES = ["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse"];
+ var OpenAIRealtimeVoice = class extends voice.MastraVoice {
+   client;
+   state;
+   events;
+   tools;
+   /**
+    * Creates a new instance of OpenAIRealtimeVoice.
+    *
+    * @param options - Configuration options for the voice instance
+    * @param options.chatModel - Configuration for the chat model
+    * @param options.chatModel.model - The model ID to use (defaults to the GPT-4o mini realtime preview model)
+    * @param options.chatModel.apiKey - OpenAI API key. Falls back to process.env.OPENAI_API_KEY
+    * @param options.chatModel.tools - Tools configuration for the model
+    * @param options.chatModel.options - Additional options for the realtime client
+    * @param options.chatModel.options.sessionConfig - Session configuration overrides
+    * @param options.chatModel.options.url - Custom WebSocket URL
+    * @param options.chatModel.options.dangerouslyAllowAPIKeyInBrowser - Whether to allow the API key in the browser
+    * @param options.chatModel.options.debug - Enable debug logging
+    * @param options.chatModel.options.tools - Additional tools configuration
+    * @param options.speaker - Voice ID to use (defaults to 'alloy')
+    *
+    * @example
+    * ```typescript
+    * const voice = new OpenAIRealtimeVoice({
+    *   chatModel: {
+    *     apiKey: 'your-api-key',
+    *     model: 'gpt-4o-mini-realtime',
+    *   },
+    *   speaker: 'alloy'
+    * });
+    * ```
+    */
+   constructor({
+     chatModel,
+     speaker
+   } = {}) {
+     super();
+     this.client = new openaiRealtimeApi.RealtimeClient({
+       apiKey: chatModel?.apiKey || process.env.OPENAI_API_KEY,
+       model: chatModel?.model || DEFAULT_MODEL,
+       ...chatModel?.options,
+       sessionConfig: {
+         voice: speaker || DEFAULT_VOICE,
+         turn_detection: DEFAULT_VAD_CONFIG,
+         ...chatModel?.options?.sessionConfig
+       }
+     });
+     this.state = "close";
+     this.events = {};
+     this.setupEventListeners();
+     if (chatModel?.tools) {
+       this.addTools(chatModel.tools);
+     }
+   }
+   /**
+    * Returns a list of available voice speakers.
+    *
+    * @returns Promise resolving to an array of voice objects, each containing at least a voiceId
+    *
+    * @example
+    * ```typescript
+    * const speakers = await voice.getSpeakers();
+    * // speakers = [{ voiceId: 'alloy' }, { voiceId: 'echo' }, ...]
+    * ```
+    */
+   getSpeakers() {
+     return Promise.resolve(VOICES.map((v) => ({ voiceId: v })));
+   }
+   /**
+    * Disconnects from the OpenAI realtime session and cleans up resources.
+    * Should be called when you're done with the voice instance.
+    *
+    * @example
+    * ```typescript
+    * voice.close(); // Disconnects and cleans up
+    * ```
+    */
+   close() {
+     if (!this.client) return;
+     this.client.disconnect();
+     this.state = "close";
+   }
+   /**
+    * Equips the voice instance with a set of tools.
+    * Tools allow the model to perform additional actions during conversations.
+    *
+    * @param tools - Tools configuration to add
+    *
+    * @example
+    * ```typescript
+    * const tools = {
+    *   search: {
+    *     description: 'Search the web for a query',
+    *     parameters: z.object({ query: z.string() }),
+    *     execute: async ({ query }) => { ... }
+    *   }
+    * };
+    * voice.addTools(tools);
+    * ```
+    */
+   addTools(tools) {
+     const transformedTools = transformTools(tools);
+     for (const tool of transformedTools) {
+       this.client.addTool(tool.openaiTool, tool.execute);
+     }
+   }
+   /**
+    * Emits a speaking event using the configured voice model.
+    * Can accept either a string or a readable stream as input.
+    *
+    * @param input - The text to convert to speech, or a readable stream containing the text
+    * @param options - Optional configuration for this specific speech request
+    * @param options.speaker - Override the voice to use for this specific request
+    *
+    * @throws {Error} If the input text is empty
+    *
+    * @example
+    * ```typescript
+    * // Simple text to speech
+    * await voice.speak('Hello world');
+    *
+    * // With custom voice
+    * await voice.speak('Hello world', { speaker: 'echo' });
+    *
+    * // Using a stream
+    * const stream = fs.createReadStream('text.txt');
+    * await voice.speak(stream);
+    * ```
+    */
+   async speak(input, options) {
+     if (typeof input !== "string") {
+       const chunks = [];
+       for await (const chunk of input) {
+         chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(String(chunk)));
+       }
+       input = Buffer.concat(chunks).toString("utf-8");
+     }
+     if (input.trim().length === 0) {
+       throw new Error("Input text is empty");
+     }
+     this.client.realtime.send("response.create", {
+       response: {
+         instructions: `Repeat the following text: ${input}`,
+         voice: options?.speaker ? options.speaker : void 0
+       }
+     });
+   }
+   /**
+    * Updates the session configuration for the voice instance.
+    * This can be used to modify voice settings, turn detection, and other parameters.
+    *
+    * @param sessionConfig - New session configuration to apply
+    *
+    * @example
+    * ```typescript
+    * voice.updateConfig({
+    *   voice: 'echo',
+    *   turn_detection: {
+    *     type: 'server_vad',
+    *     threshold: 0.5,
+    *     silence_duration_ms: 1000
+    *   }
+    * });
+    * ```
+    */
+   updateConfig(sessionConfig) {
+     this.client.updateSession(sessionConfig);
+   }
+   /**
+    * Processes audio input for speech recognition.
+    * Takes a readable stream of raw int16 PCM audio data and asks the model to
+    * transcribe it; the transcribed text is delivered through the 'writing' event.
+    * Emits an 'error' event if the audio data format is not supported.
+    *
+    * @param audioData - Readable stream containing the audio data to process
+    *
+    * @example
+    * ```typescript
+    * // Process audio from a file
+    * const audioStream = fs.createReadStream('audio.raw');
+    * await voice.listen(audioStream);
+    * ```
+    */
+   async listen(audioData) {
+     if (isReadableStream(audioData)) {
+       const chunks = [];
+       for await (const chunk of audioData) {
+         const buffer2 = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk);
+         chunks.push(buffer2);
+       }
+       const buffer = Buffer.concat(chunks);
+       const int16Array = new Int16Array(buffer.buffer, buffer.byteOffset ?? 0, (buffer.byteLength ?? 0) / 2);
+       const base64Audio = this.int16ArrayToBase64(int16Array);
+       this.client.realtime.send("conversation.item.create", {
+         item: {
+           type: "message",
+           role: "user",
+           content: [{ type: "input_audio", audio: base64Audio }]
+         }
+       });
+       this.client.realtime.send("response.create", {
+         response: {
+           modalities: ["text"],
+           instructions: `ONLY repeat the input and DO NOT say anything else`
+         }
+       });
+     } else {
+       this.emit("error", new Error("Unsupported audio data format"));
+     }
+   }
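Editor's note: an illustrative way to feed listen(), not from the package itself. The method expects a Node Readable of raw little-endian int16 PCM bytes, so an Int16Array already in memory can be wrapped like this (`samples` is a placeholder):

const { Readable } = require('stream');

// Placeholder PCM16 samples; in practice these come from a recorder or file.
const samples = new Int16Array([0, 256, -256, 512]);

// Wrap the backing bytes in a Readable; isReadableStream() above accepts it,
// and listen() rebuilds the Int16Array view from the concatenated buffer.
const pcmStream = Readable.from([Buffer.from(samples.buffer, samples.byteOffset, samples.byteLength)]);
await voice.listen(pcmStream);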
+   /**
+    * Establishes a connection to the OpenAI realtime service.
+    * Must be called before using the speak, listen, or send methods.
+    *
+    * @throws {Error} If the connection fails or session creation times out
+    *
+    * @example
+    * ```typescript
+    * await voice.connect();
+    * // Now ready for voice interactions
+    * ```
+    */
+   async connect() {
+     await this.client.connect();
+     await this.client.waitForSessionCreated();
+     this.state = "open";
+   }
+   /**
+    * Streams audio data in real-time to the OpenAI service.
+    * Useful for continuous audio streaming scenarios like live microphone input.
+    * The instance must be connected (call connect() first) before using this method.
+    * Emits an 'error' event if the audio format is not supported.
+    *
+    * @param audioData - Readable stream or Int16Array of audio data to send
+    *
+    * @example
+    * ```typescript
+    * // First connect
+    * await voice.connect();
+    *
+    * // Then stream audio
+    * const micStream = getMicrophoneStream();
+    * await voice.send(micStream);
+    * ```
+    */
+   async send(audioData) {
+     if (!this.state || this.state !== "open") {
+       console.warn("Cannot send audio when not connected. Call connect() first.");
+       return;
+     }
+     if (isReadableStream(audioData)) {
+       const stream = audioData;
+       stream.on("data", (chunk) => {
+         try {
+           const buffer = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk);
+           const int16Array = new Int16Array(buffer.buffer, buffer.byteOffset, buffer.byteLength / 2);
+           this.client.appendInputAudio(int16Array);
+         } catch (err) {
+           this.emit("error", err);
+         }
+       });
+     } else if (audioData instanceof Int16Array) {
+       try {
+         this.client.appendInputAudio(audioData);
+       } catch (err) {
+         this.emit("error", err);
+       }
+     } else {
+       this.emit("error", new Error("Unsupported audio data format"));
+     }
+   }
+   /**
+    * Triggers a model response in the realtime session.
+    *
+    * @param {Object} params - The parameters object
+    * @param {Realtime.ResponseConfig} params.options - Configuration options for the response
+    * @returns {Promise<void>} A promise that resolves when the response has been sent
+    *
+    * @example
+    * // Send a simple text response
+    * await realtimeVoice.answer({
+    *   options: {
+    *     content: "Hello, how can I help you today?",
+    *     voice: "alloy"
+    *   }
+    * });
+    */
+   async answer({ options }) {
+     this.client.realtime.send("response.create", { response: options ?? {} });
+   }
+   /**
+    * Registers an event listener for voice events.
+    * Available events: 'speaking', 'writing', 'error'.
+    * OpenAI Realtime events can be listened to by prefixing them with 'openAIRealtime:',
+    * such as 'openAIRealtime:conversation.item.completed', 'openAIRealtime:conversation.updated', etc.
+    *
+    * @param event - Name of the event to listen for
+    * @param callback - Function to call when the event occurs
+    *
+    * @example
+    * ```typescript
+    * // Listen for speech events
+    * voice.on('speaking', ({ audio }: { audio: Int16Array }) => {
+    *   // Handle audio data
+    * });
+    *
+    * // Handle errors
+    * voice.on('error', (error: Error) => {
+    *   console.error('Voice error:', error);
+    * });
+    * ```
+    */
+   on(event, callback) {
+     if (!this.events[event]) {
+       this.events[event] = [];
+     }
+     this.events[event].push(callback);
+   }
+   /**
+    * Removes a previously registered event listener.
+    *
+    * @param event - Name of the event to stop listening to
+    * @param callback - The specific callback function to remove
+    *
+    * @example
+    * ```typescript
+    * // Create event handler
+    * const handleSpeech = ({ audio }: { audio: Int16Array }) => {
+    *   // Handle audio data
+    * };
+    *
+    * // Add listener
+    * voice.on('speaking', handleSpeech);
+    *
+    * // Later, remove the listener
+    * voice.off('speaking', handleSpeech);
+    * ```
+    */
+   off(event, callback) {
+     if (!this.events[event]) return;
+     const index = this.events[event].indexOf(callback);
+     if (index !== -1) {
+       this.events[event].splice(index, 1);
+     }
+   }
+   /**
+    * Emit an event with arguments
+    * @param event Event name
+    * @param args Arguments to pass to the callbacks
+    */
+   emit(event, ...args) {
+     if (!this.events[event]) return;
+     for (const callback of this.events[event]) {
+       callback(...args);
+     }
+   }
+   setupEventListeners() {
+     this.client.on("error", (error) => {
+       this.emit("error", error);
+     });
+     this.client.on("conversation.created", (conversation) => {
+       this.emit("openAIRealtime:conversation.created", conversation);
+     });
+     this.client.on("conversation.interrupted", () => {
+       this.emit("openAIRealtime:conversation.interrupted");
+     });
+     this.client.on("conversation.updated", ({ delta }) => {
+       if (delta?.audio) {
+         this.emit("speaking", { audio: delta.audio });
+       }
+     });
+     this.client.on("conversation.item.appended", (item) => {
+       this.emit("openAIRealtime:conversation.item.appended", item);
+     });
+     this.client.on("conversation.item.completed", ({ item, delta }) => {
+       if (item.formatted.transcript) {
+         this.emit("writing", { text: item.formatted.transcript, role: item.role });
+       }
+       this.emit("openAIRealtime:conversation.item.completed", { item, delta });
+     });
+   }
+   int16ArrayToBase64(int16Array) {
+     const buffer = new ArrayBuffer(int16Array.length * 2);
+     const view = new DataView(buffer);
+     for (let i = 0; i < int16Array.length; i++) {
+       view.setInt16(i * 2, int16Array[i], true);
+     }
+     const uint8Array = new Uint8Array(buffer);
+     let binary = "";
+     for (let i = 0; i < uint8Array.length; i++) {
+       binary += String.fromCharCode(uint8Array[i]);
+     }
+     return btoa(binary);
+   }
+ };
+
+ exports.OpenAIRealtimeVoice = OpenAIRealtimeVoice;
@@ -0,0 +1 @@
+ export { OpenAIRealtimeVoice } from './_tsup-dts-rollup.cjs';
@@ -0,0 +1 @@
+ export { OpenAIRealtimeVoice } from './_tsup-dts-rollup.js';
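Editor's note: for orientation, a hedged end-to-end sketch of using this build, based only on the API surface in the diff above. `getMicrophoneStream` is a placeholder, and a valid OPENAI_API_KEY is assumed to be set in the environment:

const { OpenAIRealtimeVoice } = require('@mastra/voice-openai-realtime');

async function main() {
  const voice = new OpenAIRealtimeVoice({ speaker: 'alloy' });

  // 'speaking' delivers { audio } deltas; 'writing' delivers { text, role } transcripts.
  voice.on('speaking', ({ audio }) => { /* queue Int16Array audio for playback */ });
  voice.on('writing', ({ text, role }) => console.log(`${role}: ${text}`));
  voice.on('error', (err) => console.error('Voice error:', err));

  await voice.connect();            // opens the realtime session
  await voice.speak('Hello world'); // asks the model to say the text

  // Placeholder helper: any Readable of raw int16 PCM works here.
  // await voice.send(getMicrophoneStream());

  voice.close();
}

main().catch(console.error);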