@mastra/voice-openai-realtime 0.0.1-alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,23 @@
+
+ > @mastra/voice-openai-realtime@0.0.1-alpha.1 build /home/runner/work/mastra/mastra/voice/openai-realtime-api
+ > tsup src/index.ts --format esm,cjs --experimental-dts --clean --treeshake
+
+ CLI Building entry: src/index.ts
+ CLI Using tsconfig: tsconfig.json
+ CLI tsup v8.4.0
+ TSC Build start
+ TSC ⚡️ Build success in 8472ms
+ DTS Build start
+ CLI Target: es2022
+ Analysis will use the bundled TypeScript version 5.7.3
+ Writing package typings: /home/runner/work/mastra/mastra/voice/openai-realtime-api/dist/_tsup-dts-rollup.d.ts
+ Analysis will use the bundled TypeScript version 5.7.3
+ Writing package typings: /home/runner/work/mastra/mastra/voice/openai-realtime-api/dist/_tsup-dts-rollup.d.cts
+ DTS ⚡️ Build success in 9610ms
+ CLI Cleaning output folder
+ ESM Build start
+ CJS Build start
+ CJS dist/index.cjs 14.99 KB
+ CJS ⚡️ Build success in 698ms
+ ESM dist/index.js 14.90 KB
+ ESM ⚡️ Build success in 699ms
package/LICENSE ADDED
@@ -0,0 +1,44 @@
+ Elastic License 2.0 (ELv2)
+
+ **Acceptance**
+ By using the software, you agree to all of the terms and conditions below.
+
+ **Copyright License**
+ The licensor grants you a non-exclusive, royalty-free, worldwide, non-sublicensable, non-transferable license to use, copy, distribute, make available, and prepare derivative works of the software, in each case subject to the limitations and conditions below
+
+ **Limitations**
+ You may not provide the software to third parties as a hosted or managed service, where the service provides users with access to any substantial set of the features or functionality of the software.
+
+ You may not move, change, disable, or circumvent the license key functionality in the software, and you may not remove or obscure any functionality in the software that is protected by the license key.
+
+ You may not alter, remove, or obscure any licensing, copyright, or other notices of the licensor in the software. Any use of the licensor’s trademarks is subject to applicable law.
+
+ **Patents**
+ The licensor grants you a license, under any patent claims the licensor can license, or becomes able to license, to make, have made, use, sell, offer for sale, import and have imported the software, in each case subject to the limitations and conditions in this license. This license does not cover any patent claims that you cause to be infringed by modifications or additions to the software. If you or your company make any written claim that the software infringes or contributes to infringement of any patent, your patent license for the software granted under these terms ends immediately. If your company makes such a claim, your patent license ends immediately for work on behalf of your company.
+
+ **Notices**
+ You must ensure that anyone who gets a copy of any part of the software from you also gets a copy of these terms.
+
+ If you modify the software, you must include in any modified copies of the software prominent notices stating that you have modified the software.
+
+ **No Other Rights**
+ These terms do not imply any licenses other than those expressly granted in these terms.
+
+ **Termination**
+ If you use the software in violation of these terms, such use is not licensed, and your licenses will automatically terminate. If the licensor provides you with a notice of your violation, and you cease all violation of this license no later than 30 days after you receive that notice, your licenses will be reinstated retroactively. However, if you violate these terms after such reinstatement, any additional violation of these terms will cause your licenses to terminate automatically and permanently.
+
+ **No Liability**
+ As far as the law allows, the software comes as is, without any warranty or condition, and the licensor will not be liable to you for any damages arising out of these terms or the use or nature of the software, under any kind of legal claim.
+
+ **Definitions**
+ The _licensor_ is the entity offering these terms, and the _software_ is the software the licensor makes available under these terms, including any portion of it.
+
+ _you_ refers to the individual or entity agreeing to these terms.
+
+ _your company_ is any legal entity, sole proprietorship, or other kind of organization that you work for, plus all organizations that have control over, are under the control of, or are under common control with that organization. _control_ means ownership of substantially all the assets of an entity, or the power to direct its management and policies by vote, contract, or otherwise. Control can be direct or indirect.
+
+ _your licenses_ are all the licenses granted to you for the software under these terms.
+
+ _use_ means anything you do with the software requiring one of your licenses.
+
+ _trademark_ means trademarks, service marks, and similar rights.
package/README.md ADDED
@@ -0,0 +1,153 @@
+ # @mastra/voice-openai-realtime
+
+ OpenAI Realtime Voice integration for Mastra, providing real-time voice interaction capabilities using OpenAI's WebSocket-based API. This integration enables real-time speech-to-speech conversations.
+
+ ## Installation
+
+ ```bash
+ npm install @mastra/voice-openai-realtime
+ ```
+
+ ## Configuration
+
+ The module requires an OpenAI API key, which can be provided through the `OPENAI_API_KEY` environment variable or directly in the configuration (as shown below):
+
+ ```bash
+ OPENAI_API_KEY=your_api_key
+ ```
+
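+ If you prefer not to rely on the environment variable, the same key can be passed directly via `chatModel.apiKey` (shown in full in the Usage section below); a minimal sketch:
+
+ ```typescript
+ import { OpenAIRealtimeVoice } from '@mastra/voice-openai-realtime';
+
+ // Pass the API key explicitly instead of using the OPENAI_API_KEY env var
+ const voice = new OpenAIRealtimeVoice({
+   chatModel: {
+     apiKey: 'your-api-key',
+   },
+ });
+ ```
+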
+ ## Usage
+
+ ```typescript
+ import { OpenAIRealtimeVoice } from '@mastra/voice-openai-realtime';
+
+ // Create a voice instance with the default configuration:
+ // const voice = new OpenAIRealtimeVoice();
+
+ // Or create a voice instance with a custom configuration:
+ const voice = new OpenAIRealtimeVoice({
+   chatModel: {
+     apiKey: 'your-api-key', // Optional, can use OPENAI_API_KEY env var
+     model: 'gpt-4o-mini-realtime', // Optional, uses latest model by default
+     options: {
+       sessionConfig: {
+         voice: 'alloy', // Default voice
+         turn_detection: {
+           type: 'server_vad',
+           threshold: 0.5,
+           silence_duration_ms: 1000,
+         },
+       },
+     },
+   },
+ });
+
+ // Connect to the realtime service
+ await voice.connect();
+
+ // Audio data from voice provider
+ voice.on('speaking', (audioData: Int16Array) => {
+   // Handle audio data
+ });
+
+ // Text data from voice provider
+ voice.on('writing', (text: string) => {
+   // Handle transcribed text
+ });
+
+ // Error from voice provider
+ voice.on('error', (error: Error) => {
+   console.error('Voice error:', error);
+ });
+
+ // Generate speech
+ await voice.speak('Hello from Mastra!', {
+   speaker: 'echo', // Optional: override default speaker
+ });
+
+ // Transcribe audio input (a readable stream of audio data)
+ await voice.listen(audioData);
+
+ // Stream continuous audio input, e.g. from a microphone
+ const microphoneStream = getMicrophoneStream();
+ await voice.send(microphoneStream);
+
+ // Clean up
+ voice.close();
+ ```
+
+ ## Features
+
+ - Real-time voice interactions via WebSocket
+ - Speech-to-speech conversations
+ - Voice activity detection (VAD)
+ - Multiple voice options
+ - Event-based audio streaming
+ - Tool integration support
+
+ ## Voice Options
+
+ Available voices include (you can also enumerate them at runtime, as shown after this list):
+
+ - alloy (Neutral)
+ - ash (Balanced)
+ - echo (Warm)
+ - shimmer (Clear)
+ - coral (Expressive)
+ - sage (Professional)
+ - ballad (Melodic)
+ - verse (Dynamic)
+
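+ To enumerate the available speakers at runtime, a minimal sketch using the `getSpeakers()` method declared in this package's typings:
+
+ ```typescript
+ // Assumes `voice` is an OpenAIRealtimeVoice instance created as in the Usage section
+ const speakers = await voice.getSpeakers();
+ // e.g. [{ voiceId: 'alloy' }, { voiceId: 'echo' }, ...]
+ console.log(speakers.map(s => s.voiceId));
+ ```
+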
+ ## Events
+
+ The voice instance emits several events:
+
+ - `speaking`: Emitted while generating speech, provides Int16Array audio data
+ - `writing`: Emitted when speech is transcribed to text
+ - `error`: Emitted when an error occurs
+
+ You can also listen to OpenAI Realtime [SDK utility events](https://github.com/openai/openai-realtime-api-beta/tree/main?tab=readme-ov-file#reference-client-utility-events) by prefixing the event name with `openAIRealtime:` (see the sketch after this list), such as:
+
+ - `openAIRealtime:conversation.item.completed`
+ - `openAIRealtime:conversation.updated`
+
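+ For example, a minimal sketch of subscribing to one of these forwarded SDK events (the payload shape is defined by the OpenAI Realtime SDK, so it is typed loosely here):
+
+ ```typescript
+ voice.on('openAIRealtime:conversation.updated', (event: any) => {
+   // Inspect the conversation state forwarded from the OpenAI Realtime SDK
+   console.log('Conversation updated:', event);
+ });
+ ```
+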
+ ## Voice Activity Detection
+
+ The realtime voice integration includes server-side voice activity detection (VAD) with configurable parameters:
+
+ ```typescript
+ voice.updateConfig({
+   voice: 'echo',
+   turn_detection: {
+     type: 'server_vad',
+     threshold: 0.5, // Speech detection sensitivity
+     silence_duration_ms: 1000, // Wait time before ending turn
+     prefix_padding_ms: 1000, // Audio padding before speech
+   },
+ });
+ ```
+
+ ## Tool Integration
+
+ You can add tools to the voice instance to extend its capabilities:
+
+ ```typescript
+ import { createTool } from '@mastra/core/tools';
+ import { z } from 'zod';
+
+ export const menuTool = createTool({
+   id: 'menuTool',
+   description: 'Get menu items',
+   inputSchema: z
+     .object({
+       query: z.string(),
+     })
+     .required(),
+   execute: async ({ context }) => {
+     // Implement menu search functionality
+   },
+ });
+
+ voice.addTools({ menuTool });
+ ```
+
+ ## API Reference
+
+ For detailed API documentation, refer to the JSDoc comments in the source code or generate documentation using TypeDoc.
@@ -0,0 +1,325 @@
+ import { MastraVoice } from '@mastra/core/voice';
+ import type { Realtime } from 'openai-realtime-api';
+ import type { ToolsInput } from '@mastra/core/agent';
+
+ /**
+  * Event callback function type
+  */
+ declare type EventCallback = (...args: any[]) => void;
+
+ /**
+  * Runtime check for whether a value is a readable stream.
+  */
+ export declare const isReadableStream: (obj: unknown) => unknown;
+
+ export declare type OpenAIExecuteFunction = (args: any) => Promise<any>;
+
+ /**
+  * OpenAIRealtimeVoice provides real-time voice interaction capabilities using OpenAI's
+  * WebSocket-based API. It supports:
+  * - Real-time text-to-speech
+  * - Speech-to-text (transcription)
+  * - Voice activity detection
+  * - Multiple voice options
+  * - Event-based audio streaming
+  *
+  * The class manages WebSocket connections, audio streaming, and event handling
+  * for seamless voice interactions.
+  *
+  * @extends MastraVoice
+  *
+  * @example
+  * ```typescript
+  * const voice = new OpenAIRealtimeVoice({
+  *   chatModel: {
+  *     apiKey: process.env.OPENAI_API_KEY,
+  *     model: 'gpt-4o-mini-realtime'
+  *   }
+  * });
+  *
+  * await voice.connect();
+  * voice.on('speaking', (audioData) => {
+  *   // Handle audio data
+  * });
+  *
+  * await voice.speak('Hello, how can I help you today?');
+  * ```
+  */
+ export declare class OpenAIRealtimeVoice extends MastraVoice {
+     private client;
+     private state;
+     private events;
+     tools?: TTools;
+     /**
+      * Creates a new instance of OpenAIRealtimeVoice.
+      *
+      * @param options - Configuration options for the voice instance
+      * @param options.chatModel - Configuration for the chat model
+      * @param options.chatModel.model - The model ID to use (defaults to GPT-4o Mini Realtime)
+      * @param options.chatModel.apiKey - OpenAI API key. Falls back to process.env.OPENAI_API_KEY
+      * @param options.chatModel.tools - Tools configuration for the model
+      * @param options.chatModel.options - Additional options for the realtime client
+      * @param options.chatModel.options.sessionConfig - Session configuration overrides
+      * @param options.chatModel.options.url - Custom WebSocket URL
+      * @param options.chatModel.options.dangerouslyAllowAPIKeyInBrowser - Whether to allow the API key in the browser
+      * @param options.chatModel.options.debug - Enable debug logging
+      * @param options.chatModel.options.tools - Additional tools configuration
+      * @param options.speaker - Voice ID to use (defaults to 'alloy')
+      *
+      * @example
+      * ```typescript
+      * const voice = new OpenAIRealtimeVoice({
+      *   chatModel: {
+      *     apiKey: 'your-api-key',
+      *     model: 'gpt-4o-mini-realtime',
+      *   },
+      *   speaker: 'alloy'
+      * });
+      * ```
+      */
+     constructor({ chatModel, speaker, }?: {
+         chatModel?: {
+             model?: string;
+             apiKey?: string;
+             tools?: TTools;
+             options?: {
+                 sessionConfig?: Realtime.SessionConfig;
+                 url?: string;
+                 dangerouslyAllowAPIKeyInBrowser?: boolean;
+                 debug?: boolean;
+                 tools?: TTools;
+             };
+         };
+         speaker?: Realtime.Voice;
+     });
+     /**
+      * Returns a list of available voice speakers.
+      *
+      * @returns Promise resolving to an array of voice objects, each containing at least a voiceId
+      *
+      * @example
+      * ```typescript
+      * const speakers = await voice.getSpeakers();
+      * // speakers = [{ voiceId: 'alloy' }, { voiceId: 'echo' }, ...]
+      * ```
+      */
+     getSpeakers(): Promise<Array<{
+         voiceId: string;
+         [key: string]: any;
+     }>>;
+     /**
+      * Disconnects from the OpenAI realtime session and cleans up resources.
+      * Should be called when you're done with the voice instance.
+      *
+      * @example
+      * ```typescript
+      * voice.close(); // Disconnects and cleans up
+      * ```
+      */
+     close(): void;
+     /**
+      * Equips the voice instance with a set of tools.
+      * Tools allow the model to perform additional actions during conversations.
+      *
+      * @param tools - Optional tools configuration to add
+      *
+      * @example
+      * ```typescript
+      * const tools = {
+      *   search: async (query: string) => { ... },
+      *   calculate: (expression: string) => { ... }
+      * };
+      * voice.addTools(tools);
+      * ```
+      */
+     addTools(tools?: TTools): void;
+     /**
+      * Emits a speaking event using the configured voice model.
+      * Can accept either a string or a readable stream as input.
+      *
+      * @param input - The text to convert to speech, or a readable stream containing the text
+      * @param options - Optional configuration for this specific speech request
+      * @param options.speaker - Override the voice to use for this specific request
+      *
+      * @throws {Error} If the input text is empty
+      *
+      * @example
+      * ```typescript
+      * // Simple text to speech
+      * await voice.speak('Hello world');
+      *
+      * // With custom voice
+      * await voice.speak('Hello world', { speaker: 'echo' });
+      *
+      * // Using a stream
+      * const stream = fs.createReadStream('text.txt');
+      * await voice.speak(stream);
+      * ```
+      */
+     speak(input: string | NodeJS.ReadableStream, options?: {
+         speaker?: Realtime.Voice;
+     }): Promise<void>;
+     /**
+      * Updates the session configuration for the voice instance.
+      * This can be used to modify voice settings, turn detection, and other parameters.
+      *
+      * @param sessionConfig - New session configuration to apply
+      *
+      * @example
+      * ```typescript
+      * voice.updateConfig({
+      *   voice: 'echo',
+      *   turn_detection: {
+      *     type: 'server_vad',
+      *     threshold: 0.5,
+      *     silence_duration_ms: 1000
+      *   }
+      * });
+      * ```
+      */
+     updateConfig(sessionConfig: Realtime.SessionConfig): void;
+     /**
+      * Processes audio input for speech recognition.
+      * Takes a readable stream of audio data and emits a writing event with the transcribed text.
+      *
+      * @param audioData - Readable stream containing the audio data to process
+      *
+      * @throws {Error} If the audio data format is not supported
+      *
+      * @example
+      * ```typescript
+      * // Process audio from a file
+      * const audioStream = fs.createReadStream('audio.raw');
+      * await voice.listen(audioStream);
+      * ```
+      */
+     listen(audioData: NodeJS.ReadableStream): Promise<void>;
+     /**
+      * Establishes a connection to the OpenAI realtime service.
+      * Must be called before using speak, listen, or send.
+      *
+      * @throws {Error} If connection fails or session creation times out
+      *
+      * @example
+      * ```typescript
+      * await voice.connect();
+      * // Now ready for voice interactions
+      * ```
+      */
+     connect(): Promise<void>;
+     /**
+      * Streams audio data in real-time to the OpenAI service.
+      * Useful for continuous audio streaming scenarios like live microphone input.
+      * Must be in 'open' state before calling this method.
+      *
+      * @param audioData - Readable stream of audio data to send
+      * @throws {Error} If audio format is not supported
+      *
+      * @example
+      * ```typescript
+      * // First connect
+      * await voice.connect();
+      *
+      * // Then stream audio
+      * const micStream = getMicrophoneStream();
+      * await voice.send(micStream);
+      * ```
+      */
+     send(audioData: NodeJS.ReadableStream | Int16Array): Promise<void>;
+     /**
+      * Sends a response request to the OpenAI Realtime API, triggering a response in the realtime session.
+      *
+      * @param {Object} params - The parameters object
+      * @param {Realtime.ResponseConfig} params.options - Configuration options for the response
+      * @returns {Promise<void>} A promise that resolves when the response has been sent
+      *
+      * @example
+      * // Send a simple text response
+      * await realtimeVoice.answer({
+      *   options: {
+      *     content: "Hello, how can I help you today?",
+      *     voice: "alloy"
+      *   }
+      * });
+      */
+     answer({ options }: {
+         options?: Realtime.ResponseConfig;
+     }): Promise<void>;
+     /**
+      * Registers an event listener for voice events.
+      * Available events: 'speaking', 'writing', 'error'.
+      * You can also listen to OpenAI Realtime events by prefixing with 'openAIRealtime:',
+      * such as 'openAIRealtime:conversation.item.completed', 'openAIRealtime:conversation.updated', etc.
+      *
+      * @param event - Name of the event to listen for
+      * @param callback - Function to call when the event occurs
+      *
+      * @example
+      * ```typescript
+      * // Listen for speech events
+      * voice.on('speaking', (audioData: Int16Array) => {
+      *   // Handle audio data
+      * });
+      *
+      * // Handle errors
+      * voice.on('error', (error: Error) => {
+      *   console.error('Voice error:', error);
+      * });
+      * ```
+      */
+     on(event: string, callback: EventCallback): void;
+     /**
+      * Removes a previously registered event listener.
+      *
+      * @param event - Name of the event to stop listening to
+      * @param callback - The specific callback function to remove
+      *
+      * @example
+      * ```typescript
+      * // Create event handler
+      * const handleSpeech = (audioData: Int16Array) => {
+      *   // Handle audio data
+      * };
+      *
+      * // Add listener
+      * voice.on('speaking', handleSpeech);
+      *
+      * // Later, remove the listener
+      * voice.off('speaking', handleSpeech);
+      * ```
+      */
+     off(event: string, callback: EventCallback): void;
+     /**
+      * Emit an event with arguments
+      * @param event Event name
+      * @param args Arguments to pass to the callbacks
+      */
+     private emit;
+     private setupEventListeners;
+     private int16ArrayToBase64;
+ }
+
+ /**
+  * Converts Mastra tool definitions into the OpenAI realtime tool format,
+  * pairing each tool descriptor with its execute function.
+  */
+ export declare const transformTools: (tools?: TTools_2) => {
+     openaiTool: {
+         name: string;
+         description: string;
+         parameters: {
+             [key: string]: any;
+         };
+     };
+     execute: (args: any) => Promise<any>;
+ }[];
+
+ declare type TTools = ToolsInput;
+
+ declare type TTools_2 = ToolsInput;
+
+ export { }