@mastra/voice-openai-realtime 0.0.1-alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,23 @@
+
+ > @mastra/voice-openai-realtime@0.0.1-alpha.1 build /home/runner/work/mastra/mastra/voice/openai-realtime-api
+ > tsup src/index.ts --format esm,cjs --experimental-dts --clean --treeshake
+
+ CLI Building entry: src/index.ts
+ CLI Using tsconfig: tsconfig.json
+ CLI tsup v8.4.0
+ TSC Build start
+ TSC ⚡️ Build success in 8472ms
+ DTS Build start
+ CLI Target: es2022
+ Analysis will use the bundled TypeScript version 5.7.3
+ Writing package typings: /home/runner/work/mastra/mastra/voice/openai-realtime-api/dist/_tsup-dts-rollup.d.ts
+ Analysis will use the bundled TypeScript version 5.7.3
+ Writing package typings: /home/runner/work/mastra/mastra/voice/openai-realtime-api/dist/_tsup-dts-rollup.d.cts
+ DTS ⚡️ Build success in 9610ms
+ CLI Cleaning output folder
+ ESM Build start
+ CJS Build start
+ CJS dist/index.cjs 14.99 KB
+ CJS ⚡️ Build success in 698ms
+ ESM dist/index.js 14.90 KB
+ ESM ⚡️ Build success in 699ms
package/LICENSE ADDED
@@ -0,0 +1,44 @@
+ Elastic License 2.0 (ELv2)
+
+ **Acceptance**
+ By using the software, you agree to all of the terms and conditions below.
+
+ **Copyright License**
+ The licensor grants you a non-exclusive, royalty-free, worldwide, non-sublicensable, non-transferable license to use, copy, distribute, make available, and prepare derivative works of the software, in each case subject to the limitations and conditions below
+
+ **Limitations**
+ You may not provide the software to third parties as a hosted or managed service, where the service provides users with access to any substantial set of the features or functionality of the software.
+
+ You may not move, change, disable, or circumvent the license key functionality in the software, and you may not remove or obscure any functionality in the software that is protected by the license key.
+
+ You may not alter, remove, or obscure any licensing, copyright, or other notices of the licensor in the software. Any use of the licensor’s trademarks is subject to applicable law.
+
+ **Patents**
+ The licensor grants you a license, under any patent claims the licensor can license, or becomes able to license, to make, have made, use, sell, offer for sale, import and have imported the software, in each case subject to the limitations and conditions in this license. This license does not cover any patent claims that you cause to be infringed by modifications or additions to the software. If you or your company make any written claim that the software infringes or contributes to infringement of any patent, your patent license for the software granted under these terms ends immediately. If your company makes such a claim, your patent license ends immediately for work on behalf of your company.
+
+ **Notices**
+ You must ensure that anyone who gets a copy of any part of the software from you also gets a copy of these terms.
+
+ If you modify the software, you must include in any modified copies of the software prominent notices stating that you have modified the software.
+
+ **No Other Rights**
+ These terms do not imply any licenses other than those expressly granted in these terms.
+
+ **Termination**
+ If you use the software in violation of these terms, such use is not licensed, and your licenses will automatically terminate. If the licensor provides you with a notice of your violation, and you cease all violation of this license no later than 30 days after you receive that notice, your licenses will be reinstated retroactively. However, if you violate these terms after such reinstatement, any additional violation of these terms will cause your licenses to terminate automatically and permanently.
+
+ **No Liability**
+ As far as the law allows, the software comes as is, without any warranty or condition, and the licensor will not be liable to you for any damages arising out of these terms or the use or nature of the software, under any kind of legal claim.
+
+ **Definitions**
+ The _licensor_ is the entity offering these terms, and the _software_ is the software the licensor makes available under these terms, including any portion of it.
+
+ _you_ refers to the individual or entity agreeing to these terms.
+
+ _your company_ is any legal entity, sole proprietorship, or other kind of organization that you work for, plus all organizations that have control over, are under the control of, or are under common control with that organization. _control_ means ownership of substantially all the assets of an entity, or the power to direct its management and policies by vote, contract, or otherwise. Control can be direct or indirect.
+
+ _your licenses_ are all the licenses granted to you for the software under these terms.
+
+ _use_ means anything you do with the software requiring one of your licenses.
+
+ _trademark_ means trademarks, service marks, and similar rights.
package/README.md ADDED
@@ -0,0 +1,153 @@
+ # @mastra/voice-openai-realtime
+
+ OpenAI Realtime Voice integration for Mastra, providing real-time voice interaction capabilities using OpenAI's WebSocket-based API. This integration enables real-time speech-to-speech conversations.
+
+ ## Installation
+
+ ```bash
+ npm install @mastra/voice-openai-realtime
+ ```
+
+ ## Configuration
+
+ The module requires an OpenAI API key, which can be provided through the `OPENAI_API_KEY` environment variable or directly in the configuration (as shown below):
+
+ ```bash
+ OPENAI_API_KEY=your_api_key
+ ```
+
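+ If you prefer not to rely on the environment variable, the same key can be passed directly via `chatModel.apiKey` (shown in full in the Usage section below); a minimal sketch:
+
+ ```typescript
+ import { OpenAIRealtimeVoice } from '@mastra/voice-openai-realtime';
+
+ // Pass the API key explicitly instead of using the OPENAI_API_KEY env var
+ const voice = new OpenAIRealtimeVoice({
+   chatModel: {
+     apiKey: 'your-api-key',
+   },
+ });
+ ```
+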
+ ## Usage
+
+ ```typescript
+ import { OpenAIRealtimeVoice } from '@mastra/voice-openai-realtime';
+
+ // Create a voice instance with the default configuration:
+ // const voice = new OpenAIRealtimeVoice();
+
+ // Or create a voice instance with a custom configuration:
+ const voice = new OpenAIRealtimeVoice({
+   chatModel: {
+     apiKey: 'your-api-key', // Optional, can use OPENAI_API_KEY env var
+     model: 'gpt-4o-mini-realtime', // Optional, uses latest model by default
+     options: {
+       sessionConfig: {
+         voice: 'alloy', // Default voice
+         turn_detection: {
+           type: 'server_vad',
+           threshold: 0.5,
+           silence_duration_ms: 1000,
+         },
+       },
+     },
+   },
+ });
+
+ // Connect to the realtime service
+ await voice.connect();
+
+ // Audio data from voice provider
+ voice.on('speaking', (audioData: Int16Array) => {
+   // Handle audio data
+ });
+
+ // Text data from voice provider
+ voice.on('writing', (text: string) => {
+   // Handle transcribed text
+ });
+
+ // Error from voice provider
+ voice.on('error', (error: Error) => {
+   console.error('Voice error:', error);
+ });
+
+ // Generate speech
+ await voice.speak('Hello from Mastra!', {
+   speaker: 'echo', // Optional: override default speaker
+ });
+
+ // Transcribe audio input (a readable stream of audio data)
+ await voice.listen(audioData);
+
+ // Stream continuous audio input, e.g. from a microphone
+ const microphoneStream = getMicrophoneStream();
+ await voice.send(microphoneStream);
+
+ // Clean up
+ voice.close();
+ ```
+
+ ## Features
+
+ - Real-time voice interactions via WebSocket
+ - Speech-to-speech conversations
+ - Voice activity detection (VAD)
+ - Multiple voice options
+ - Event-based audio streaming
+ - Tool integration support
+
+ ## Voice Options
+
+ Available voices include (you can also enumerate them at runtime, as shown after this list):
+
+ - alloy (Neutral)
+ - ash (Balanced)
+ - echo (Warm)
+ - shimmer (Clear)
+ - coral (Expressive)
+ - sage (Professional)
+ - ballad (Melodic)
+ - verse (Dynamic)
+
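+ To enumerate the available speakers at runtime, a minimal sketch using the `getSpeakers()` method declared in this package's typings:
+
+ ```typescript
+ // Assumes `voice` is an OpenAIRealtimeVoice instance created as in the Usage section
+ const speakers = await voice.getSpeakers();
+ // e.g. [{ voiceId: 'alloy' }, { voiceId: 'echo' }, ...]
+ console.log(speakers.map(s => s.voiceId));
+ ```
+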
+ ## Events
+
+ The voice instance emits several events:
+
+ - `speaking`: Emitted while generating speech, provides Int16Array audio data
+ - `writing`: Emitted when speech is transcribed to text
+ - `error`: Emitted when an error occurs
+
+ You can also listen to OpenAI Realtime [SDK utility events](https://github.com/openai/openai-realtime-api-beta/tree/main?tab=readme-ov-file#reference-client-utility-events) by prefixing the event name with `openAIRealtime:` (see the sketch after this list), such as:
+
+ - `openAIRealtime:conversation.item.completed`
+ - `openAIRealtime:conversation.updated`
+
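+ For example, a minimal sketch of subscribing to one of these forwarded SDK events (the payload shape is defined by the OpenAI Realtime SDK, so it is typed loosely here):
+
+ ```typescript
+ voice.on('openAIRealtime:conversation.updated', (event: any) => {
+   // Inspect the conversation state forwarded from the OpenAI Realtime SDK
+   console.log('Conversation updated:', event);
+ });
+ ```
+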
+ ## Voice Activity Detection
+
+ The realtime voice integration includes server-side voice activity detection (VAD) with configurable parameters:
+
+ ```typescript
+ voice.updateConfig({
+   voice: 'echo',
+   turn_detection: {
+     type: 'server_vad',
+     threshold: 0.5, // Speech detection sensitivity
+     silence_duration_ms: 1000, // Wait time before ending turn
+     prefix_padding_ms: 1000, // Audio padding before speech
+   },
+ });
+ ```
+
+ ## Tool Integration
+
+ You can add tools to the voice instance to extend its capabilities:
+
+ ```typescript
+ import { createTool } from '@mastra/core/tools';
+ import { z } from 'zod';
+
+ export const menuTool = createTool({
+   id: 'menuTool',
+   description: 'Get menu items',
+   inputSchema: z
+     .object({
+       query: z.string(),
+     })
+     .required(),
+   execute: async ({ context }) => {
+     // Implement menu search functionality
+   },
+ });
+
+ voice.addTools({ menuTool });
+ ```
+
+ ## API Reference
+
+ For detailed API documentation, refer to the JSDoc comments in the source code or generate documentation using TypeDoc.
@@ -0,0 +1,325 @@
+ import { MastraVoice } from '@mastra/core/voice';
+ import type { Realtime } from 'openai-realtime-api';
+ import type { ToolsInput } from '@mastra/core/agent';
+
+ /**
+  * Event callback function type
+  */
+ declare type EventCallback = (...args: any[]) => void;
+
+ /**
+  * Runtime check for whether a value is a readable stream.
+  */
+ export declare const isReadableStream: (obj: unknown) => unknown;
+
+ export declare type OpenAIExecuteFunction = (args: any) => Promise<any>;
+
+ /**
+  * OpenAIRealtimeVoice provides real-time voice interaction capabilities using OpenAI's
+  * WebSocket-based API. It supports:
+  * - Real-time text-to-speech
+  * - Speech-to-text (transcription)
+  * - Voice activity detection
+  * - Multiple voice options
+  * - Event-based audio streaming
+  *
+  * The class manages WebSocket connections, audio streaming, and event handling
+  * for seamless voice interactions.
+  *
+  * @extends MastraVoice
+  *
+  * @example
+  * ```typescript
+  * const voice = new OpenAIRealtimeVoice({
+  *   chatModel: {
+  *     apiKey: process.env.OPENAI_API_KEY,
+  *     model: 'gpt-4o-mini-realtime'
+  *   }
+  * });
+  *
+  * await voice.connect();
+  * voice.on('speaking', (audioData) => {
+  *   // Handle audio data
+  * });
+  *
+  * await voice.speak('Hello, how can I help you today?');
+  * ```
+  */
+ export declare class OpenAIRealtimeVoice extends MastraVoice {
+     private client;
+     private state;
+     private events;
+     tools?: TTools;
+     /**
+      * Creates a new instance of OpenAIRealtimeVoice.
+      *
+      * @param options - Configuration options for the voice instance
+      * @param options.chatModel - Configuration for the chat model
+      * @param options.chatModel.model - The model ID to use (defaults to GPT-4o Mini Realtime)
+      * @param options.chatModel.apiKey - OpenAI API key. Falls back to process.env.OPENAI_API_KEY
+      * @param options.chatModel.tools - Tools configuration for the model
+      * @param options.chatModel.options - Additional options for the realtime client
+      * @param options.chatModel.options.sessionConfig - Session configuration overrides
+      * @param options.chatModel.options.url - Custom WebSocket URL
+      * @param options.chatModel.options.dangerouslyAllowAPIKeyInBrowser - Whether to allow the API key in the browser
+      * @param options.chatModel.options.debug - Enable debug logging
+      * @param options.chatModel.options.tools - Additional tools configuration
+      * @param options.speaker - Voice ID to use (defaults to 'alloy')
+      *
+      * @example
+      * ```typescript
+      * const voice = new OpenAIRealtimeVoice({
+      *   chatModel: {
+      *     apiKey: 'your-api-key',
+      *     model: 'gpt-4o-mini-realtime',
+      *   },
+      *   speaker: 'alloy'
+      * });
+      * ```
+      */
+     constructor({ chatModel, speaker, }?: {
+         chatModel?: {
+             model?: string;
+             apiKey?: string;
+             tools?: TTools;
+             options?: {
+                 sessionConfig?: Realtime.SessionConfig;
+                 url?: string;
+                 dangerouslyAllowAPIKeyInBrowser?: boolean;
+                 debug?: boolean;
+                 tools?: TTools;
+             };
+         };
+         speaker?: Realtime.Voice;
+     });
+     /**
+      * Returns a list of available voice speakers.
+      *
+      * @returns Promise resolving to an array of voice objects, each containing at least a voiceId
+      *
+      * @example
+      * ```typescript
+      * const speakers = await voice.getSpeakers();
+      * // speakers = [{ voiceId: 'alloy' }, { voiceId: 'echo' }, ...]
+      * ```
+      */
+     getSpeakers(): Promise<Array<{
+         voiceId: string;
+         [key: string]: any;
+     }>>;
+     /**
+      * Disconnects from the OpenAI realtime session and cleans up resources.
+      * Should be called when you're done with the voice instance.
+      *
+      * @example
+      * ```typescript
+      * voice.close(); // Disconnects and cleans up
+      * ```
+      */
+     close(): void;
+     /**
+      * Equips the voice instance with a set of tools.
+      * Tools allow the model to perform additional actions during conversations.
+      *
+      * @param tools - Optional tools configuration to add
+      *
+      * @example
+      * ```typescript
+      * const tools = {
+      *   search: async (query: string) => { ... },
+      *   calculate: (expression: string) => { ... }
+      * };
+      * voice.addTools(tools);
+      * ```
+      */
+     addTools(tools?: TTools): void;
+     /**
+      * Emits a speaking event using the configured voice model.
+      * Can accept either a string or a readable stream as input.
+      *
+      * @param input - The text to convert to speech, or a readable stream containing the text
+      * @param options - Optional configuration for this specific speech request
+      * @param options.speaker - Override the voice to use for this specific request
+      *
+      * @throws {Error} If the input text is empty
+      *
+      * @example
+      * ```typescript
+      * // Simple text to speech
+      * await voice.speak('Hello world');
+      *
+      * // With custom voice
+      * await voice.speak('Hello world', { speaker: 'echo' });
+      *
+      * // Using a stream
+      * const stream = fs.createReadStream('text.txt');
+      * await voice.speak(stream);
+      * ```
+      */
+     speak(input: string | NodeJS.ReadableStream, options?: {
+         speaker?: Realtime.Voice;
+     }): Promise<void>;
+     /**
+      * Updates the session configuration for the voice instance.
+      * This can be used to modify voice settings, turn detection, and other parameters.
+      *
+      * @param sessionConfig - New session configuration to apply
+      *
+      * @example
+      * ```typescript
+      * voice.updateConfig({
+      *   voice: 'echo',
+      *   turn_detection: {
+      *     type: 'server_vad',
+      *     threshold: 0.5,
+      *     silence_duration_ms: 1000
+      *   }
+      * });
+      * ```
+      */
+     updateConfig(sessionConfig: Realtime.SessionConfig): void;
+     /**
+      * Processes audio input for speech recognition.
+      * Takes a readable stream of audio data and emits a writing event with the transcribed text.
+      *
+      * @param audioData - Readable stream containing the audio data to process
+      *
+      * @throws {Error} If the audio data format is not supported
+      *
+      * @example
+      * ```typescript
+      * // Process audio from a file
+      * const audioStream = fs.createReadStream('audio.raw');
+      * await voice.listen(audioStream);
+      * ```
+      */
+     listen(audioData: NodeJS.ReadableStream): Promise<void>;
+     /**
+      * Establishes a connection to the OpenAI realtime service.
+      * Must be called before using speak, listen, or send.
+      *
+      * @throws {Error} If connection fails or session creation times out
+      *
+      * @example
+      * ```typescript
+      * await voice.connect();
+      * // Now ready for voice interactions
+      * ```
+      */
+     connect(): Promise<void>;
+     /**
+      * Streams audio data in real-time to the OpenAI service.
+      * Useful for continuous audio streaming scenarios like live microphone input.
+      * Must be in 'open' state before calling this method.
+      *
+      * @param audioData - Readable stream of audio data to send
+      * @throws {Error} If audio format is not supported
+      *
+      * @example
+      * ```typescript
+      * // First connect
+      * await voice.connect();
+      *
+      * // Then stream audio
+      * const micStream = getMicrophoneStream();
+      * await voice.send(micStream);
+      * ```
+      */
+     send(audioData: NodeJS.ReadableStream | Int16Array): Promise<void>;
+     /**
+      * Sends a response request to the OpenAI Realtime API, triggering a response in the realtime session.
+      *
+      * @param {Object} params - The parameters object
+      * @param {Realtime.ResponseConfig} params.options - Configuration options for the response
+      * @returns {Promise<void>} A promise that resolves when the response has been sent
+      *
+      * @example
+      * // Send a simple text response
+      * await realtimeVoice.answer({
+      *   options: {
+      *     content: "Hello, how can I help you today?",
+      *     voice: "alloy"
+      *   }
+      * });
+      */
+     answer({ options }: {
+         options?: Realtime.ResponseConfig;
+     }): Promise<void>;
+     /**
+      * Registers an event listener for voice events.
+      * Available events: 'speaking', 'writing', 'error'.
+      * You can also listen to OpenAI Realtime events by prefixing with 'openAIRealtime:',
+      * such as 'openAIRealtime:conversation.item.completed', 'openAIRealtime:conversation.updated', etc.
+      *
+      * @param event - Name of the event to listen for
+      * @param callback - Function to call when the event occurs
+      *
+      * @example
+      * ```typescript
+      * // Listen for speech events
+      * voice.on('speaking', (audioData: Int16Array) => {
+      *   // Handle audio data
+      * });
+      *
+      * // Handle errors
+      * voice.on('error', (error: Error) => {
+      *   console.error('Voice error:', error);
+      * });
+      * ```
+      */
+     on(event: string, callback: EventCallback): void;
+     /**
+      * Removes a previously registered event listener.
+      *
+      * @param event - Name of the event to stop listening to
+      * @param callback - The specific callback function to remove
+      *
+      * @example
+      * ```typescript
+      * // Create event handler
+      * const handleSpeech = (audioData: Int16Array) => {
+      *   // Handle audio data
+      * };
+      *
+      * // Add listener
+      * voice.on('speaking', handleSpeech);
+      *
+      * // Later, remove the listener
+      * voice.off('speaking', handleSpeech);
+      * ```
+      */
+     off(event: string, callback: EventCallback): void;
+     /**
+      * Emit an event with arguments
+      * @param event Event name
+      * @param args Arguments to pass to the callbacks
+      */
+     private emit;
+     private setupEventListeners;
+     private int16ArrayToBase64;
+ }
+
+ /**
+  * Converts Mastra tool definitions into the OpenAI realtime tool format,
+  * pairing each tool descriptor with its execute function.
+  */
+ export declare const transformTools: (tools?: TTools_2) => {
+     openaiTool: {
+         name: string;
+         description: string;
+         parameters: {
+             [key: string]: any;
+         };
+     };
+     execute: (args: any) => Promise<any>;
+ }[];
+
+ declare type TTools = ToolsInput;
+
+ declare type TTools_2 = ToolsInput;
+
+ export { }