@mastra/voice-openai-realtime 0.0.1-alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,325 @@
1
+ import { MastraVoice } from '@mastra/core/voice';
2
+ import type { Realtime } from 'openai-realtime-api';
3
+ import type { ToolsInput } from '@mastra/core/agent';
4
+
5
+ /**
6
+ * Event callback function type
7
+ */
8
+ declare type EventCallback = (...args: any[]) => void;
9
+
10
+ export declare const isReadableStream: (obj: unknown) => unknown;
11
+
12
+ export declare type OpenAIExecuteFunction = (args: any) => Promise<any>;
13
+
14
+ /**
15
+ * OpenAIRealtimeVoice provides real-time voice interaction capabilities using OpenAI's
16
+ * WebSocket-based API. It supports:
17
+ * - Real-time text-to-speech
18
+ * - Speech-to-text (transcription)
19
+ * - Voice activity detection
20
+ * - Multiple voice options
21
+ * - Event-based audio streaming
22
+ *
23
+ * The class manages WebSocket connections, audio streaming, and event handling
24
+ * for seamless voice interactions.
25
+ *
26
+ * @extends MastraVoice
27
+ *
28
+ * @example
29
+ * ```typescript
30
+ * const voice = new OpenAIRealtimeVoice({
31
+ * chatModel: {
32
+ * apiKey: process.env.OPENAI_API_KEY,
33
+ * model: 'gpt-4o-mini-realtime'
34
+ * }
35
+ * });
36
+ *
37
+ * await voice.connect();
38
+ * voice.on('speaking', (audioData) => {
39
+ * // Handle audio data
40
+ * });
41
+ *
42
+ * await voice.speak('Hello, how can I help you today?');
43
+ * ```
44
+ */
45
+ export declare class OpenAIRealtimeVoice extends MastraVoice {
46
+ private client;
47
+ private state;
48
+ private events;
49
+ tools?: TTools;
50
+ /**
51
+ * Creates a new instance of OpenAIRealtimeVoice.
52
+ *
53
+ * @param options - Configuration options for the voice instance
54
+ * @param options.chatModel - Configuration for the chat model
55
+ * @param options.chatModel.model - The model ID to use (defaults to GPT-4o mini Realtime)
56
+ * @param options.chatModel.apiKey - OpenAI API key. Falls back to process.env.OPENAI_API_KEY
57
+ * @param options.chatModel.tools - Tools configuration for the model
58
+ * @param options.chatModel.options - Additional options for the realtime client
59
+ * @param options.chatModel.options.sessionConfig - Session configuration overrides
60
+ * @param options.chatModel.options.url - Custom WebSocket URL
61
+ * @param options.chatModel.options.dangerouslyAllowAPIKeyInBrowser - Whether to allow API key in browser
62
+ * @param options.chatModel.options.debug - Enable debug logging
63
+ * @param options.chatModel.options.tools - Additional tools configuration
64
+ * @param options.speaker - Voice ID to use (defaults to 'alloy')
65
+ *
66
+ * @example
67
+ * ```typescript
68
+ * const voice = new OpenAIRealtimeVoice({
69
+ * chatModel: {
70
+ * apiKey: 'your-api-key',
71
+ * model: 'gpt-4o-mini-realtime',
72
+ * },
73
+ * speaker: 'alloy'
74
+ * });
75
+ * ```
76
+ */
77
+ constructor({ chatModel, speaker, }?: {
78
+ chatModel?: {
79
+ model?: string;
80
+ apiKey?: string;
81
+ tools?: TTools;
82
+ options?: {
83
+ sessionConfig?: Realtime.SessionConfig;
84
+ url?: string;
85
+ dangerouslyAllowAPIKeyInBrowser?: boolean;
86
+ debug?: boolean;
87
+ tools?: TTools;
88
+ };
89
+ };
90
+ speaker?: Realtime.Voice;
91
+ });
92
+ /**
93
+ * Returns a list of available voice speakers.
94
+ *
95
+ * @returns Promise resolving to an array of voice objects, each containing at least a voiceId
96
+ *
97
+ * @example
98
+ * ```typescript
99
+ * const speakers = await voice.getSpeakers();
100
+ * // speakers = [{ voiceId: 'alloy' }, { voiceId: 'echo' }, ...]
101
+ * ```
102
+ */
103
+ getSpeakers(): Promise<Array<{
104
+ voiceId: string;
105
+ [key: string]: any;
106
+ }>>;
107
+ /**
108
+ * Disconnects from the OpenAI realtime session and cleans up resources.
109
+ * Should be called when you're done with the voice instance.
110
+ *
111
+ * @example
112
+ * ```typescript
113
+ * voice.close(); // Disconnects and cleans up
114
+ * ```
115
+ */
116
+ close(): void;
117
+ /**
118
+ * Equips the voice instance with a set of tools.
119
+ * Tools allow the model to perform additional actions during conversations.
120
+ *
121
+ * @param tools - Optional tools configuration to add
122
+ * @returns Nothing; the declared return type is `void`
123
+ *
124
+ * @example
125
+ * ```typescript
126
+ * const tools = {
127
+ * search: async (query: string) => { ... },
128
+ * calculate: (expression: string) => { ... }
129
+ * };
130
+ * voice.addTools(tools);
131
+ * ```
132
+ */
133
+ addTools(tools?: TTools): void;
134
+ /**
135
+ * Emits a speaking event using the configured voice model.
136
+ * Can accept either a string or a readable stream as input.
137
+ *
138
+ * @param input - The text to convert to speech, or a readable stream containing the text
139
+ * @param options - Optional configuration for this specific speech request
140
+ * @param options.speaker - Override the voice to use for this specific request
141
+ *
142
+ * @throws {Error} If the input text is empty
143
+ *
144
+ * @example
145
+ * ```typescript
146
+ * // Simple text to speech
147
+ * await voice.speak('Hello world');
148
+ *
149
+ * // With custom voice
150
+ * await voice.speak('Hello world', { speaker: 'echo' });
151
+ *
152
+ * // Using a stream
153
+ * const stream = fs.createReadStream('text.txt');
154
+ * await voice.speak(stream);
155
+ * ```
156
+ */
157
+ speak(input: string | NodeJS.ReadableStream, options?: {
158
+ speaker?: Realtime.Voice;
159
+ }): Promise<void>;
160
+ /**
161
+ * Updates the session configuration for the voice instance.
162
+ * This can be used to modify voice settings, turn detection, and other parameters.
163
+ *
164
+ * @param sessionConfig - New session configuration to apply
165
+ *
166
+ * @example
167
+ * ```typescript
168
+ * voice.updateConfig({
169
+ * voice: 'echo',
170
+ * turn_detection: {
171
+ * type: 'server_vad',
172
+ * threshold: 0.5,
173
+ * silence_duration_ms: 1000
174
+ * }
175
+ * });
176
+ * ```
177
+ */
178
+ updateConfig(sessionConfig: Realtime.SessionConfig): void;
179
+ /**
180
+ * Processes audio input for speech recognition.
181
+ * Takes a readable stream of audio data and emits a writing event.
182
+ * The output of the writing event is int16 audio data.
183
+ *
184
+ * @param audioData - Readable stream containing the audio data to process
185
+ * Note: the declared signature accepts only `audioData`; no options argument is taken.
186
+ *
187
+ * @throws {Error} If the audio data format is not supported
188
+ *
189
+ * @example
190
+ * ```typescript
191
+ * // Process audio from a file
192
+ * const audioStream = fs.createReadStream('audio.raw');
193
+ * await voice.listen(audioStream);
194
+ *
195
+ * // Process audio with options
196
+ * await voice.listen(microphoneStream, {
197
+ * format: 'int16',
198
+ * sampleRate: 24000
199
+ * });
200
+ * ```
201
+ */
202
+ listen(audioData: NodeJS.ReadableStream): Promise<void>;
203
+ /**
204
+ * Establishes a connection to the OpenAI realtime service.
205
+ * Must be called before using speak, listen, or send functions.
206
+ *
207
+ * @throws {Error} If connection fails or session creation times out
208
+ *
209
+ * @example
210
+ * ```typescript
211
+ * await voice.connect();
212
+ * // Now ready for voice interactions
213
+ * ```
214
+ */
215
+ connect(): Promise<void>;
216
+ /**
217
+ * Streams audio data in real-time to the OpenAI service.
218
+ * Useful for continuous audio streaming scenarios like live microphone input.
219
+ * Must be in 'open' state before calling this method.
220
+ *
221
+ * @param audioData - Readable stream or Int16Array of audio data to send
222
+ * @throws {Error} If audio format is not supported
223
+ *
224
+ * @example
225
+ * ```typescript
226
+ * // First connect
227
+ * await voice.connect();
228
+ *
229
+ * // Then relay audio
230
+ * const micStream = getMicrophoneStream();
231
+ * await voice.send(micStream);
232
+ * ```
233
+ */
234
+ send(audioData: NodeJS.ReadableStream | Int16Array): Promise<void>;
235
+ /**
236
+ * Sends a response to the OpenAI Realtime API.
237
+ *
238
+ * Trigger a response to the real-time session.
239
+ *
240
+ * @param {Object} params - The parameters object
241
+ * @param {Realtime.ResponseConfig} params.options - Configuration options for the response
242
+ * @returns {Promise<void>} A promise that resolves when the response has been sent
243
+ *
244
+ * @example
245
+ * // Send a simple text response
246
+ * await realtimeVoice.answer({
247
+ * options: {
248
+ * content: "Hello, how can I help you today?",
249
+ * voice: "alloy"
250
+ * }
251
+ * });
252
+ */
253
+ answer({ options }: {
254
+ options?: Realtime.ResponseConfig;
255
+ }): Promise<void>;
256
+ /**
257
+ * Registers an event listener for voice events.
258
+ * Available events: 'speaking', 'writing', 'error'
259
+ * Can listen to OpenAI Realtime events by prefixing with 'openAIRealtime:'
260
+ * Such as 'openAIRealtime:conversation.item.completed', 'openAIRealtime:conversation.updated', etc.
261
+ *
262
+ * @param event - Name of the event to listen for
263
+ * @param callback - Function to call when the event occurs
264
+ *
265
+ * @example
266
+ * ```typescript
267
+ * // Listen for speech events
268
+ * voice.on('speaking', (audioData: Int16Array) => {
269
+ * // Handle audio data
270
+ * });
271
+ *
272
+ * // Handle errors
273
+ * voice.on('error', (error: Error) => {
274
+ * console.error('Voice error:', error);
275
+ * });
276
+ * ```
277
+ */
278
+ on(event: string, callback: EventCallback): void;
279
+ /**
280
+ * Removes a previously registered event listener.
281
+ *
282
+ * @param event - Name of the event to stop listening to
283
+ * @param callback - The specific callback function to remove
284
+ *
285
+ * @example
286
+ * ```typescript
287
+ * // Create event handler
288
+ * const handleSpeech = (audioData: Int16Array) => {
289
+ * // Handle audio data
290
+ * };
291
+ *
292
+ * // Add listener
293
+ * voice.on('speaking', handleSpeech);
294
+ *
295
+ * // Later, remove the listener
296
+ * voice.off('speaking', handleSpeech);
297
+ * ```
298
+ */
299
+ off(event: string, callback: EventCallback): void;
300
+ /**
301
+ * Emit an event with arguments
302
+ * @param event Event name
303
+ * @param args Arguments to pass to the callbacks
304
+ */
305
+ private emit;
306
+ private setupEventListeners;
307
+ private int16ArrayToBase64;
308
+ }
309
+
310
+ export declare const transformTools: (tools?: TTools_2) => {
311
+ openaiTool: {
312
+ name: string;
313
+ description: string;
314
+ parameters: {
315
+ [key: string]: any;
316
+ };
317
+ };
318
+ execute: (args: any) => Promise<any>;
319
+ }[];
320
+
321
+ declare type TTools = ToolsInput;
322
+
323
+ declare type TTools_2 = ToolsInput;
324
+
325
+ export { }