@mastra/voice-openai-realtime 0.0.1-alpha.1
This diff shows the contents of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
- package/.turbo/turbo-build.log +23 -0
- package/LICENSE +44 -0
- package/README.md +153 -0
- package/dist/_tsup-dts-rollup.d.cts +325 -0
- package/dist/_tsup-dts-rollup.d.ts +325 -0
- package/dist/index.cjs +481 -0
- package/dist/index.d.cts +1 -0
- package/dist/index.d.ts +1 -0
- package/dist/index.js +479 -0
- package/eslint.config.js +6 -0
- package/package.json +41 -0
- package/src/index.test.ts +117 -0
- package/src/index.ts +543 -0
- package/src/utils.ts +87 -0
- package/tsconfig.json +5 -0
- package/vitest.config.ts +8 -0
@@ -0,0 +1,325 @@
+import { MastraVoice } from '@mastra/core/voice';
+import type { Realtime } from 'openai-realtime-api';
+import type { ToolsInput } from '@mastra/core/agent';
+
+/**
+ * Event callback function type
+ */
+declare type EventCallback = (...args: any[]) => void;
+
+export declare const isReadableStream: (obj: unknown) => unknown;
+
+export declare type OpenAIExecuteFunction = (args: any) => Promise<any>;
+
+/**
+ * OpenAIRealtimeVoice provides real-time voice interaction capabilities using OpenAI's
+ * WebSocket-based API. It supports:
+ * - Real-time text-to-speech
+ * - Speech-to-text (transcription)
+ * - Voice activity detection
+ * - Multiple voice options
+ * - Event-based audio streaming
+ *
+ * The class manages WebSocket connections, audio streaming, and event handling
+ * for seamless voice interactions.
+ *
+ * @extends MastraVoice
+ *
+ * @example
+ * ```typescript
+ * const voice = new OpenAIRealtimeVoice({
+ *   chatModel: {
+ *     apiKey: process.env.OPENAI_API_KEY,
+ *     model: 'gpt-4o-mini-realtime'
+ *   }
+ * });
+ *
+ * await voice.connect();
+ * voice.on('speaking', (audioData) => {
+ *   // Handle audio data
+ * });
+ *
+ * await voice.speak('Hello, how can I help you today?');
+ * ```
+ */
+export declare class OpenAIRealtimeVoice extends MastraVoice {
+    private client;
+    private state;
+    private events;
+    tools?: TTools;
+    /**
+     * Creates a new instance of OpenAIRealtimeVoice.
+     *
+     * @param options - Configuration options for the voice instance
+     * @param options.chatModel - Configuration for the chat model
+     * @param options.chatModel.model - The model ID to use (defaults to GPT-4 Mini Realtime)
+     * @param options.chatModel.apiKey - OpenAI API key. Falls back to process.env.OPENAI_API_KEY
+     * @param options.chatModel.tools - Tools configuration for the model
+     * @param options.chatModel.options - Additional options for the realtime client
+     * @param options.chatModel.options.sessionConfig - Session configuration overrides
+     * @param options.chatModel.options.url - Custom WebSocket URL
+     * @param options.chatModel.options.dangerouslyAllowAPIKeyInBrowser - Whether to allow API key in browser
+     * @param options.chatModel.options.debug - Enable debug logging
+     * @param options.chatModel.options.tools - Additional tools configuration
+     * @param options.speaker - Voice ID to use (defaults to 'alloy')
+     *
+     * @example
+     * ```typescript
+     * const voice = new OpenAIRealtimeVoice({
+     *   chatModel: {
+     *     apiKey: 'your-api-key',
+     *     model: 'gpt-4o-mini-realtime',
+     *   },
+     *   speaker: 'alloy'
+     * });
+     * ```
+     */
+    constructor({ chatModel, speaker, }?: {
+        chatModel?: {
+            model?: string;
+            apiKey?: string;
+            tools?: TTools;
+            options?: {
+                sessionConfig?: Realtime.SessionConfig;
+                url?: string;
+                dangerouslyAllowAPIKeyInBrowser?: boolean;
+                debug?: boolean;
+                tools?: TTools;
+            };
+        };
+        speaker?: Realtime.Voice;
+    });
+    /**
+     * Returns a list of available voice speakers.
+     *
+     * @returns Promise resolving to an array of voice objects, each containing at least a voiceId
+     *
+     * @example
+     * ```typescript
+     * const speakers = await voice.getSpeakers();
+     * // speakers = [{ voiceId: 'alloy' }, { voiceId: 'echo' }, ...]
+     * ```
+     */
+    getSpeakers(): Promise<Array<{
+        voiceId: string;
+        [key: string]: any;
+    }>>;
+    /**
+     * Disconnects from the OpenAI realtime session and cleans up resources.
+     * Should be called when you're done with the voice instance.
+     *
+     * @example
+     * ```typescript
+     * voice.close(); // Disconnects and cleans up
+     * ```
+     */
+    close(): void;
+    /**
+     * Equips the voice instance with a set of tools.
+     * Tools allow the model to perform additional actions during conversations.
+     *
+     * @param tools - Optional tools configuration to add
+     * @remarks The tools are transformed into a configuration ready for use with the model
+     *
+     * @example
+     * ```typescript
+     * const tools = {
+     *   search: async (query: string) => { ... },
+     *   calculate: (expression: string) => { ... }
+     * };
+     * voice.addTools(tools);
+     * ```
+     */
+    addTools(tools?: TTools): void;
+    /**
+     * Emits a speaking event using the configured voice model.
+     * Can accept either a string or a readable stream as input.
+     *
+     * @param input - The text to convert to speech, or a readable stream containing the text
+     * @param options - Optional configuration for this specific speech request
+     * @param options.speaker - Override the voice to use for this specific request
+     *
+     * @throws {Error} If the input text is empty
+     *
+     * @example
+     * ```typescript
+     * // Simple text to speech
+     * await voice.speak('Hello world');
+     *
+     * // With custom voice
+     * await voice.speak('Hello world', { speaker: 'echo' });
+     *
+     * // Using a stream
+     * const stream = fs.createReadStream('text.txt');
+     * await voice.speak(stream);
+     * ```
+     */
+    speak(input: string | NodeJS.ReadableStream, options?: {
+        speaker?: Realtime.Voice;
+    }): Promise<void>;
+    /**
+     * Updates the session configuration for the voice instance.
+     * This can be used to modify voice settings, turn detection, and other parameters.
+     *
+     * @param sessionConfig - New session configuration to apply
+     *
+     * @example
+     * ```typescript
+     * voice.updateConfig({
+     *   voice: 'echo',
+     *   turn_detection: {
+     *     type: 'server_vad',
+     *     threshold: 0.5,
+     *     silence_duration_ms: 1000
+     *   }
+     * });
+     * ```
+     */
+    updateConfig(sessionConfig: Realtime.SessionConfig): void;
+    /**
+     * Processes audio input for speech recognition.
+     * Takes a readable stream of audio data and emits a writing event.
+     * The output of the writing event is int16 audio data.
+     *
+     * @param audioData - Readable stream containing the audio data to process
+     * @param options - Optional configuration for audio processing
+     *
+     * @throws {Error} If the audio data format is not supported
+     *
+     * @example
+     * ```typescript
+     * // Process audio from a file
+     * const audioStream = fs.createReadStream('audio.raw');
+     * await voice.listen(audioStream);
+     *
+     * // Process audio with options
+     * await voice.listen(microphoneStream, {
+     *   format: 'int16',
+     *   sampleRate: 24000
+     * });
+     * ```
+     */
+    listen(audioData: NodeJS.ReadableStream): Promise<void>;
+    /**
+     * Establishes a connection to the OpenAI realtime service.
+     * Must be called before using the speak, listen, or send functions.
+     *
+     * @throws {Error} If connection fails or session creation times out
+     *
+     * @example
+     * ```typescript
+     * await voice.connect();
+     * // Now ready for voice interactions
+     * ```
+     */
+    connect(): Promise<void>;
+    /**
+     * Streams audio data in real-time to the OpenAI service.
+     * Useful for continuous audio streaming scenarios like live microphone input.
+     * Must be in 'open' state before calling this method.
+     *
+     * @param audioData - Readable stream of audio data to relay
+     * @throws {Error} If audio format is not supported
+     *
+     * @example
+     * ```typescript
+     * // First connect
+     * await voice.connect();
+     *
+     * // Then relay audio
+     * const micStream = getMicrophoneStream();
+     * await voice.send(micStream);
+     * ```
+     */
+    send(audioData: NodeJS.ReadableStream | Int16Array): Promise<void>;
+    /**
+     * Requests a response from the OpenAI Realtime API.
+     *
+     * Triggers the model to respond in the current realtime session.
+     *
+     * @param {Object} params - The parameters object
+     * @param {Realtime.ResponseConfig} params.options - Configuration options for the response
+     * @returns {Promise<void>} A promise that resolves when the response has been sent
+     *
+     * @example
+     * // Send a simple text response
+     * await realtimeVoice.answer({
+     *   options: {
+     *     content: "Hello, how can I help you today?",
+     *     voice: "alloy"
+     *   }
+     * });
+     */
+    answer({ options }: {
+        options?: Realtime.ResponseConfig;
+    }): Promise<void>;
+    /**
+     * Registers an event listener for voice events.
+     * Available events: 'speaking', 'writing', 'error'
+     * Can listen to OpenAI Realtime events by prefixing with 'openAIRealtime:'
+     * Such as 'openAIRealtime:conversation.item.completed', 'openAIRealtime:conversation.updated', etc.
+     *
+     * @param event - Name of the event to listen for
+     * @param callback - Function to call when the event occurs
+     *
+     * @example
+     * ```typescript
+     * // Listen for speech events
+     * voice.on('speaking', (audioData: Int16Array) => {
+     *   // Handle audio data
+     * });
+     *
+     * // Handle errors
+     * voice.on('error', (error: Error) => {
+     *   console.error('Voice error:', error);
+     * });
+     * ```
+     */
+    on(event: string, callback: EventCallback): void;
+    /**
+     * Removes a previously registered event listener.
+     *
+     * @param event - Name of the event to stop listening to
+     * @param callback - The specific callback function to remove
+     *
+     * @example
+     * ```typescript
+     * // Create event handler
+     * const handleSpeech = (audioData: Int16Array) => {
+     *   // Handle audio data
+     * };
+     *
+     * // Add listener
+     * voice.on('speaking', handleSpeech);
+     *
+     * // Later, remove the listener
+     * voice.off('speaking', handleSpeech);
+     * ```
+     */
+    off(event: string, callback: EventCallback): void;
+    /**
+     * Emit an event with arguments
+     * @param event Event name
+     * @param args Arguments to pass to the callbacks
+     */
+    private emit;
+    private setupEventListeners;
+    private int16ArrayToBase64;
+}
+
+export declare const transformTools: (tools?: TTools_2) => {
+    openaiTool: {
+        name: string;
+        description: string;
+        parameters: {
+            [key: string]: any;
+        };
+    };
+    execute: (args: any) => Promise<any>;
+}[];
+
+declare type TTools = ToolsInput;
+
+declare type TTools_2 = ToolsInput;
+
+export { }
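Taken together, the declarations above imply a call flow of construct, register listeners, connect, then speak or stream audio, and finally close. The following is a minimal usage sketch based only on those declarations; the event payload shapes and the `getMicrophoneStream` helper are assumptions for illustration, not part of the package.

```typescript
import { OpenAIRealtimeVoice } from '@mastra/voice-openai-realtime';

// Assumes OPENAI_API_KEY is set in the environment (the constructor
// falls back to process.env.OPENAI_API_KEY per the JSDoc above).
const voice = new OpenAIRealtimeVoice({
  chatModel: { model: 'gpt-4o-mini-realtime' },
  speaker: 'alloy',
});

// Register listeners before connecting so no events are missed.
voice.on('speaking', (audioData: Int16Array) => {
  // Audio chunks from the model; pipe these to your audio output.
});
voice.on('error', (error: Error) => {
  console.error('Voice error:', error);
});

// connect() must complete before speak(), listen(), or send().
await voice.connect();
await voice.speak('Hello, how can I help you today?');

// Optionally relay continuous audio (hypothetical helper):
// const micStream = getMicrophoneStream();
// await voice.send(micStream);

// Disconnect and clean up when finished.
voice.close();
```

The exported `transformTools` helper suggests that tools passed via `addTools` or the constructor are converted into OpenAI's function-calling shape (`name`, `description`, `parameters`) paired with an `execute` callback.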