@mastra/voice-openai-realtime 0.11.12 → 0.12.0-beta.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +67 -13
- package/README.md +1 -1
- package/dist/docs/README.md +32 -0
- package/dist/docs/SKILL.md +33 -0
- package/dist/docs/SOURCE_MAP.json +6 -0
- package/dist/docs/agents/01-adding-voice.md +352 -0
- package/dist/docs/voice/01-overview.md +1019 -0
- package/dist/docs/voice/02-speech-to-speech.md +106 -0
- package/dist/docs/voice/03-reference.md +1096 -0
- package/dist/index.cjs +4 -4
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +4 -4
- package/dist/index.js +4 -4
- package/dist/index.js.map +1 -1
- package/package.json +13 -15
package/dist/docs/voice/03-reference.md
@@ -0,0 +1,1096 @@

# Voice API Reference

> API reference for voice - 12 entries

---

## Reference: OpenAI Realtime Voice

> Documentation for the OpenAIRealtimeVoice class, providing real-time text-to-speech and speech-to-text capabilities via WebSockets.

The OpenAIRealtimeVoice class provides real-time voice interaction capabilities using OpenAI's WebSocket-based API. It supports real-time speech-to-speech, voice activity detection, and event-based audio streaming.

## Usage Example

```typescript
import { OpenAIRealtimeVoice } from "@mastra/voice-openai-realtime";
import { playAudio, getMicrophoneStream } from "@mastra/node-audio";

// Initialize with default configuration using environment variables
const voice = new OpenAIRealtimeVoice();

// Or initialize with specific configuration
const voiceWithConfig = new OpenAIRealtimeVoice({
  apiKey: "your-openai-api-key",
  model: "gpt-5.1-realtime-preview-2024-12-17",
  speaker: "alloy", // Default voice
});

voiceWithConfig.updateSession({
  turn_detection: {
    type: "server_vad",
    threshold: 0.6,
    silence_duration_ms: 1200,
  },
});

// Establish connection
await voice.connect();

// Set up event listeners
voice.on("speaker", ({ audio }) => {
  // Handle audio data (Int16Array), PCM format by default
  playAudio(audio);
});

voice.on("writing", ({ text, role }) => {
  // Handle transcribed text
  console.log(`${role}: ${text}`);
});

// Convert text to speech
await voice.speak("Hello, how can I help you today?", {
  speaker: "echo", // Override default voice
});

// Process audio input
const microphoneStream = getMicrophoneStream();
await voice.send(microphoneStream);

// When done, disconnect
voice.close();
```

## Configuration

### Constructor Options

### Voice Activity Detection (VAD) Configuration

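A rough sketch of tuning server-side VAD at runtime (field names follow the `updateSession` call in the usage example above; this is not an exhaustive list of options):

```typescript
// Sketch: adjust server-side voice activity detection on an existing instance.
voice.updateSession({
  turn_detection: {
    type: "server_vad",        // let the server decide when the user's turn ends
    threshold: 0.6,            // higher = less sensitive to background noise
    silence_duration_ms: 1200, // how long a pause ends the turn
  },
});
```
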
## Methods

### connect()

Establishes a connection to the OpenAI realtime service. Must be called before using speak, listen, or send functions.

### speak()

Emits a speaking event using the configured voice model. Can accept either a string or a readable stream as input.

Returns: `Promise<void>`

### listen()

Processes audio input for speech recognition. Takes a readable stream of audio data and emits a 'listening' event with the transcribed text.

Returns: `Promise<void>`

### send()

Streams audio data in real-time to the OpenAI service for continuous audio streaming scenarios like live microphone input.

Returns: `Promise<void>`

### updateConfig()

Updates the session configuration for the voice instance. This can be used to modify voice settings, turn detection, and other parameters.

Returns: `void`

### addTools()

Adds a set of tools to the voice instance. Tools allow the model to perform additional actions during conversations. When OpenAIRealtimeVoice is added to an Agent, any tools configured for the Agent will automatically be available to the voice interface.

Returns: `void`

### close()

Disconnects from the OpenAI realtime session and cleans up resources. Should be called when you're done with the voice instance.

Returns: `void`

### getSpeakers()

Returns a list of available voice speakers.

Returns: `Promise<Array<{ voiceId: string; [key: string]: any }>>`

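A minimal sketch of listing voices (the return shape follows the signature above; any extra metadata fields per entry are provider-dependent):

```typescript
// List the voices this provider exposes and use one for speak().
const speakers = await voice.getSpeakers();
console.log(speakers.map((s) => s.voiceId)); // e.g. ["alloy", "ash", ...]

await voice.speak("Testing another voice", { speaker: speakers[0].voiceId });
```
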
### on()

Registers an event listener for voice events.

Returns: `void`

### off()

Removes a previously registered event listener.

Returns: `void`

## Events

The OpenAIRealtimeVoice class emits the following events:

### OpenAI Realtime Events

You can also listen to [OpenAI Realtime utility events](https://github.com/openai/openai-realtime-api-beta#reference-client-utility-events) by prefixing with 'openAIRealtime:':

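For example (a sketch only; the exact utility event names come from the openai-realtime-api-beta client linked above, so check that reference for the current list):

```typescript
// Listen to a raw client utility event by adding the "openAIRealtime:" prefix.
voice.on("openAIRealtime:conversation.updated", (event) => {
  console.log("conversation updated", event);
});
```
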
## Available Voices

The following voice options are available:

- `alloy`: Neutral and balanced
- `ash`: Clear and precise
- `ballad`: Melodic and smooth
- `coral`: Warm and friendly
- `echo`: Resonant and deep
- `sage`: Calm and thoughtful
- `shimmer`: Bright and energetic
- `verse`: Versatile and expressive

## Notes

- API keys can be provided via constructor options or the `OPENAI_API_KEY` environment variable
- The OpenAI Realtime Voice API uses WebSockets for real-time communication
- Server-side Voice Activity Detection (VAD) provides better accuracy for speech detection
- All audio data is processed as Int16Array format
- The voice instance must be connected with `connect()` before using other methods
- Always call `close()` when done to properly clean up resources
- Memory management is handled by the OpenAI Realtime API

---

## Reference: voice.addInstructions()

> Documentation for the addInstructions() method available in voice providers, which adds instructions to guide the voice model

The `addInstructions()` method equips a voice provider with instructions that guide the model's behavior during real-time interactions. This is particularly useful for real-time voice providers that maintain context across a conversation.

## Usage Example

```typescript
import { OpenAIRealtimeVoice } from "@mastra/voice-openai-realtime";
import { Agent } from "@mastra/core/agent";

// Initialize a real-time voice provider
const voice = new OpenAIRealtimeVoice({
  realtimeConfig: {
    model: "gpt-5.1-realtime",
    apiKey: process.env.OPENAI_API_KEY,
  },
});

// Create an agent with the voice provider
const agent = new Agent({
  name: "Customer Support Agent",
  instructions:
    "You are a helpful customer support agent for a software company.",
  model: "openai/gpt-5.1",
  voice,
});

// Add additional instructions to the voice provider
voice.addInstructions(`
When speaking to customers:
- Always introduce yourself as the customer support agent
- Speak clearly and concisely
- Ask clarifying questions when needed
- Summarize the conversation at the end
`);

// Connect to the real-time service
await voice.connect();
```

## Parameters

<br />

## Return Value

This method does not return a value.

## Notes

- Instructions are most effective when they are clear, specific, and relevant to the voice interaction
- This method is primarily used with real-time voice providers that maintain conversation context
- If called on a voice provider that doesn't support instructions, it will log a warning and do nothing
- Instructions added with this method are typically combined with any instructions provided by an associated Agent
- For best results, add instructions before starting a conversation (before calling `connect()`)
- Multiple calls to `addInstructions()` may either replace or append to existing instructions, depending on the provider implementation

---

## Reference: voice.addTools()

> Documentation for the addTools() method available in voice providers, which equips voice models with function calling capabilities.

The `addTools()` method equips a voice provider with tools (functions) that can be called by the model during real-time interactions. This enables voice assistants to perform actions like searching for information, making calculations, or interacting with external systems.

## Usage Example

```typescript
import { OpenAIRealtimeVoice } from "@mastra/voice-openai-realtime";
import { createTool } from "@mastra/core/tools";
import { z } from "zod";

// Define tools
const weatherTool = createTool({
  id: "getWeather",
  description: "Get the current weather for a location",
  inputSchema: z.object({
    location: z.string().describe("The city and state, e.g. San Francisco, CA"),
  }),
  outputSchema: z.object({
    message: z.string(),
  }),
  execute: async (inputData) => {
    // Fetch weather data from an API
    const response = await fetch(
      `https://api.weather.com?location=${encodeURIComponent(inputData.location)}`,
    );
    const data = await response.json();
    return {
      message: `The current temperature in ${inputData.location} is ${data.temperature}°F with ${data.conditions}.`,
    };
  },
});

// Initialize a real-time voice provider
const voice = new OpenAIRealtimeVoice({
  realtimeConfig: {
    model: "gpt-5.1-realtime",
    apiKey: process.env.OPENAI_API_KEY,
  },
});

// Add tools to the voice provider
voice.addTools({
  getWeather: weatherTool,
});

// Connect to the real-time service
await voice.connect();
```

## Parameters

<br />

## Return Value

This method does not return a value.

## Notes

- Tools must follow the Mastra tool format with name, description, input schema, and execute function
- This method is primarily used with real-time voice providers that support function calling
- If called on a voice provider that doesn't support tools, it will log a warning and do nothing
- Tools added with this method are typically combined with any tools provided by an associated Agent
- For best results, add tools before starting a conversation (before calling `connect()`)
- The voice provider will automatically handle the invocation of tool handlers when the model decides to use them
- Multiple calls to `addTools()` may either replace or merge with existing tools, depending on the provider implementation

---

## Reference: voice.answer()

> Documentation for the answer() method available in real-time voice providers, which triggers the voice provider to generate a response.

The `answer()` method is used in real-time voice providers to trigger the AI to generate a response. This method is particularly useful in speech-to-speech conversations where you need to explicitly signal the AI to respond after receiving user input.

## Usage Example

```typescript
import { OpenAIRealtimeVoice } from "@mastra/voice-openai-realtime";
import { getMicrophoneStream } from "@mastra/node-audio";
import Speaker from "@mastra/node-speaker";

const speaker = new Speaker({
  sampleRate: 24100, // Audio sample rate in Hz - standard for high-quality audio on MacBook Pro
  channels: 1, // Mono audio output (as opposed to stereo which would be 2)
  bitDepth: 16, // Bit depth for audio quality - CD quality standard (16-bit resolution)
});

// Initialize a real-time voice provider
const voice = new OpenAIRealtimeVoice({
  realtimeConfig: {
    model: "gpt-5.1",
    apiKey: process.env.OPENAI_API_KEY,
  },
  speaker: "alloy", // Default voice
});

// Connect to the real-time service
await voice.connect();

// Register event listener for responses
voice.on("speaker", (stream) => {
  // Handle audio response
  stream.pipe(speaker);
});

// Send user audio input
const microphoneStream = getMicrophoneStream();
await voice.send(microphoneStream);

// Trigger the AI to respond
await voice.answer();
```

## Parameters

<br />

## Return Value

Returns a `Promise<void>` that resolves when the response has been triggered.

## Notes

- This method is only implemented by real-time voice providers that support speech-to-speech capabilities
- If called on a voice provider that doesn't support this functionality, it will log a warning and resolve immediately
- The response audio will typically be emitted through the 'speaking' event rather than returned directly
- For providers that support it, you can use this method to send a specific response instead of having the AI generate one
- This method is commonly used in conjunction with `send()` to create a conversational flow

---

## Reference: voice.close()

> Documentation for the close() method available in voice providers, which disconnects from real-time voice services.

The `close()` method disconnects from a real-time voice service and cleans up resources. This is important for properly ending voice sessions and preventing resource leaks.

## Usage Example

```typescript
import { OpenAIRealtimeVoice } from "@mastra/voice-openai-realtime";
import { getMicrophoneStream } from "@mastra/node-audio";

// Initialize a real-time voice provider
const voice = new OpenAIRealtimeVoice({
  realtimeConfig: {
    model: "gpt-5.1-realtime",
    apiKey: process.env.OPENAI_API_KEY,
  },
});

// Connect to the real-time service
await voice.connect();

// Start a conversation
voice.speak("Hello, I'm your AI assistant!");

// Stream audio from a microphone
const microphoneStream = getMicrophoneStream();
voice.send(microphoneStream);

// When the conversation is complete
setTimeout(() => {
  // Close the connection and clean up resources
  voice.close();
  console.log("Voice session ended");
}, 60000); // End after 1 minute
```

## Parameters

This method does not accept any parameters.

## Return Value

This method does not return a value.

## Notes

- Always call `close()` when you're done with a real-time voice session to free up resources
- After calling `close()`, you'll need to call `connect()` again if you want to start a new session
- This method is primarily used with real-time voice providers that maintain persistent connections
- If called on a voice provider that doesn't support real-time connections, it will log a warning and do nothing
- Failing to close connections can lead to resource leaks and potential billing issues with voice service providers

---

## Reference: voice.connect()

> Documentation for the connect() method available in real-time voice providers, which establishes a connection for speech-to-speech communication.

The `connect()` method establishes a WebSocket or WebRTC connection for real-time speech-to-speech communication. This method must be called before using other real-time features like `send()` or `answer()`.

## Usage Example

```typescript
import { OpenAIRealtimeVoice } from "@mastra/voice-openai-realtime";
import Speaker from "@mastra/node-speaker";

const speaker = new Speaker({
  sampleRate: 24100, // Audio sample rate in Hz - standard for high-quality audio on MacBook Pro
  channels: 1, // Mono audio output (as opposed to stereo which would be 2)
  bitDepth: 16, // Bit depth for audio quality - CD quality standard (16-bit resolution)
});

// Initialize a real-time voice provider
const voice = new OpenAIRealtimeVoice({
  realtimeConfig: {
    model: "gpt-5.1-realtime",
    apiKey: process.env.OPENAI_API_KEY,
    options: {
      sessionConfig: {
        turn_detection: {
          type: "server_vad",
          threshold: 0.6,
          silence_duration_ms: 1200,
        },
      },
    },
  },
  speaker: "alloy", // Default voice
});

// Connect to the real-time service
await voice.connect();

// Now you can use real-time features
voice.on("speaker", (stream) => {
  stream.pipe(speaker);
});

// With connection options
await voice.connect({
  timeout: 10000, // 10 seconds timeout
  reconnect: true,
});
```

## Parameters

## Return Value

Returns a `Promise<void>` that resolves when the connection is successfully established.

## Provider-Specific Options

Each real-time voice provider may support different options for the `connect()` method:

### OpenAI Realtime

## Using with CompositeVoice

When using `CompositeVoice`, the `connect()` method delegates to the configured real-time provider:

```typescript
import { CompositeVoice } from "@mastra/core/voice";
import { OpenAIRealtimeVoice } from "@mastra/voice-openai-realtime";

const realtimeVoice = new OpenAIRealtimeVoice();
const voice = new CompositeVoice({
  realtimeProvider: realtimeVoice,
});

// This will use the OpenAIRealtimeVoice provider
await voice.connect();
```

## Notes

- This method is only implemented by real-time voice providers that support speech-to-speech capabilities
- If called on a voice provider that doesn't support this functionality, it will log a warning and resolve immediately
- The connection must be established before using other real-time methods like `send()` or `answer()`
- When you're done with the voice instance, call `close()` to properly clean up resources
- Some providers may automatically reconnect on connection loss, depending on their implementation
- Connection errors will typically be thrown as exceptions that should be caught and handled, as sketched below

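A minimal sketch of catching a failed connection (the error shape and message are provider-dependent; this only illustrates the try/catch pattern):

```typescript
// Wrap connect() so a bad API key or network failure doesn't crash the app.
try {
  await voice.connect();
} catch (error) {
  console.error("Failed to connect to the realtime service:", error);
  // e.g. retry with backoff, or fall back to separate STT/TTS providers
}
```
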
## Related Methods

- [voice.send()](./voice.send) - Sends audio data to the voice provider
- [voice.answer()](./voice.answer) - Triggers the voice provider to respond
- [voice.close()](./voice.close) - Disconnects from the real-time service
- [voice.on()](./voice.on) - Registers an event listener for voice events

---

## Reference: voice.listen()

> Documentation for the listen() method available in all Mastra voice providers, which converts speech to text.

The `listen()` method is a core function available in all Mastra voice providers that converts speech to text. It takes an audio stream as input and returns the transcribed text.

## Parameters

## Return Value

Returns one of the following:

- `Promise<string>`: A promise that resolves to the transcribed text
- `Promise<NodeJS.ReadableStream>`: A promise that resolves to a stream of transcribed text (for streaming transcription)
- `Promise<void>`: For real-time providers that emit 'writing' events instead of returning text directly

## Provider-Specific Options

Each voice provider may support additional options specific to their implementation. Here are some examples:

### OpenAI

### Google

### Deepgram

## Usage Example

```typescript
import { OpenAIVoice } from "@mastra/voice-openai";
import { getMicrophoneStream } from "@mastra/node-audio";
import { createReadStream } from "fs";
import path from "path";

// Initialize a voice provider
const voice = new OpenAIVoice({
  listeningModel: {
    name: "whisper-1",
    apiKey: process.env.OPENAI_API_KEY,
  },
});

// Basic usage with a file stream
const audioFilePath = path.join(process.cwd(), "audio.mp3");
const audioStream = createReadStream(audioFilePath);
const transcript = await voice.listen(audioStream, {
  filetype: "mp3",
});
console.log("Transcribed text:", transcript);

// Using a microphone stream
const microphoneStream = getMicrophoneStream(); // Assume this function gets audio input
const transcription = await voice.listen(microphoneStream);

// With provider-specific options
const transcriptWithOptions = await voice.listen(audioStream, {
  language: "en",
  prompt: "This is a conversation about artificial intelligence.",
});
```

## Using with CompositeVoice

When using `CompositeVoice`, the `listen()` method delegates to the configured listening provider:

```typescript
import { CompositeVoice } from "@mastra/core/voice";
import { OpenAIVoice } from "@mastra/voice-openai";
import { PlayAIVoice } from "@mastra/voice-playai";

const voice = new CompositeVoice({
  input: new OpenAIVoice(),
  output: new PlayAIVoice(),
});

// This will use the OpenAIVoice provider
const transcript = await voice.listen(audioStream);
```

### Using AI SDK Model Providers

You can also use AI SDK transcription models directly with `CompositeVoice`:

```typescript
import { CompositeVoice } from "@mastra/core/voice";
import { PlayAIVoice } from "@mastra/voice-playai";
import { openai } from "@ai-sdk/openai";
import { groq } from "@ai-sdk/groq";

// Use AI SDK transcription models
const voice = new CompositeVoice({
  input: openai.transcription('whisper-1'), // AI SDK model
  output: new PlayAIVoice(), // Mastra provider
});

// Works the same way
const transcript = await voice.listen(audioStream);

// Provider-specific options can be passed through
const transcriptWithOptions = await voice.listen(audioStream, {
  providerOptions: {
    openai: {
      language: 'en',
      prompt: 'This is about AI',
    }
  }
});
```

See the [CompositeVoice reference](https://mastra.ai/reference/v1/voice/composite-voice) for more details on AI SDK integration.

## Realtime Voice Providers

When using realtime voice providers like `OpenAIRealtimeVoice`, the `listen()` method behaves differently:

- Instead of returning transcribed text, it emits 'writing' events with the transcribed text
- You need to register an event listener to receive the transcription

```typescript
import { OpenAIRealtimeVoice } from "@mastra/voice-openai-realtime";
import { getMicrophoneStream } from "@mastra/node-audio";

const voice = new OpenAIRealtimeVoice();
await voice.connect();

// Register event listener for transcription
voice.on("writing", ({ text, role }) => {
  console.log(`${role}: ${text}`);
});

// This will emit 'writing' events instead of returning text
const microphoneStream = getMicrophoneStream();
await voice.listen(microphoneStream);
```

## Notes

- Not all voice providers support speech-to-text functionality (e.g., PlayAI, Speechify)
- The behavior of `listen()` may vary slightly between providers, but all implementations follow the same basic interface
- When using a realtime voice provider, the method might not return text directly but instead emit a 'writing' event
- The audio format supported depends on the provider. Common formats include MP3, WAV, and M4A
- Some providers support streaming transcription, where text is returned as it's transcribed
- For best performance, consider closing or ending the audio stream when you're done with it

## Related Methods

- [voice.speak()](./voice.speak) - Converts text to speech
- [voice.send()](./voice.send) - Sends audio data to the voice provider in real-time
- [voice.on()](./voice.on) - Registers an event listener for voice events

---

## Reference: voice.off()

> Documentation for the off() method available in voice providers, which removes event listeners for voice events.

The `off()` method removes event listeners previously registered with the `on()` method. This is particularly useful for cleaning up resources and preventing memory leaks in long-running applications with real-time voice capabilities.

## Usage Example

```typescript
import { OpenAIRealtimeVoice } from "@mastra/voice-openai-realtime";
import chalk from "chalk";

// Initialize a real-time voice provider
const voice = new OpenAIRealtimeVoice({
  realtimeConfig: {
    model: "gpt-5.1-realtime",
    apiKey: process.env.OPENAI_API_KEY,
  },
});

// Connect to the real-time service
await voice.connect();

// Define the callback function
const writingCallback = ({ text, role }) => {
  if (role === "user") {
    process.stdout.write(chalk.green(text));
  } else {
    process.stdout.write(chalk.blue(text));
  }
};

// Register event listener
voice.on("writing", writingCallback);

// Later, when you want to remove the listener
voice.off("writing", writingCallback);
```

## Parameters

<br />

## Return Value

This method does not return a value.

## Notes

- The callback passed to `off()` must be the same function reference that was passed to `on()`
- If the callback is not found, the method will have no effect
- This method is primarily used with real-time voice providers that support event-based communication
- If called on a voice provider that doesn't support events, it will log a warning and do nothing
- Removing event listeners is important for preventing memory leaks in long-running applications

---

## Reference: voice.on()

> Documentation for the on() method available in voice providers, which registers event listeners for voice events.

The `on()` method registers event listeners for various voice events. This is particularly important for real-time voice providers, where events are used to communicate transcribed text, audio responses, and other state changes.

## Usage Example

```typescript
import { OpenAIRealtimeVoice } from "@mastra/voice-openai-realtime";
import Speaker from "@mastra/node-speaker";
import chalk from "chalk";

// Initialize a real-time voice provider
const voice = new OpenAIRealtimeVoice({
  realtimeConfig: {
    model: "gpt-5.1-realtime",
    apiKey: process.env.OPENAI_API_KEY,
  },
});

// Connect to the real-time service
await voice.connect();

// Register event listener for transcribed text
voice.on("writing", (event) => {
  if (event.role === "user") {
    process.stdout.write(chalk.green(event.text));
  } else {
    process.stdout.write(chalk.blue(event.text));
  }
});

// Listen for audio data and play it
const speaker = new Speaker({
  sampleRate: 24100,
  channels: 1,
  bitDepth: 16,
});

voice.on("speaker", (stream) => {
  stream.pipe(speaker);
});

// Register event listener for errors
voice.on("error", ({ message, code, details }) => {
  console.error(`Error ${code}: ${message}`, details);
});
```

## Parameters

<br />

## Return Value

This method does not return a value.

## Events

For a comprehensive list of events and their payload structures, see the [Voice Events](./voice.events) documentation.

Common events include:

- `speaking`: Emitted when audio data is available
- `speaker`: Emitted with a stream that can be piped to audio output
- `writing`: Emitted when text is transcribed or generated
- `error`: Emitted when an error occurs
- `tool-call-start`: Emitted when a tool is about to be executed
- `tool-call-result`: Emitted when a tool execution is complete (see the sketch below)

Different voice providers may support different sets of events with varying payload structures.

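A sketch of observing the tool-call events (the payload fields are provider-dependent, so log the whole object rather than assuming a shape):

```typescript
// Observe tool usage during a realtime conversation.
voice.on("tool-call-start", (payload) => {
  console.log("Tool call started:", payload);
});

voice.on("tool-call-result", (payload) => {
  console.log("Tool call finished:", payload);
});
```
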
## Using with CompositeVoice

When using `CompositeVoice`, the `on()` method delegates to the configured real-time provider:

```typescript
import { CompositeVoice } from "@mastra/core/voice";
import { OpenAIRealtimeVoice } from "@mastra/voice-openai-realtime";
import Speaker from "@mastra/node-speaker";

const speaker = new Speaker({
  sampleRate: 24100, // Audio sample rate in Hz - standard for high-quality audio on MacBook Pro
  channels: 1, // Mono audio output (as opposed to stereo which would be 2)
  bitDepth: 16, // Bit depth for audio quality - CD quality standard (16-bit resolution)
});

const realtimeVoice = new OpenAIRealtimeVoice();
const voice = new CompositeVoice({
  realtimeProvider: realtimeVoice,
});

// Connect to the real-time service
await voice.connect();

// This will register the event listener with the OpenAIRealtimeVoice provider
voice.on("speaker", (stream) => {
  stream.pipe(speaker);
});
```

## Notes

- This method is primarily used with real-time voice providers that support event-based communication
- If called on a voice provider that doesn't support events, it will log a warning and do nothing
- Event listeners should be registered before calling methods that might emit events
- To remove an event listener, use the [voice.off()](./voice.off) method with the same event name and callback function
- Multiple listeners can be registered for the same event
- The callback function will receive different data depending on the event type (see [Voice Events](./voice.events))
- For best performance, consider removing event listeners when they are no longer needed

---

## Reference: voice.send()

> Documentation for the send() method available in real-time voice providers, which streams audio data for continuous processing.

The `send()` method streams audio data in real-time to voice providers for continuous processing. This method is essential for real-time speech-to-speech conversations, allowing you to send microphone input directly to the AI service.

## Usage Example

```typescript
import { OpenAIRealtimeVoice } from "@mastra/voice-openai-realtime";
import Speaker from "@mastra/node-speaker";
import { getMicrophoneStream } from "@mastra/node-audio";

const speaker = new Speaker({
  sampleRate: 24100, // Audio sample rate in Hz - standard for high-quality audio on MacBook Pro
  channels: 1, // Mono audio output (as opposed to stereo which would be 2)
  bitDepth: 16, // Bit depth for audio quality - CD quality standard (16-bit resolution)
});

// Initialize a real-time voice provider
const voice = new OpenAIRealtimeVoice({
  realtimeConfig: {
    model: "gpt-5.1-realtime",
    apiKey: process.env.OPENAI_API_KEY,
  },
});

// Connect to the real-time service
await voice.connect();

// Set up event listeners for responses
voice.on("writing", ({ text, role }) => {
  console.log(`${role}: ${text}`);
});

voice.on("speaker", (stream) => {
  stream.pipe(speaker);
});

// Get microphone stream (implementation depends on your environment)
const microphoneStream = getMicrophoneStream();

// Send audio data to the voice provider
await voice.send(microphoneStream);

// You can also send audio data as Int16Array
const audioBuffer = getAudioBuffer(); // Assume this returns Int16Array
await voice.send(audioBuffer);
```

## Parameters

<br />

## Return Value

Returns a `Promise<void>` that resolves when the audio data has been accepted by the voice provider.

## Notes

- This method is only implemented by real-time voice providers that support speech-to-speech capabilities
- If called on a voice provider that doesn't support this functionality, it will log a warning and resolve immediately
- You must call `connect()` before using `send()` to establish the WebSocket connection
- The audio format requirements depend on the specific voice provider
- For continuous conversation, you typically call `send()` to transmit user audio, then `answer()` to trigger the AI response, as sketched below
- The provider will typically emit 'writing' events with transcribed text as it processes the audio
- When the AI responds, the provider will emit 'speaking' events with the audio response

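A minimal sketch of that loop (assumes the `voice`, `microphoneStream`, and event listeners set up in the usage example above):

```typescript
// One conversational turn: stream the user's audio, then ask the model to reply.
await voice.send(microphoneStream);
await voice.answer();
// The reply arrives via the 'speaker'/'writing' events registered earlier.
```
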
---
## Reference: voice.speak()

> Documentation for the speak() method available in all Mastra voice providers, which converts text to speech.

The `speak()` method is a core function available in all Mastra voice providers that converts text to speech. It takes text input and returns an audio stream that can be played or saved.

## Parameters

## Return Value

Returns a `Promise<NodeJS.ReadableStream | void>` where:

- `NodeJS.ReadableStream`: A stream of audio data that can be played or saved
- `void`: When using a realtime voice provider that emits audio through events instead of returning it directly

## Provider-Specific Options

Each voice provider may support additional options specific to their implementation. Here are some examples:

### OpenAI

### ElevenLabs

### Google

### Murf

## Usage Example

```typescript
import { OpenAIVoice } from "@mastra/voice-openai";
import { Readable } from "stream";

// Initialize a voice provider
const voice = new OpenAIVoice({
  speaker: "alloy", // Default voice
});

// Basic usage with default settings
const audioStream = await voice.speak("Hello, world!");

// Using a different voice for this specific request
const audioStreamWithDifferentVoice = await voice.speak("Hello again!", {
  speaker: "nova",
});

// Using provider-specific options
const audioStreamWithOptions = await voice.speak("Hello with options!", {
  speaker: "echo",
  speed: 1.2, // OpenAI-specific option
});

// Using a text stream as input
const textStream = Readable.from(["Hello", " from", " a", " stream!"]);
const audioStreamFromTextStream = await voice.speak(textStream);
```

## Using with CompositeVoice

When using `CompositeVoice`, the `speak()` method delegates to the configured speaking provider:

```typescript
import { CompositeVoice } from "@mastra/core/voice";
import { OpenAIVoice } from "@mastra/voice-openai";
import { PlayAIVoice } from "@mastra/voice-playai";

const voice = new CompositeVoice({
  output: new PlayAIVoice(),
  input: new OpenAIVoice(),
});

// This will use the PlayAIVoice provider
const audioStream = await voice.speak("Hello, world!");
```

### Using AI SDK Model Providers

You can also use AI SDK speech models directly with `CompositeVoice`:

```typescript
import { CompositeVoice } from "@mastra/core/voice";
import { openai } from "@ai-sdk/openai";
import { elevenlabs } from "@ai-sdk/elevenlabs";

// Use AI SDK speech models
const voice = new CompositeVoice({
  output: elevenlabs.speech('eleven_turbo_v2'), // AI SDK model
  input: openai.transcription('whisper-1'), // AI SDK model
});

// Works the same way
const audioStream = await voice.speak("Hello from AI SDK!");

// Provider-specific options can be passed through
const audioWithOptions = await voice.speak("Hello with options!", {
  speaker: 'Rachel', // ElevenLabs voice
  providerOptions: {
    elevenlabs: {
      stability: 0.5,
      similarity_boost: 0.75,
    }
  }
});
```

See the [CompositeVoice reference](https://mastra.ai/reference/v1/voice/composite-voice) for more details on AI SDK integration.

## Realtime Voice Providers

When using realtime voice providers like `OpenAIRealtimeVoice`, the `speak()` method behaves differently:

- Instead of returning an audio stream, it emits a 'speaking' event with the audio data
- You need to register an event listener to receive the audio chunks

```typescript
import { OpenAIRealtimeVoice } from "@mastra/voice-openai-realtime";
import Speaker from "@mastra/node-speaker";

const speaker = new Speaker({
  sampleRate: 24100, // Audio sample rate in Hz - standard for high-quality audio on MacBook Pro
  channels: 1, // Mono audio output (as opposed to stereo which would be 2)
  bitDepth: 16, // Bit depth for audio quality - CD quality standard (16-bit resolution)
});

const voice = new OpenAIRealtimeVoice();
await voice.connect();

// Register event listener for audio chunks
voice.on("speaker", (stream) => {
  // Handle audio chunk (e.g., play it or save it)
  stream.pipe(speaker);
});

// This will emit 'speaking' events instead of returning a stream
await voice.speak("Hello, this is realtime speech!");
```

## Notes

- The behavior of `speak()` may vary slightly between providers, but all implementations follow the same basic interface.
- When using a realtime voice provider, the method might not return an audio stream directly but instead emit a 'speaking' event.
- If a text stream is provided as input, the provider will typically convert it to a string before processing.
- The audio format of the returned stream depends on the provider. Common formats include MP3, WAV, and OGG.
- For best performance, consider closing or ending the audio stream when you're done with it.

---

## Reference: voice.updateConfig()

> Documentation for the updateConfig() method available in voice providers, which updates the configuration of a voice provider at runtime.

The `updateConfig()` method allows you to update the configuration of a voice provider at runtime. This is useful for changing voice settings, API keys, or other provider-specific options without creating a new instance.

## Usage Example

```typescript
import { OpenAIRealtimeVoice } from "@mastra/voice-openai-realtime";

// Initialize a real-time voice provider
const voice = new OpenAIRealtimeVoice({
  realtimeConfig: {
    model: "gpt-5.1-realtime",
    apiKey: process.env.OPENAI_API_KEY,
  },
  speaker: "alloy",
});

// Connect to the real-time service
await voice.connect();

// Later, update the configuration
voice.updateConfig({
  voice: "nova", // Change the default voice
  turn_detection: {
    type: "server_vad",
    threshold: 0.5,
    silence_duration_ms: 1000,
  },
});

// The next speak() call will use the new configuration
await voice.speak("Hello with my new voice!");
```

## Parameters

<br />

## Return Value

This method does not return a value.

## Configuration Options

Different voice providers support different configuration options:

### OpenAI Realtime

<br />

## Notes

- The default implementation logs a warning if the provider doesn't support this method
- Configuration updates are typically applied to subsequent operations, not ongoing ones
- Not all properties that can be set in the constructor can be updated at runtime
- The specific behavior depends on the voice provider implementation
- For real-time voice providers, some configuration changes may require reconnecting to the service