@mastra/voice-google 0.12.0 → 0.12.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (125) hide show
  1. package/CHANGELOG.md +16 -0
  2. package/LICENSE.md +15 -0
  3. package/dist/_types/@internal_voice/dist/_types/@internal_ai-sdk-v5/dist/index.d.ts +8888 -0
  4. package/dist/_types/@internal_voice/dist/_types/@internal_core/dist/base/index.d.ts +31 -0
  5. package/dist/_types/@internal_voice/dist/_types/@internal_core/dist/logger/index.d.ts +217 -0
  6. package/dist/_types/@internal_voice/dist/_types/@internal_core/dist/request-context/index.d.ts +147 -0
  7. package/dist/_types/@internal_voice/dist/_types/@internal_core/dist/types/index.d.ts +3 -0
  8. package/dist/_types/@internal_voice/dist/_types/zod/v3/ZodError.d.ts +164 -0
  9. package/dist/_types/@internal_voice/dist/_types/zod/v3/errors.d.ts +5 -0
  10. package/dist/_types/@internal_voice/dist/_types/zod/v3/external.d.ts +6 -0
  11. package/dist/_types/@internal_voice/dist/_types/zod/v3/helpers/enumUtil.d.ts +8 -0
  12. package/dist/_types/@internal_voice/dist/_types/zod/v3/helpers/errorUtil.d.ts +9 -0
  13. package/dist/_types/@internal_voice/dist/_types/zod/v3/helpers/parseUtil.d.ts +78 -0
  14. package/dist/_types/@internal_voice/dist/_types/zod/v3/helpers/partialUtil.d.ts +8 -0
  15. package/dist/_types/@internal_voice/dist/_types/zod/v3/helpers/typeAliases.d.ts +2 -0
  16. package/dist/_types/@internal_voice/dist/_types/zod/v3/helpers/util.d.ts +85 -0
  17. package/dist/_types/@internal_voice/dist/_types/zod/v3/index.d.cts +4 -0
  18. package/dist/_types/@internal_voice/dist/_types/zod/v3/index.d.ts +4 -0
  19. package/dist/_types/@internal_voice/dist/_types/zod/v3/locales/en.d.ts +3 -0
  20. package/dist/_types/@internal_voice/dist/_types/zod/v3/standard-schema.d.ts +102 -0
  21. package/dist/_types/@internal_voice/dist/_types/zod/v3/types.d.ts +1034 -0
  22. package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/checks.d.ts +1 -0
  23. package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/coerce.d.ts +17 -0
  24. package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/compat.d.ts +50 -0
  25. package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/errors.d.ts +30 -0
  26. package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/external.d.ts +16 -0
  27. package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/from-json-schema.d.ts +12 -0
  28. package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/index.d.ts +4 -0
  29. package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/iso.d.ts +22 -0
  30. package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/parse.d.ts +31 -0
  31. package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/schemas.d.ts +767 -0
  32. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/api.d.ts +325 -0
  33. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/checks.d.ts +278 -0
  34. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/core.d.ts +70 -0
  35. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/doc.d.ts +14 -0
  36. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/errors.d.ts +221 -0
  37. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/index.d.ts +16 -0
  38. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/json-schema-generator.d.ts +65 -0
  39. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/json-schema-processors.d.ts +49 -0
  40. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/json-schema.d.ts +88 -0
  41. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/parse.d.ts +49 -0
  42. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/regexes.d.ts +85 -0
  43. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/registries.d.ts +35 -0
  44. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/schemas.d.ts +1184 -0
  45. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/standard-schema.d.ts +126 -0
  46. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/to-json-schema.d.ts +114 -0
  47. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/util.d.ts +200 -0
  48. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/versions.d.ts +5 -0
  49. package/dist/_types/@internal_voice/dist/_types/zod/v4/index.d.cts +3 -0
  50. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ar.d.ts +4 -0
  51. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/az.d.ts +4 -0
  52. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/be.d.ts +4 -0
  53. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/bg.d.ts +4 -0
  54. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ca.d.ts +4 -0
  55. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/cs.d.ts +4 -0
  56. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/da.d.ts +4 -0
  57. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/de.d.ts +4 -0
  58. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/el.d.ts +4 -0
  59. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/en.d.ts +4 -0
  60. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/eo.d.ts +4 -0
  61. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/es.d.ts +4 -0
  62. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/fa.d.ts +4 -0
  63. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/fi.d.ts +4 -0
  64. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/fr-CA.d.ts +4 -0
  65. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/fr.d.ts +4 -0
  66. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/he.d.ts +4 -0
  67. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/hr.d.ts +4 -0
  68. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/hu.d.ts +4 -0
  69. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/hy.d.ts +4 -0
  70. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/id.d.ts +4 -0
  71. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/index.d.ts +52 -0
  72. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/is.d.ts +4 -0
  73. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/it.d.ts +4 -0
  74. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ja.d.ts +4 -0
  75. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ka.d.ts +4 -0
  76. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/kh.d.ts +5 -0
  77. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/km.d.ts +4 -0
  78. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ko.d.ts +4 -0
  79. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/lt.d.ts +4 -0
  80. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/mk.d.ts +4 -0
  81. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ms.d.ts +4 -0
  82. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/nl.d.ts +4 -0
  83. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/no.d.ts +4 -0
  84. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ota.d.ts +4 -0
  85. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/pl.d.ts +4 -0
  86. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ps.d.ts +4 -0
  87. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/pt.d.ts +4 -0
  88. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ro.d.ts +4 -0
  89. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ru.d.ts +4 -0
  90. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/sl.d.ts +4 -0
  91. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/sv.d.ts +4 -0
  92. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ta.d.ts +4 -0
  93. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/th.d.ts +4 -0
  94. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/tr.d.ts +4 -0
  95. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ua.d.ts +5 -0
  96. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/uk.d.ts +4 -0
  97. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ur.d.ts +4 -0
  98. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/uz.d.ts +4 -0
  99. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/vi.d.ts +4 -0
  100. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/yo.d.ts +4 -0
  101. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/zh-CN.d.ts +4 -0
  102. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/zh-TW.d.ts +4 -0
  103. package/dist/_types/@internal_voice/dist/index.d.ts +16 -0
  104. package/dist/_types/@internal_voice/dist/voice/aisdk/index.d.ts +3 -0
  105. package/dist/_types/@internal_voice/dist/voice/aisdk/speech.d.ts +23 -0
  106. package/dist/_types/@internal_voice/dist/voice/aisdk/transcription.d.ts +22 -0
  107. package/dist/_types/@internal_voice/dist/voice/composite-voice.d.ts +72 -0
  108. package/dist/_types/@internal_voice/dist/voice/default-voice.d.ts +13 -0
  109. package/dist/_types/@internal_voice/dist/voice/index.d.ts +5 -0
  110. package/dist/_types/@internal_voice/dist/voice/voice.d.ts +172 -0
  111. package/dist/docs/SKILL.md +15 -21
  112. package/dist/docs/{SOURCE_MAP.json → assets/SOURCE_MAP.json} +1 -1
  113. package/dist/docs/references/docs-agents-adding-voice.md +381 -0
  114. package/dist/docs/references/docs-voice-overview.md +1250 -0
  115. package/dist/docs/{voice/02-reference.md → references/reference-voice-google.md} +98 -50
  116. package/dist/index.cjs +262 -2
  117. package/dist/index.cjs.map +1 -1
  118. package/dist/index.d.ts +1 -1
  119. package/dist/index.d.ts.map +1 -1
  120. package/dist/index.js +261 -1
  121. package/dist/index.js.map +1 -1
  122. package/package.json +12 -14
  123. package/dist/docs/README.md +0 -32
  124. package/dist/docs/agents/01-adding-voice.md +0 -352
  125. package/dist/docs/voice/01-overview.md +0 -1019
@@ -0,0 +1,381 @@
1
+ # Voice
2
+
3
+ Mastra agents can be enhanced with voice capabilities, allowing them to speak responses and listen to user input. You can configure an agent to use either a single voice provider or combine multiple providers for different operations.
4
+
5
+ ## Basic usage
6
+
7
+ The simplest way to add voice to an agent is to use a single provider for both speaking and listening:
8
+
9
+ ```typescript
10
+ import { createReadStream } from 'fs'
11
+ import path from 'path'
12
+ import { Agent } from '@mastra/core/agent'
13
+ import { OpenAIVoice } from '@mastra/voice-openai'
14
+
15
+ // Initialize the voice provider with default settings
16
+ const voice = new OpenAIVoice()
17
+
18
+ // Create an agent with voice capabilities
19
+ export const agent = new Agent({
20
+ id: 'voice-agent',
21
+ name: 'Voice Agent',
22
+ instructions: `You are a helpful assistant with both STT and TTS capabilities.`,
23
+ model: 'openai/gpt-5.5',
24
+ voice,
25
+ })
26
+
27
+ // The agent can now use voice for interaction
28
+ const audioStream = await agent.voice.speak("Hello, I'm your AI assistant!", {
29
+ filetype: 'm4a',
30
+ })
31
+
32
+ playAudio(audioStream!)
33
+
34
+ try {
35
+ const transcription = await agent.voice.listen(audioStream)
36
+ console.log(transcription)
37
+ } catch (error) {
38
+ console.error('Error transcribing audio:', error)
39
+ }
40
+ ```
41
+
42
+ ## Working with audio streams
43
+
44
+ The `speak()` and `listen()` methods work with Node.js streams. Here's how to save and load audio files:
45
+
46
+ ### Saving Speech Output
47
+
48
+ The `speak` method returns a stream that you can pipe to a file or speaker.
49
+
50
+ ```typescript
51
+ import { createWriteStream } from 'fs'
52
+ import path from 'path'
53
+
54
+ // Generate speech and save to file
55
+ const audio = await agent.voice.speak('Hello, World!')
56
+ const filePath = path.join(process.cwd(), 'agent.mp3')
57
+ const writer = createWriteStream(filePath)
58
+
59
+ audio.pipe(writer)
60
+
61
+ await new Promise<void>((resolve, reject) => {
62
+ writer.on('finish', () => resolve())
63
+ writer.on('error', reject)
64
+ })
65
+ ```
66
+
67
+ ### Transcribing Audio Input
68
+
69
+ The `listen` method expects a stream of audio data from a microphone or file.
70
+
71
+ ```typescript
72
+ import { createReadStream } from 'fs'
73
+ import path from 'path'
74
+
75
+ // Read audio file and transcribe
76
+ const audioFilePath = path.join(process.cwd(), '/agent.m4a')
77
+ const audioStream = createReadStream(audioFilePath)
78
+
79
+ try {
80
+ console.log('Transcribing audio file...')
81
+ const transcription = await agent.voice.listen(audioStream, {
82
+ filetype: 'm4a',
83
+ })
84
+ console.log('Transcription:', transcription)
85
+ } catch (error) {
86
+ console.error('Error transcribing audio:', error)
87
+ }
88
+ ```
89
+
90
+ ## Speech-to-speech voice interactions
91
+
92
+ For more dynamic and interactive voice experiences, you can use real-time voice providers that support speech-to-speech capabilities:
93
+
94
+ ```typescript
95
+ import { Agent } from '@mastra/core/agent'
96
+ import { getMicrophoneStream } from '@mastra/node-audio'
97
+ import { OpenAIRealtimeVoice } from '@mastra/voice-openai-realtime'
98
+ import { search, calculate } from '../tools'
99
+
100
+ // Initialize the realtime voice provider
101
+ const voice = new OpenAIRealtimeVoice({
102
+ apiKey: process.env.OPENAI_API_KEY,
103
+ model: 'gpt-5.1-realtime',
104
+ speaker: 'alloy',
105
+ })
106
+
107
+ // Create an agent with speech-to-speech voice capabilities
108
+ export const agent = new Agent({
109
+ id: 'speech-to-speech-agent',
110
+ name: 'Speech-to-Speech Agent',
111
+ instructions: `You are a helpful assistant with speech-to-speech capabilities.`,
112
+ model: 'openai/gpt-5.5',
113
+ tools: {
114
+ // Tools configured on Agent are passed to voice provider
115
+ search,
116
+ calculate,
117
+ },
118
+ voice,
119
+ })
120
+
121
+ // Establish a WebSocket connection
122
+ await agent.voice.connect()
123
+
124
+ // Start a conversation
125
+ agent.voice.speak("Hello, I'm your AI assistant!")
126
+
127
+ // Stream audio from a microphone
128
+ const microphoneStream = getMicrophoneStream()
129
+ agent.voice.send(microphoneStream)
130
+
131
+ // When done with the conversation
132
+ agent.voice.close()
133
+ ```
134
+
135
+ ### Per-session voice for concurrent sessions
136
+
137
+ A static `voice` instance is shared across every request. For one-shot text-to-speech this is fine, but realtime and speech-to-speech providers store one WebSocket, one set of tools, and one request context per instance. If you deploy a single agent that handles several live sessions at once, a shared instance lets one session overwrite another session's tools, instructions, and request context.
138
+
139
+ To give each session its own voice, provide `voice` as a resolver. Mastra runs the resolver on every `getVoice()` call and returns a fresh, session-owned instance:
140
+
141
+ ```typescript
142
+ import { Agent } from '@mastra/core/agent'
143
+ import { OpenAIRealtimeVoice } from '@mastra/voice-openai-realtime'
144
+
145
+ export const agent = new Agent({
146
+ id: 'support-line',
147
+ name: 'Support Line',
148
+ instructions: ({ requestContext }) => `Help user ${requestContext.get('user')}.`,
149
+ model: 'openai/gpt-5.5',
150
+ voice: ({ requestContext }) => new OpenAIRealtimeVoice({ apiKey: requestContext.get('apiKey') }),
151
+ })
152
+
153
+ // Each concurrent session resolves its own voice instance
154
+ const voice = await agent.getVoice({ requestContext })
155
+ await voice.connect()
156
+ ```
157
+
158
+ When you use a resolver:
159
+
160
+ - Each call to `getVoice()` returns a new instance, so concurrent sessions never share state.
161
+ - Mastra does not add tools or instructions to a resolver instance. Configure those inside the resolver or on the provider.
162
+ - You own the lifecycle of the returned instance, so call `disconnect()` or `close()` when the session ends.
163
+
164
+ The `agent.voice` getter has no request context, so it throws when `voice` is a resolver. Use `agent.getVoice({ requestContext })` instead.
165
+
166
+ ### Event System
167
+
168
+ The realtime voice provider emits several events you can listen for:
169
+
170
+ ```typescript
171
+ // Listen for speech audio data sent from voice provider
172
+ agent.voice.on('speaking', ({ audio }) => {
173
+ // audio contains ReadableStream or Int16Array audio data
174
+ })
175
+
176
+ // Listen for transcribed text sent from both voice provider and user
177
+ agent.voice.on('writing', ({ text, role }) => {
178
+ console.log(`${role} said: ${text}`)
179
+ })
180
+
181
+ // Listen for errors
182
+ agent.voice.on('error', error => {
183
+ console.error('Voice error:', error)
184
+ })
185
+ ```
186
+
187
+ ## Examples
188
+
189
+ ### End-to-end voice interaction
190
+
191
+ This example demonstrates a voice interaction between two agents. The hybrid voice agent, which uses multiple providers, speaks a question, which is saved as an audio file. The unified voice agent listens to that file, processes the question, generates a response, and speaks it back. Both audio outputs are saved to the `audio` directory.
192
+
193
+ The following files are created:
194
+
195
+ - **hybrid-question.mp3** – Hybrid agent's spoken question.
196
+ - **unified-response.mp3** – Unified agent's spoken response.
197
+
198
+ ```typescript
199
+ import 'dotenv/config'
200
+
201
+ import path from 'path'
202
+ import fs, { createReadStream, createWriteStream } from 'fs'
203
+ import { Agent } from '@mastra/core/agent'
204
+ import { CompositeVoice } from '@mastra/core/voice'
205
+ import { OpenAIVoice } from '@mastra/voice-openai'
206
+ import { Mastra } from '@mastra/core'
207
+
208
+ // Saves an audio stream to a file in the audio directory, creating the directory if it doesn't exist.
209
+ export const saveAudioToFile = async (
210
+ audio: NodeJS.ReadableStream,
211
+ filename: string,
212
+ ): Promise<void> => {
213
+ const audioDir = path.join(process.cwd(), 'audio')
214
+ const filePath = path.join(audioDir, filename)
215
+
216
+ await fs.promises.mkdir(audioDir, { recursive: true })
217
+
218
+ const writer = createWriteStream(filePath)
219
+ audio.pipe(writer)
220
+ return new Promise((resolve, reject) => {
221
+ writer.on('finish', resolve)
222
+ writer.on('error', reject)
223
+ })
224
+ }
225
+
226
+ // Converts the input to text: returns a string as-is, or reads a stream to completion and decodes it as UTF-8.
227
+ export const convertToText = async (input: string | NodeJS.ReadableStream): Promise<string> => {
228
+ if (typeof input === 'string') {
229
+ return input
230
+ }
231
+
232
+ const chunks: Buffer[] = []
233
+ return new Promise((resolve, reject) => {
234
+ input.on('data', chunk => chunks.push(Buffer.from(chunk)))
234
+ input.on('error', reject)
235
+ input.on('end', () => resolve(Buffer.concat(chunks).toString('utf-8')))
237
+ })
238
+ }
239
+
240
+ export const hybridVoiceAgent = new Agent({
241
+ id: 'hybrid-voice-agent',
242
+ name: 'Hybrid Voice Agent',
243
+ model: 'openai/gpt-5.5',
244
+ instructions: 'You can speak and listen using different providers.',
245
+ voice: new CompositeVoice({
246
+ input: new OpenAIVoice(),
247
+ output: new OpenAIVoice(),
248
+ }),
249
+ })
250
+
251
+ export const unifiedVoiceAgent = new Agent({
252
+ id: 'unified-voice-agent',
253
+ name: 'Unified Voice Agent',
254
+ instructions: 'You are an agent with both STT and TTS capabilities.',
255
+ model: 'openai/gpt-5.5',
256
+ voice: new OpenAIVoice(),
257
+ })
258
+
259
+ export const mastra = new Mastra({
260
+ agents: { hybridVoiceAgent, unifiedVoiceAgent },
261
+ })
262
+
263
+ const hybridVoiceAgent = mastra.getAgent('hybridVoiceAgent')
264
+ const unifiedVoiceAgent = mastra.getAgent('unifiedVoiceAgent')
265
+
266
+ const question = 'What is the meaning of life in one sentence?'
267
+
268
+ const hybridSpoken = await hybridVoiceAgent.voice.speak(question)
269
+
270
+ await saveAudioToFile(hybridSpoken!, 'hybrid-question.mp3')
271
+
272
+ const audioStream = createReadStream(path.join(process.cwd(), 'audio', 'hybrid-question.mp3'))
273
+ const unifiedHeard = await unifiedVoiceAgent.voice.listen(audioStream)
274
+
275
+ const inputText = await convertToText(unifiedHeard!)
276
+
277
+ const unifiedResponse = await unifiedVoiceAgent.generate(inputText)
278
+ const unifiedSpoken = await unifiedVoiceAgent.voice.speak(unifiedResponse.text)
279
+
280
+ await saveAudioToFile(unifiedSpoken!, 'unified-response.mp3')
281
+ ```
282
+
283
+ ### Using Multiple Providers
284
+
285
+ For more flexibility, you can use different providers for speaking and listening by using the `CompositeVoice` class:
286
+
287
+ ```typescript
288
+ import { Agent } from '@mastra/core/agent'
289
+ import { CompositeVoice } from '@mastra/core/voice'
290
+ import { OpenAIVoice } from '@mastra/voice-openai'
291
+ import { PlayAIVoice } from '@mastra/voice-playai'
292
+
293
+ export const agent = new Agent({
294
+ id: 'voice-agent',
295
+ name: 'Voice Agent',
296
+ instructions: `You are a helpful assistant with both STT and TTS capabilities.`,
297
+ model: 'openai/gpt-5.5',
298
+
299
+ // Create a composite voice using OpenAI for listening and PlayAI for speaking
300
+ voice: new CompositeVoice({
301
+ input: new OpenAIVoice(),
302
+ output: new PlayAIVoice(),
303
+ }),
304
+ })
305
+ ```
306
+
307
+ ### Using AI SDK
308
+
309
+ Mastra supports using AI SDK's transcription and speech models directly in `CompositeVoice`, giving you access to a wide range of providers through the AI SDK ecosystem:
310
+
311
+ ```typescript
312
+ import { Agent } from '@mastra/core/agent'
313
+ import { CompositeVoice } from '@mastra/core/voice'
314
+ import { openai } from '@ai-sdk/openai'
315
+ import { elevenlabs } from '@ai-sdk/elevenlabs'
316
+ import { groq } from '@ai-sdk/groq'
317
+
318
+ export const agent = new Agent({
319
+ id: 'aisdk-voice-agent',
320
+ name: 'AI SDK Voice Agent',
321
+ instructions: `You are a helpful assistant with voice capabilities.`,
322
+ model: 'openai/gpt-5.5',
323
+
324
+ // Pass AI SDK models directly to CompositeVoice
325
+ voice: new CompositeVoice({
326
+ input: openai.transcription('whisper-1'), // AI SDK transcription model
327
+ output: elevenlabs.speech('eleven_turbo_v2'), // AI SDK speech model
328
+ }),
329
+ })
330
+
331
+ // Use voice capabilities as usual
332
+ const audioStream = await agent.voice.speak('Hello!')
333
+ const transcribedText = await agent.voice.listen(audioStream)
334
+ ```
335
+
336
+ #### Mix and Match Providers
337
+
338
+ You can mix AI SDK models with Mastra voice providers:
339
+
340
+ ```typescript
341
+ import { CompositeVoice } from '@mastra/core/voice'
342
+ import { PlayAIVoice } from '@mastra/voice-playai'
343
+ import { openai } from '@ai-sdk/openai'
344
+
345
+ // Use AI SDK for transcription and Mastra provider for speech
346
+ const voice = new CompositeVoice({
347
+ input: openai.transcription('whisper-1'), // AI SDK
348
+ output: new PlayAIVoice(), // Mastra provider
349
+ })
350
+ ```
351
+
352
+ For the complete list of supported AI SDK providers and their capabilities:
353
+
354
+ - [Transcription](https://ai-sdk.dev/docs/providers/openai/transcription)
355
+ - [Speech](https://ai-sdk.dev/docs/providers/elevenlabs/speech)
356
+
357
+ ## Supported voice providers
358
+
359
+ Mastra supports multiple voice providers for text-to-speech (TTS) and speech-to-text (STT) capabilities:
360
+
361
+ | Provider | Package | Features | Reference |
362
+ | --------------- | ------------------------------- | ----------------------------------------- | ------------------------------------------------------------------ |
363
+ | OpenAI | `@mastra/voice-openai` | TTS, STT | [Documentation](https://mastra.ai/reference/voice/openai) |
364
+ | OpenAI Realtime | `@mastra/voice-openai-realtime` | Realtime speech-to-speech | [Documentation](https://mastra.ai/reference/voice/openai-realtime) |
365
+ | AWS Nova Sonic | `@mastra/voice-aws-nova-sonic` | Realtime speech-to-speech via AWS Bedrock | [Documentation](https://mastra.ai/reference/voice/aws-nova-sonic) |
366
+ | ElevenLabs | `@mastra/voice-elevenlabs` | High-quality TTS | [Documentation](https://mastra.ai/reference/voice/elevenlabs) |
367
+ | PlayAI | `@mastra/voice-playai` | TTS | [Documentation](https://mastra.ai/reference/voice/playai) |
368
+ | Google | `@mastra/voice-google` | TTS, STT | [Documentation](https://mastra.ai/reference/voice/google) |
369
+ | Deepgram | `@mastra/voice-deepgram` | STT | [Documentation](https://mastra.ai/reference/voice/deepgram) |
370
+ | Murf | `@mastra/voice-murf` | TTS | [Documentation](https://mastra.ai/reference/voice/murf) |
371
+ | Speechify | `@mastra/voice-speechify` | TTS | [Documentation](https://mastra.ai/reference/voice/speechify) |
372
+ | Sarvam | `@mastra/voice-sarvam` | TTS, STT | [Documentation](https://mastra.ai/reference/voice/sarvam) |
373
+ | Azure | `@mastra/voice-azure` | TTS, STT | [Documentation](https://mastra.ai/reference/voice/mastra-voice) |
374
+ | Cloudflare | `@mastra/voice-cloudflare` | TTS | [Documentation](https://mastra.ai/reference/voice/mastra-voice) |
375
+
376
+ ## Next steps
377
+
378
+ - [Voice API Reference](https://mastra.ai/reference/voice/mastra-voice): Detailed API documentation for voice capabilities
379
+ - [Text to Speech Examples](https://github.com/mastra-ai/voice-examples/tree/main/text-to-speech): Interactive story generator and other TTS implementations
380
+ - [Speech to Text Examples](https://github.com/mastra-ai/voice-examples/tree/main/speech-to-text): Voice memo app and other STT implementations
381
+ - [Speech to Speech Examples](https://github.com/mastra-ai/voice-examples/tree/main/speech-to-speech): Real-time voice conversation with call analysis