@mastra/voice-google 0.12.0 → 0.12.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (125) hide show
  1. package/CHANGELOG.md +16 -0
  2. package/LICENSE.md +15 -0
  3. package/dist/_types/@internal_voice/dist/_types/@internal_ai-sdk-v5/dist/index.d.ts +8888 -0
  4. package/dist/_types/@internal_voice/dist/_types/@internal_core/dist/base/index.d.ts +31 -0
  5. package/dist/_types/@internal_voice/dist/_types/@internal_core/dist/logger/index.d.ts +217 -0
  6. package/dist/_types/@internal_voice/dist/_types/@internal_core/dist/request-context/index.d.ts +147 -0
  7. package/dist/_types/@internal_voice/dist/_types/@internal_core/dist/types/index.d.ts +3 -0
  8. package/dist/_types/@internal_voice/dist/_types/zod/v3/ZodError.d.ts +164 -0
  9. package/dist/_types/@internal_voice/dist/_types/zod/v3/errors.d.ts +5 -0
  10. package/dist/_types/@internal_voice/dist/_types/zod/v3/external.d.ts +6 -0
  11. package/dist/_types/@internal_voice/dist/_types/zod/v3/helpers/enumUtil.d.ts +8 -0
  12. package/dist/_types/@internal_voice/dist/_types/zod/v3/helpers/errorUtil.d.ts +9 -0
  13. package/dist/_types/@internal_voice/dist/_types/zod/v3/helpers/parseUtil.d.ts +78 -0
  14. package/dist/_types/@internal_voice/dist/_types/zod/v3/helpers/partialUtil.d.ts +8 -0
  15. package/dist/_types/@internal_voice/dist/_types/zod/v3/helpers/typeAliases.d.ts +2 -0
  16. package/dist/_types/@internal_voice/dist/_types/zod/v3/helpers/util.d.ts +85 -0
  17. package/dist/_types/@internal_voice/dist/_types/zod/v3/index.d.cts +4 -0
  18. package/dist/_types/@internal_voice/dist/_types/zod/v3/index.d.ts +4 -0
  19. package/dist/_types/@internal_voice/dist/_types/zod/v3/locales/en.d.ts +3 -0
  20. package/dist/_types/@internal_voice/dist/_types/zod/v3/standard-schema.d.ts +102 -0
  21. package/dist/_types/@internal_voice/dist/_types/zod/v3/types.d.ts +1034 -0
  22. package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/checks.d.ts +1 -0
  23. package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/coerce.d.ts +17 -0
  24. package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/compat.d.ts +50 -0
  25. package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/errors.d.ts +30 -0
  26. package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/external.d.ts +16 -0
  27. package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/from-json-schema.d.ts +12 -0
  28. package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/index.d.ts +4 -0
  29. package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/iso.d.ts +22 -0
  30. package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/parse.d.ts +31 -0
  31. package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/schemas.d.ts +767 -0
  32. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/api.d.ts +325 -0
  33. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/checks.d.ts +278 -0
  34. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/core.d.ts +70 -0
  35. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/doc.d.ts +14 -0
  36. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/errors.d.ts +221 -0
  37. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/index.d.ts +16 -0
  38. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/json-schema-generator.d.ts +65 -0
  39. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/json-schema-processors.d.ts +49 -0
  40. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/json-schema.d.ts +88 -0
  41. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/parse.d.ts +49 -0
  42. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/regexes.d.ts +85 -0
  43. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/registries.d.ts +35 -0
  44. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/schemas.d.ts +1184 -0
  45. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/standard-schema.d.ts +126 -0
  46. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/to-json-schema.d.ts +114 -0
  47. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/util.d.ts +200 -0
  48. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/versions.d.ts +5 -0
  49. package/dist/_types/@internal_voice/dist/_types/zod/v4/index.d.cts +3 -0
  50. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ar.d.ts +4 -0
  51. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/az.d.ts +4 -0
  52. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/be.d.ts +4 -0
  53. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/bg.d.ts +4 -0
  54. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ca.d.ts +4 -0
  55. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/cs.d.ts +4 -0
  56. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/da.d.ts +4 -0
  57. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/de.d.ts +4 -0
  58. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/el.d.ts +4 -0
  59. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/en.d.ts +4 -0
  60. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/eo.d.ts +4 -0
  61. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/es.d.ts +4 -0
  62. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/fa.d.ts +4 -0
  63. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/fi.d.ts +4 -0
  64. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/fr-CA.d.ts +4 -0
  65. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/fr.d.ts +4 -0
  66. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/he.d.ts +4 -0
  67. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/hr.d.ts +4 -0
  68. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/hu.d.ts +4 -0
  69. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/hy.d.ts +4 -0
  70. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/id.d.ts +4 -0
  71. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/index.d.ts +52 -0
  72. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/is.d.ts +4 -0
  73. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/it.d.ts +4 -0
  74. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ja.d.ts +4 -0
  75. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ka.d.ts +4 -0
  76. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/kh.d.ts +5 -0
  77. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/km.d.ts +4 -0
  78. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ko.d.ts +4 -0
  79. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/lt.d.ts +4 -0
  80. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/mk.d.ts +4 -0
  81. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ms.d.ts +4 -0
  82. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/nl.d.ts +4 -0
  83. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/no.d.ts +4 -0
  84. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ota.d.ts +4 -0
  85. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/pl.d.ts +4 -0
  86. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ps.d.ts +4 -0
  87. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/pt.d.ts +4 -0
  88. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ro.d.ts +4 -0
  89. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ru.d.ts +4 -0
  90. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/sl.d.ts +4 -0
  91. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/sv.d.ts +4 -0
  92. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ta.d.ts +4 -0
  93. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/th.d.ts +4 -0
  94. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/tr.d.ts +4 -0
  95. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ua.d.ts +5 -0
  96. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/uk.d.ts +4 -0
  97. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ur.d.ts +4 -0
  98. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/uz.d.ts +4 -0
  99. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/vi.d.ts +4 -0
  100. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/yo.d.ts +4 -0
  101. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/zh-CN.d.ts +4 -0
  102. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/zh-TW.d.ts +4 -0
  103. package/dist/_types/@internal_voice/dist/index.d.ts +16 -0
  104. package/dist/_types/@internal_voice/dist/voice/aisdk/index.d.ts +3 -0
  105. package/dist/_types/@internal_voice/dist/voice/aisdk/speech.d.ts +23 -0
  106. package/dist/_types/@internal_voice/dist/voice/aisdk/transcription.d.ts +22 -0
  107. package/dist/_types/@internal_voice/dist/voice/composite-voice.d.ts +72 -0
  108. package/dist/_types/@internal_voice/dist/voice/default-voice.d.ts +13 -0
  109. package/dist/_types/@internal_voice/dist/voice/index.d.ts +5 -0
  110. package/dist/_types/@internal_voice/dist/voice/voice.d.ts +172 -0
  111. package/dist/docs/SKILL.md +15 -21
  112. package/dist/docs/{SOURCE_MAP.json → assets/SOURCE_MAP.json} +1 -1
  113. package/dist/docs/references/docs-agents-adding-voice.md +381 -0
  114. package/dist/docs/references/docs-voice-overview.md +1250 -0
  115. package/dist/docs/{voice/02-reference.md → references/reference-voice-google.md} +98 -50
  116. package/dist/index.cjs +262 -2
  117. package/dist/index.cjs.map +1 -1
  118. package/dist/index.d.ts +1 -1
  119. package/dist/index.d.ts.map +1 -1
  120. package/dist/index.js +261 -1
  121. package/dist/index.js.map +1 -1
  122. package/package.json +12 -14
  123. package/dist/docs/README.md +0 -32
  124. package/dist/docs/agents/01-adding-voice.md +0 -352
  125. package/dist/docs/voice/01-overview.md +0 -1019
@@ -0,0 +1,1250 @@
1
+ # Voice in Mastra
2
+
3
+ Mastra's Voice system provides a unified interface for voice interactions, enabling text-to-speech (TTS), speech-to-text (STT), and real-time speech-to-speech (STS) capabilities in your applications.
4
+
5
+ ## Adding voice to agents
6
+
7
+ To learn how to integrate voice capabilities into your agents, check out the [Adding Voice to Agents](https://mastra.ai/docs/agents/adding-voice) documentation. That guide covers how to use both single and multiple voice providers, as well as real-time interactions.
8
+
9
+ ```typescript
10
+ import { Agent } from '@mastra/core/agent'
11
+ import { OpenAIVoice } from '@mastra/voice-openai'
12
+
13
+ // Initialize OpenAI voice for TTS
14
+
15
+ const voiceAgent = new Agent({
16
+ id: 'voice-agent',
17
+ name: 'Voice Agent',
18
+ instructions: 'You are a voice assistant that can help users with their tasks.',
19
+ model: 'openai/gpt-5.5',
20
+ voice: new OpenAIVoice(),
21
+ })
22
+ ```
23
+
24
+ You can then use the following voice capabilities:
25
+
26
+ ### Text to Speech (TTS)
27
+
28
+ Turn your agent's responses into natural-sounding speech using Mastra's TTS capabilities. Choose from multiple providers like OpenAI, ElevenLabs, and more.
29
+
30
+ For detailed configuration options and advanced features, check out our [Text-to-Speech guide](https://mastra.ai/docs/voice/text-to-speech).
31
+
32
+ **OpenAI**:
33
+
34
+ ```typescript
35
+ import { Agent } from '@mastra/core/agent'
36
+ import { OpenAIVoice } from '@mastra/voice-openai'
37
+ import { playAudio } from '@mastra/node-audio'
38
+
39
+ const voiceAgent = new Agent({
40
+ id: 'voice-agent',
41
+ name: 'Voice Agent',
42
+ instructions: 'You are a voice assistant that can help users with their tasks.',
43
+ model: 'openai/gpt-5.5',
44
+ voice: new OpenAIVoice(),
45
+ })
46
+
47
+ const { text } = await voiceAgent.generate('What color is the sky?')
48
+
49
+ // Convert text to speech to an Audio Stream
50
+ const audioStream = await voiceAgent.voice.speak(text, {
51
+ speaker: 'default', // Optional: specify a speaker
52
+ responseFormat: 'wav', // Optional: specify a response format
53
+ })
54
+
55
+ playAudio(audioStream)
56
+ ```
57
+
58
+ Visit the [OpenAI Voice Reference](https://mastra.ai/reference/voice/openai) for more information on the OpenAI voice provider.
59
+
60
+ **Azure**:
61
+
62
+ ```typescript
63
+ import { Agent } from '@mastra/core/agent'
64
+ import { AzureVoice } from '@mastra/voice-azure'
65
+ import { playAudio } from '@mastra/node-audio'
66
+
67
+ const voiceAgent = new Agent({
68
+ id: 'voice-agent',
69
+ name: 'Voice Agent',
70
+ instructions: 'You are a voice assistant that can help users with their tasks.',
71
+ model: 'openai/gpt-5.5',
72
+ voice: new AzureVoice(),
73
+ })
74
+
75
+ const { text } = await voiceAgent.generate('What color is the sky?')
76
+
77
+ // Convert text to speech to an Audio Stream
78
+ const audioStream = await voiceAgent.voice.speak(text, {
79
+ speaker: 'en-US-JennyNeural', // Optional: specify a speaker
80
+ })
81
+
82
+ playAudio(audioStream)
83
+ ```
84
+
85
+ Visit the [Azure Voice Reference](https://mastra.ai/reference/voice/azure) for more information on the Azure voice provider.
86
+
87
+ **ElevenLabs**:
88
+
89
+ ```typescript
90
+ import { Agent } from '@mastra/core/agent'
91
+ import { ElevenLabsVoice } from '@mastra/voice-elevenlabs'
92
+ import { playAudio } from '@mastra/node-audio'
93
+
94
+ const voiceAgent = new Agent({
95
+ id: 'voice-agent',
96
+ name: 'Voice Agent',
97
+ instructions: 'You are a voice assistant that can help users with their tasks.',
98
+ model: 'openai/gpt-5.5',
99
+ voice: new ElevenLabsVoice(),
100
+ })
101
+
102
+ const { text } = await voiceAgent.generate('What color is the sky?')
103
+
104
+ // Convert text to speech to an Audio Stream
105
+ const audioStream = await voiceAgent.voice.speak(text, {
106
+ speaker: 'default', // Optional: specify a speaker
107
+ })
108
+
109
+ playAudio(audioStream)
110
+ ```
111
+
112
+ Visit the [ElevenLabs Voice Reference](https://mastra.ai/reference/voice/elevenlabs) for more information on the ElevenLabs voice provider.
113
+
114
+ **PlayAI**:
115
+
116
+ ```typescript
117
+ import { Agent } from '@mastra/core/agent'
118
+ import { PlayAIVoice } from '@mastra/voice-playai'
119
+ import { playAudio } from '@mastra/node-audio'
120
+
121
+ const voiceAgent = new Agent({
122
+ id: 'voice-agent',
123
+ name: 'Voice Agent',
124
+ instructions: 'You are a voice assistant that can help users with their tasks.',
125
+ model: 'openai/gpt-5.5',
126
+ voice: new PlayAIVoice(),
127
+ })
128
+
129
+ const { text } = await voiceAgent.generate('What color is the sky?')
130
+
131
+ // Convert text to speech to an Audio Stream
132
+ const audioStream = await voiceAgent.voice.speak(text, {
133
+ speaker: 'default', // Optional: specify a speaker
134
+ })
135
+
136
+ playAudio(audioStream)
137
+ ```
138
+
139
+ Visit the [PlayAI Voice Reference](https://mastra.ai/reference/voice/playai) for more information on the PlayAI voice provider.
140
+
141
+ **Google**:
142
+
143
+ ```typescript
144
+ import { Agent } from '@mastra/core/agent'
145
+ import { GoogleVoice } from '@mastra/voice-google'
146
+ import { playAudio } from '@mastra/node-audio'
147
+
148
+ const voiceAgent = new Agent({
149
+ id: 'voice-agent',
150
+ name: 'Voice Agent',
151
+ instructions: 'You are a voice assistant that can help users with their tasks.',
152
+ model: 'openai/gpt-5.5',
153
+ voice: new GoogleVoice(),
154
+ })
155
+
156
+ const { text } = await voiceAgent.generate('What color is the sky?')
157
+
158
+ // Convert text to speech to an Audio Stream
159
+ const audioStream = await voiceAgent.voice.speak(text, {
160
+ speaker: 'en-US-Studio-O', // Optional: specify a speaker
161
+ })
162
+
163
+ playAudio(audioStream)
164
+ ```
165
+
166
+ Visit the [Google Voice Reference](https://mastra.ai/reference/voice/google) for more information on the Google voice provider.
167
+
168
+ **Cloudflare**:
169
+
170
+ ```typescript
171
+ import { Agent } from '@mastra/core/agent'
172
+ import { CloudflareVoice } from '@mastra/voice-cloudflare'
173
+ import { playAudio } from '@mastra/node-audio'
174
+
175
+ const voiceAgent = new Agent({
176
+ id: 'voice-agent',
177
+ name: 'Voice Agent',
178
+ instructions: 'You are a voice assistant that can help users with their tasks.',
179
+ model: 'openai/gpt-5.5',
180
+ voice: new CloudflareVoice(),
181
+ })
182
+
183
+ const { text } = await voiceAgent.generate('What color is the sky?')
184
+
185
+ // Convert text to speech to an Audio Stream
186
+ const audioStream = await voiceAgent.voice.speak(text, {
187
+ speaker: 'default', // Optional: specify a speaker
188
+ })
189
+
190
+ playAudio(audioStream)
191
+ ```
192
+
193
+ Visit the [Cloudflare Voice Reference](https://mastra.ai/reference/voice/cloudflare) for more information on the Cloudflare voice provider.
194
+
195
+ **Deepgram**:
196
+
197
+ ```typescript
198
+ import { Agent } from '@mastra/core/agent'
199
+ import { DeepgramVoice } from '@mastra/voice-deepgram'
200
+ import { playAudio } from '@mastra/node-audio'
201
+
202
+ const voiceAgent = new Agent({
203
+ id: 'voice-agent',
204
+ name: 'Voice Agent',
205
+ instructions: 'You are a voice assistant that can help users with their tasks.',
206
+ model: 'openai/gpt-5.5',
207
+ voice: new DeepgramVoice(),
208
+ })
209
+
210
+ const { text } = await voiceAgent.generate('What color is the sky?')
211
+
212
+ // Convert text to speech to an Audio Stream
213
+ const audioStream = await voiceAgent.voice.speak(text, {
214
+ speaker: 'aura-english-us', // Optional: specify a speaker
215
+ })
216
+
217
+ playAudio(audioStream)
218
+ ```
219
+
220
+ Visit the [Deepgram Voice Reference](https://mastra.ai/reference/voice/deepgram) for more information on the Deepgram voice provider.
221
+
222
+ **Inworld**:
223
+
224
+ ```typescript
225
+ import { Agent } from '@mastra/core/agent'
226
+ import { InworldVoice } from '@mastra/voice-inworld'
227
+ import { playAudio } from '@mastra/node-audio'
228
+
229
+ const voiceAgent = new Agent({
230
+ id: 'voice-agent',
231
+ name: 'Voice Agent',
232
+ instructions: 'You are a voice assistant that can help users with their tasks.',
233
+ model: 'openai/gpt-5.5',
234
+ voice: new InworldVoice(),
235
+ })
236
+
237
+ const { text } = await voiceAgent.generate('What color is the sky?')
238
+
239
+ // Convert text to speech to an Audio Stream
240
+ const audioStream = await voiceAgent.voice.speak(text, {
241
+ speaker: 'Dennis', // Optional: specify a speaker
242
+ })
243
+
244
+ playAudio(audioStream)
245
+ ```
246
+
247
+ Visit the [Inworld Voice Reference](https://mastra.ai/reference/voice/inworld) for more information on the Inworld voice provider.
248
+
249
+ **Speechify**:
250
+
251
+ ```typescript
252
+ import { Agent } from '@mastra/core/agent'
253
+ import { SpeechifyVoice } from '@mastra/voice-speechify'
254
+ import { playAudio } from '@mastra/node-audio'
255
+
256
+ const voiceAgent = new Agent({
257
+ id: 'voice-agent',
258
+ name: 'Voice Agent',
259
+ instructions: 'You are a voice assistant that can help users with their tasks.',
260
+ model: 'openai/gpt-5.5',
261
+ voice: new SpeechifyVoice(),
262
+ })
263
+
264
+ const { text } = await voiceAgent.generate('What color is the sky?')
265
+
266
+ // Convert text to speech to an Audio Stream
267
+ const audioStream = await voiceAgent.voice.speak(text, {
268
+ speaker: 'matthew', // Optional: specify a speaker
269
+ })
270
+
271
+ playAudio(audioStream)
272
+ ```
273
+
274
+ Visit the [Speechify Voice Reference](https://mastra.ai/reference/voice/speechify) for more information on the Speechify voice provider.
275
+
276
+ **Sarvam**:
277
+
278
+ ```typescript
279
+ import { Agent } from '@mastra/core/agent'
280
+ import { SarvamVoice } from '@mastra/voice-sarvam'
281
+ import { playAudio } from '@mastra/node-audio'
282
+
283
+ const voiceAgent = new Agent({
284
+ id: 'voice-agent',
285
+ name: 'Voice Agent',
286
+ instructions: 'You are a voice assistant that can help users with their tasks.',
287
+ model: 'openai/gpt-5.5',
288
+ voice: new SarvamVoice(),
289
+ })
290
+
291
+ const { text } = await voiceAgent.generate('What color is the sky?')
292
+
293
+ // Convert text to speech to an Audio Stream
294
+ const audioStream = await voiceAgent.voice.speak(text, {
295
+ speaker: 'shubh', // Optional: specify a bulbul:v3 speaker
296
+ })
297
+
298
+ playAudio(audioStream)
299
+ ```
300
+
301
+ Visit the [Sarvam Voice Reference](https://mastra.ai/reference/voice/sarvam) for more information on the Sarvam voice provider.
302
+
303
+ **Murf**:
304
+
305
+ ```typescript
306
+ import { Agent } from '@mastra/core/agent'
307
+ import { MurfVoice } from '@mastra/voice-murf'
308
+ import { playAudio } from '@mastra/node-audio'
309
+
310
+ const voiceAgent = new Agent({
311
+ id: 'voice-agent',
312
+ name: 'Voice Agent',
313
+ instructions: 'You are a voice assistant that can help users with their tasks.',
314
+ model: 'openai/gpt-5.5',
315
+ voice: new MurfVoice(),
316
+ })
317
+
318
+ const { text } = await voiceAgent.generate('What color is the sky?')
319
+
320
+ // Convert text to speech to an Audio Stream
321
+ const audioStream = await voiceAgent.voice.speak(text, {
322
+ speaker: 'default', // Optional: specify a speaker
323
+ })
324
+
325
+ playAudio(audioStream)
326
+ ```
327
+
328
+ Visit the [Murf Voice Reference](https://mastra.ai/reference/voice/murf) for more information on the Murf voice provider.
329
+
330
+ ### Speech to Text (STT)
331
+
332
+ Transcribe spoken content using various providers like OpenAI, ElevenLabs, and more. For detailed configuration options and advanced features, check out [Speech to Text](https://mastra.ai/docs/voice/speech-to-text).
333
+
334
+ You can download a sample audio file from [here](https://github.com/mastra-ai/realtime-voice-demo/raw/refs/heads/main/how_can_i_help_you.mp3).
335
+
336
+ [how_can_i_help_you.mp3](https://github.com/mastra-ai/realtime-voice-demo/raw/refs/heads/main/how_can_i_help_you.mp3)
337
+
338
+ **OpenAI**:
339
+
340
+ ```typescript
341
+ import { Agent } from '@mastra/core/agent'
342
+ import { OpenAIVoice } from '@mastra/voice-openai'
343
+ import { createReadStream } from 'fs'
344
+
345
+ const voiceAgent = new Agent({
346
+ id: 'voice-agent',
347
+ name: 'Voice Agent',
348
+ instructions: 'You are a voice assistant that can help users with their tasks.',
349
+ model: 'openai/gpt-5.5',
350
+ voice: new OpenAIVoice(),
351
+ })
352
+
353
+ // Read the sample audio file from disk
354
+ const audioStream = await createReadStream('./how_can_i_help_you.mp3')
355
+
356
+ // Convert audio to text
357
+ const transcript = await voiceAgent.voice.listen(audioStream)
358
+ console.log(`User said: ${transcript}`)
359
+
360
+ // Generate a response based on the transcript
361
+ const { text } = await voiceAgent.generate(transcript)
362
+ ```
363
+
364
+ Visit the [OpenAI Voice Reference](https://mastra.ai/reference/voice/openai) for more information on the OpenAI voice provider.
365
+
366
+ **Azure**:
367
+
368
+ ```typescript
369
+ import { createReadStream } from 'fs'
370
+ import { Agent } from '@mastra/core/agent'
371
+ import { AzureVoice } from '@mastra/voice-azure'
372
+
373
+
374
+ const voiceAgent = new Agent({
375
+ id: 'voice-agent',
376
+ name: 'Voice Agent',
377
+ instructions: 'You are a voice assistant that can help users with their tasks.',
378
+ model: 'openai/gpt-5.5',
379
+ voice: new AzureVoice(),
380
+ })
381
+
382
+ // Read the sample audio file from disk
383
+ const audioStream = await createReadStream('./how_can_i_help_you.mp3')
384
+
385
+ // Convert audio to text
386
+ const transcript = await voiceAgent.voice.listen(audioStream)
387
+ console.log(`User said: ${transcript}`)
388
+
389
+ // Generate a response based on the transcript
390
+ const { text } = await voiceAgent.generate(transcript)
391
+ ```
392
+
393
+ Visit the [Azure Voice Reference](https://mastra.ai/reference/voice/azure) for more information on the Azure voice provider.
394
+
395
+ **ElevenLabs**:
396
+
397
+ ```typescript
398
+ import { Agent } from '@mastra/core/agent'
399
+ import { ElevenLabsVoice } from '@mastra/voice-elevenlabs'
400
+ import { createReadStream } from 'fs'
401
+
402
+ const voiceAgent = new Agent({
403
+ id: 'voice-agent',
404
+ name: 'Voice Agent',
405
+ instructions: 'You are a voice assistant that can help users with their tasks.',
406
+ model: 'openai/gpt-5.5',
407
+ voice: new ElevenLabsVoice(),
408
+ })
409
+
410
+ // Read the sample audio file from disk
411
+ const audioStream = await createReadStream('./how_can_i_help_you.mp3')
412
+
413
+ // Convert audio to text
414
+ const transcript = await voiceAgent.voice.listen(audioStream)
415
+ console.log(`User said: ${transcript}`)
416
+
417
+ // Generate a response based on the transcript
418
+ const { text } = await voiceAgent.generate(transcript)
419
+ ```
420
+
421
+ Visit the [ElevenLabs Voice Reference](https://mastra.ai/reference/voice/elevenlabs) for more information on the ElevenLabs voice provider.
422
+
423
+ **Google**:
424
+
425
+ ```typescript
426
+ import { Agent } from '@mastra/core/agent'
427
+ import { GoogleVoice } from '@mastra/voice-google'
428
+ import { createReadStream } from 'fs'
429
+
430
+ const voiceAgent = new Agent({
431
+ id: 'voice-agent',
432
+ name: 'Voice Agent',
433
+ instructions: 'You are a voice assistant that can help users with their tasks.',
434
+ model: 'openai/gpt-5.5',
435
+ voice: new GoogleVoice(),
436
+ })
437
+
438
+ // Read the sample audio file from disk
439
+ const audioStream = await createReadStream('./how_can_i_help_you.mp3')
440
+
441
+ // Convert audio to text
442
+ const transcript = await voiceAgent.voice.listen(audioStream)
443
+ console.log(`User said: ${transcript}`)
444
+
445
+ // Generate a response based on the transcript
446
+ const { text } = await voiceAgent.generate(transcript)
447
+ ```
448
+
449
+ Visit the [Google Voice Reference](https://mastra.ai/reference/voice/google) for more information on the Google voice provider.
450
+
451
+ **Cloudflare**:
452
+
453
+ ```typescript
454
+ import { Agent } from '@mastra/core/agent'
455
+ import { CloudflareVoice } from '@mastra/voice-cloudflare'
456
+ import { createReadStream } from 'fs'
457
+
458
+ const voiceAgent = new Agent({
459
+ id: 'voice-agent',
460
+ name: 'Voice Agent',
461
+ instructions: 'You are a voice assistant that can help users with their tasks.',
462
+ model: 'openai/gpt-5.5',
463
+ voice: new CloudflareVoice(),
464
+ })
465
+
466
+ // Read the sample audio file from disk
467
+ const audioStream = await createReadStream('./how_can_i_help_you.mp3')
468
+
469
+ // Convert audio to text
470
+ const transcript = await voiceAgent.voice.listen(audioStream)
471
+ console.log(`User said: ${transcript}`)
472
+
473
+ // Generate a response based on the transcript
474
+ const { text } = await voiceAgent.generate(transcript)
475
+ ```
476
+
477
+ Visit the [Cloudflare Voice Reference](https://mastra.ai/reference/voice/cloudflare) for more information on the Cloudflare voice provider.
478
+
479
+ **Deepgram**:
480
+
481
+ ```typescript
482
+ import { Agent } from '@mastra/core/agent'
483
+ import { DeepgramVoice } from '@mastra/voice-deepgram'
484
+ import { createReadStream } from 'fs'
485
+
486
+ const voiceAgent = new Agent({
487
+ id: 'voice-agent',
488
+ name: 'Voice Agent',
489
+ instructions: 'You are a voice assistant that can help users with their tasks.',
490
+ model: 'openai/gpt-5.5',
491
+ voice: new DeepgramVoice(),
492
+ })
493
+
494
+ // Read the sample audio file from disk
495
+ const audioStream = await createReadStream('./how_can_i_help_you.mp3')
496
+
497
+ // Convert audio to text
498
+ const transcript = await voiceAgent.voice.listen(audioStream)
499
+ console.log(`User said: ${transcript}`)
500
+
501
+ // Generate a response based on the transcript
502
+ const { text } = await voiceAgent.generate(transcript)
503
+ ```
504
+
505
+ Visit the [Deepgram Voice Reference](https://mastra.ai/reference/voice/deepgram) for more information on the Deepgram voice provider.
506
+
507
+ **Inworld**:
508
+
509
+ ```typescript
510
+ import { Agent } from '@mastra/core/agent'
511
+ import { InworldVoice } from '@mastra/voice-inworld'
512
+ import { createReadStream } from 'fs'
513
+
514
+ const voiceAgent = new Agent({
515
+ id: 'voice-agent',
516
+ name: 'Voice Agent',
517
+ instructions: 'You are a voice assistant that can help users with their tasks.',
518
+ model: 'openai/gpt-5.5',
519
+ voice: new InworldVoice(),
520
+ })
521
+
522
+ // Read the sample audio file from disk
523
+ const audioStream = await createReadStream('./how_can_i_help_you.mp3')
524
+
525
+ // Convert audio to text
526
+ const transcript = await voiceAgent.voice.listen(audioStream)
527
+ console.log(`User said: ${transcript}`)
528
+
529
+ // Generate a response based on the transcript
530
+ const { text } = await voiceAgent.generate(transcript)
531
+ ```
532
+
533
+ Visit the [Inworld Voice Reference](https://mastra.ai/reference/voice/inworld) for more information on the Inworld voice provider.
534
+
535
+ **Sarvam**:
536
+
537
+ ```typescript
538
+ import { Agent } from '@mastra/core/agent'
539
+ import { SarvamVoice } from '@mastra/voice-sarvam'
540
+ import { createReadStream } from 'fs'
541
+
542
+ const voiceAgent = new Agent({
543
+ id: 'voice-agent',
544
+ name: 'Voice Agent',
545
+ instructions: 'You are a voice assistant that can help users with their tasks.',
546
+ model: 'openai/gpt-5.5',
547
+ voice: new SarvamVoice(),
548
+ })
549
+
550
+ // Read the sample audio file from disk
551
+ const audioStream = await createReadStream('./how_can_i_help_you.mp3')
552
+
553
+ // Convert audio to text
554
+ const transcript = await voiceAgent.voice.listen(audioStream)
555
+ console.log(`User said: ${transcript}`)
556
+
557
+ // Generate a response based on the transcript
558
+ const { text } = await voiceAgent.generate(transcript)
559
+ ```
560
+
561
+ Visit the [Sarvam Voice Reference](https://mastra.ai/reference/voice/sarvam) for more information on the Sarvam voice provider.
562
+
563
+ ### Speech to Speech (STS)
564
+
565
+ Create conversational experiences with speech-to-speech capabilities. The unified API enables real-time voice interactions between users and AI agents. For detailed configuration options and advanced features, check out [Speech to Speech](https://mastra.ai/docs/voice/speech-to-speech).
566
+
567
+ **OpenAI**:
568
+
569
+ ```typescript
570
+ import { Agent } from '@mastra/core/agent'
571
+ import { playAudio, getMicrophoneStream } from '@mastra/node-audio'
572
+ import { OpenAIRealtimeVoice } from '@mastra/voice-openai-realtime'
573
+
574
+ const voiceAgent = new Agent({
575
+ id: 'voice-agent',
576
+ name: 'Voice Agent',
577
+ instructions: 'You are a voice assistant that can help users with their tasks.',
578
+ model: 'openai/gpt-5.5',
579
+ voice: new OpenAIRealtimeVoice(),
580
+ })
581
+
582
+ // Listen for agent audio responses
583
+ voiceAgent.voice.on('speaker', ({ audio }) => {
584
+ playAudio(audio)
585
+ })
586
+
587
+ // Initiate the conversation
588
+ await voiceAgent.voice.speak('How can I help you today?')
589
+
590
+ // Send continuous audio from the microphone
591
+ const micStream = getMicrophoneStream()
592
+ await voiceAgent.voice.send(micStream)
593
+ ```
594
+
595
+ Visit the [OpenAI Realtime Voice Reference](https://mastra.ai/reference/voice/openai-realtime) for more information on the OpenAI Realtime voice provider.
596
+
597
+ **Google**:
598
+
599
+ ```typescript
600
+ import { Agent } from '@mastra/core/agent'
601
+ import { playAudio, getMicrophoneStream } from '@mastra/node-audio'
602
+ import { GeminiLiveVoice } from '@mastra/voice-google-gemini-live'
603
+
604
+ const voiceAgent = new Agent({
605
+ id: 'voice-agent',
606
+ name: 'Voice Agent',
607
+ instructions: 'You are a voice assistant that can help users with their tasks.',
608
+ model: 'openai/gpt-5.5',
609
+ voice: new GeminiLiveVoice({
610
+ // Live API mode
611
+ apiKey: process.env.GOOGLE_API_KEY,
612
+ model: 'gemini-2.0-flash-exp',
613
+ speaker: 'Puck',
614
+ debug: true,
615
+ // Vertex AI alternative:
616
+ // vertexAI: true,
617
+ // project: 'your-gcp-project',
618
+ // location: 'us-central1',
619
+ // serviceAccountKeyFile: '/path/to/service-account.json',
620
+ }),
621
+ })
622
+
623
+ // Connect before using speak/send
624
+ await voiceAgent.voice.connect()
625
+
626
+ // Listen for agent audio responses
627
+ voiceAgent.voice.on('speaker', ({ audio }) => {
628
+ playAudio(audio)
629
+ })
630
+
631
+ // Listen for text responses and transcriptions
632
+ voiceAgent.voice.on('writing', ({ text, role }) => {
633
+ console.log(`${role}: ${text}`)
634
+ })
635
+
636
+ // Initiate the conversation
637
+ await voiceAgent.voice.speak('How can I help you today?')
638
+
639
+ // Send continuous audio from the microphone
640
+ const micStream = getMicrophoneStream()
641
+ await voiceAgent.voice.send(micStream)
642
+ ```
643
+
644
+ Visit the [Google Gemini Live Reference](https://mastra.ai/reference/voice/google-gemini-live) for more information on the Google Gemini Live voice provider.
645
+
646
+ **AWS Nova Sonic**:
647
+
648
+ ```typescript
649
+ import { Agent } from '@mastra/core/agent'
650
+ import { playAudio, getMicrophoneStream } from '@mastra/node-audio'
651
+ import { NovaSonicVoice } from '@mastra/voice-aws-nova-sonic'
652
+
653
+ const voiceAgent = new Agent({
654
+ id: 'voice-agent',
655
+ name: 'Voice Agent',
656
+ instructions: 'You are a voice assistant that can help users with their tasks.',
657
+ model: 'openai/gpt-5.5',
658
+ voice: new NovaSonicVoice({
659
+ region: 'us-east-1',
660
+ speaker: 'matthew',
661
+ // Static credentials are optional. The default AWS credential
662
+ // provider chain is used when none are passed.
663
+ }),
664
+ })
665
+
666
+ // Connect before using speak/send
667
+ await voiceAgent.voice.connect()
668
+
669
+ // Listen for assistant audio (Int16Array PCM)
670
+ voiceAgent.voice.on('speaking', ({ audioData }) => {
671
+ if (audioData) playAudio(audioData)
672
+ })
673
+
674
+ // Listen for transcribed text
675
+ voiceAgent.voice.on('writing', ({ text, role }) => {
676
+ console.log(`${role}: ${text}`)
677
+ })
678
+
679
+ // Initiate the conversation
680
+ await voiceAgent.voice.speak('How can I help you today?')
681
+
682
+ // Send continuous audio from the microphone
683
+ const micStream = getMicrophoneStream()
684
+ await voiceAgent.voice.send(micStream)
685
+ ```
686
+
687
+ Visit the [AWS Nova Sonic Reference](https://mastra.ai/reference/voice/aws-nova-sonic) for more information on the AWS Nova Sonic voice provider.
688
+
689
+ **Inworld Realtime**:
690
+
691
+ ```typescript
692
+ import { Agent } from '@mastra/core/agent'
693
+ import { playAudio, getMicrophoneStream } from '@mastra/node-audio'
694
+ import { InworldRealtimeVoice } from '@mastra/voice-inworld'
695
+
696
+ const voiceAgent = new Agent({
697
+ id: 'voice-agent',
698
+ name: 'Voice Agent',
699
+ instructions: 'You are a voice assistant that can help users with their tasks.',
700
+ model: 'openai/gpt-5.5',
701
+ voice: new InworldRealtimeVoice({
702
+ apiKey: process.env.INWORLD_API_KEY,
703
+ model: 'inworld/models/gemma-4-26b-a4b-it',
704
+ speaker: 'Sarah',
705
+ }),
706
+ })
707
+
708
+ // Connect before using speak/send
709
+ await voiceAgent.voice.connect()
710
+
711
+ // Listen for agent audio (PCM stream)
712
+ voiceAgent.voice.on('speaker', stream => {
713
+ playAudio(stream)
714
+ })
715
+
716
+ // Listen for text responses and transcriptions
717
+ voiceAgent.voice.on('writing', ({ text, role }) => {
718
+ console.log(`${role}: ${text}`)
719
+ })
720
+
721
+ // Initiate the conversation
722
+ await voiceAgent.voice.speak('How can I help you today?')
723
+
724
+ // Send continuous audio from the microphone
725
+ const micStream = getMicrophoneStream()
726
+ await voiceAgent.voice.send(micStream)
727
+ ```
728
+
729
+ Visit the [Inworld Realtime Reference](https://mastra.ai/reference/voice/inworld-realtime) for more information on the Inworld Realtime voice provider.
730
+
731
+ **xAI**:
732
+
733
+ ```typescript
734
+ import { Agent } from '@mastra/core/agent'
735
+ import { playAudio, getMicrophoneStream } from '@mastra/node-audio'
736
+ import { XAIRealtimeVoice } from '@mastra/voice-xai-realtime'
737
+
738
+ const voiceAgent = new Agent({
739
+ id: 'voice-agent',
740
+ name: 'Voice Agent',
741
+ instructions: 'You are a voice assistant that can help users with their tasks.',
742
+ model: 'xai/grok-4.3',
743
+ voice: new XAIRealtimeVoice({
744
+ apiKey: process.env.XAI_API_KEY,
745
+ model: 'grok-voice-think-fast-1.0',
746
+ speaker: 'eve',
747
+ turnDetection: { type: 'server_vad' },
748
+ }),
749
+ })
750
+
751
+ // Connect before using speak/send
752
+ await voiceAgent.voice.connect()
753
+
754
+ // Listen for agent audio responses
755
+ voiceAgent.voice.on('speaker', audioStream => {
756
+ playAudio(audioStream)
757
+ })
758
+
759
+ // Listen for text responses and transcriptions
760
+ voiceAgent.voice.on('writing', ({ text, role }) => {
761
+ console.log(`${role}: ${text}`)
762
+ })
763
+
764
+ // Initiate the conversation
765
+ await voiceAgent.voice.speak('How can I help you today?')
766
+
767
+ // Send continuous audio from the microphone
768
+ const micStream = getMicrophoneStream()
769
+ await voiceAgent.voice.send(micStream)
770
+ ```
771
+
772
+ Visit the [xAI Realtime Voice Reference](https://mastra.ai/reference/voice/xai-realtime) for more information on the xAI voice provider.
773
+
774
+ ## Voice configuration
775
+
776
+ Each voice provider can be configured with different models and options. Below are the detailed configuration options for all supported providers:
777
+
778
+ **OpenAI**:
779
+
780
+ ```typescript
781
+ // OpenAI Voice Configuration
782
+ const voice = new OpenAIVoice({
783
+ speechModel: {
784
+ name: 'gpt-3.5-turbo', // Example model name
785
+ apiKey: process.env.OPENAI_API_KEY,
786
+ language: 'en-US', // Language code
787
+ voiceType: 'neural', // Type of voice model
788
+ },
789
+ listeningModel: {
790
+ name: 'whisper-1', // Example model name
791
+ apiKey: process.env.OPENAI_API_KEY,
792
+ language: 'en-US', // Language code
793
+ format: 'wav', // Audio format
794
+ },
795
+ speaker: 'alloy', // Example speaker name
796
+ })
797
+ ```
798
+
799
+ Visit the [OpenAI Voice Reference](https://mastra.ai/reference/voice/openai) for more information on the OpenAI voice provider.
800
+
801
+ **Azure**:
802
+
803
+ ```typescript
804
+ // Azure Voice Configuration
805
+ const voice = new AzureVoice({
806
+ speechModel: {
807
+ name: 'en-US-JennyNeural', // Example model name
808
+ apiKey: process.env.AZURE_SPEECH_KEY,
809
+ region: process.env.AZURE_SPEECH_REGION,
810
+ language: 'en-US', // Language code
811
+ style: 'cheerful', // Voice style
812
+ pitch: '+0Hz', // Pitch adjustment
813
+ rate: '1.0', // Speech rate
814
+ },
815
+ listeningModel: {
816
+ name: 'en-US', // Example model name
817
+ apiKey: process.env.AZURE_SPEECH_KEY,
818
+ region: process.env.AZURE_SPEECH_REGION,
819
+ format: 'simple', // Output format
820
+ },
821
+ })
822
+ ```
823
+
824
+ Visit the [Azure Voice Reference](https://mastra.ai/reference/voice/azure) for more information on the Azure voice provider.
825
+
826
+ **ElevenLabs**:
827
+
828
+ ```typescript
829
+ // ElevenLabs Voice Configuration
830
+ const voice = new ElevenLabsVoice({
831
+ speechModel: {
832
+ voiceId: 'your-voice-id', // Example voice ID
833
+ model: 'eleven_multilingual_v2', // Example model name
834
+ apiKey: process.env.ELEVENLABS_API_KEY,
835
+ language: 'en', // Language code
836
+ emotion: 'neutral', // Emotion setting
837
+ },
838
+ // ElevenLabs may not have a separate listening model
839
+ })
840
+ ```
841
+
842
+ Visit the [ElevenLabs Voice Reference](https://mastra.ai/reference/voice/elevenlabs) for more information on the ElevenLabs voice provider.
843
+
844
+ **PlayAI**:
845
+
846
+ ```typescript
847
+ // PlayAI Voice Configuration
848
+ const voice = new PlayAIVoice({
849
+ speechModel: {
850
+ name: 'playai-voice', // Example model name
851
+ speaker: 'emma', // Example speaker name
852
+ apiKey: process.env.PLAYAI_API_KEY,
853
+ language: 'en-US', // Language code
854
+ speed: 1.0, // Speech speed
855
+ },
856
+ // PlayAI may not have a separate listening model
857
+ })
858
+ ```
859
+
860
+ Visit the [PlayAI Voice Reference](https://mastra.ai/reference/voice/playai) for more information on the PlayAI voice provider.
861
+
862
+ **Google**:
863
+
864
+ ```typescript
865
+ // Google Voice Configuration
866
+ const voice = new GoogleVoice({
867
+ speechModel: {
868
+ name: 'en-US-Studio-O', // Example model name
869
+ apiKey: process.env.GOOGLE_API_KEY,
870
+ languageCode: 'en-US', // Language code
871
+ gender: 'FEMALE', // Voice gender
872
+ speakingRate: 1.0, // Speaking rate
873
+ },
874
+ listeningModel: {
875
+ name: 'en-US', // Example model name
876
+ sampleRateHertz: 16000, // Sample rate
877
+ },
878
+ })
879
+ ```
880
+
881
+ Visit the [Google Voice Reference](https://mastra.ai/reference/voice/google) for more information on the Google voice provider.
882
+
883
+ **Cloudflare**:
884
+
885
+ ```typescript
886
+ // Cloudflare Voice Configuration
887
+ const voice = new CloudflareVoice({
888
+ speechModel: {
889
+ name: 'cloudflare-voice', // Example model name
890
+ accountId: process.env.CLOUDFLARE_ACCOUNT_ID,
891
+ apiToken: process.env.CLOUDFLARE_API_TOKEN,
892
+ language: 'en-US', // Language code
893
+ format: 'mp3', // Audio format
894
+ },
895
+ // Cloudflare may not have a separate listening model
896
+ })
897
+ ```
898
+
899
+ Visit the [Cloudflare Voice Reference](https://mastra.ai/reference/voice/cloudflare) for more information on the Cloudflare voice provider.
900
+
901
+ **Deepgram**:
902
+
903
+ ```typescript
904
+ // Deepgram Voice Configuration
905
+ const voice = new DeepgramVoice({
906
+ speechModel: {
907
+ name: 'nova-2', // Example model name
908
+ speaker: 'aura-english-us', // Example speaker name
909
+ apiKey: process.env.DEEPGRAM_API_KEY,
910
+ language: 'en-US', // Language code
911
+ tone: 'formal', // Tone setting
912
+ },
913
+ listeningModel: {
914
+ name: 'nova-2', // Example model name
915
+ format: 'flac', // Audio format
916
+ },
917
+ })
918
+ ```
919
+
920
+ Visit the [Deepgram Voice Reference](https://mastra.ai/reference/voice/deepgram) for more information on the Deepgram voice provider.
921
+
922
+ **Inworld**:
923
+
924
+ ```typescript
925
+ // Inworld Voice Configuration
926
+ const voice = new InworldVoice({
927
+ speechModel: {
928
+ name: 'inworld-tts-2',
929
+ apiKey: process.env.INWORLD_API_KEY,
930
+ },
931
+ listeningModel: {
932
+ name: 'groq/whisper-large-v3',
933
+ apiKey: process.env.INWORLD_API_KEY,
934
+ },
935
+ speaker: 'Dennis',
936
+ audioEncoding: 'MP3',
937
+ sampleRateHertz: 48000,
938
+ language: 'en-US',
939
+ })
940
+
941
+ // Per-call options: `deliveryMode` is honored only by `inworld-tts-2`.
942
+ const audioStream = await voice.speak('Hello!', {
943
+ deliveryMode: 'BALANCED', // 'STABLE' | 'BALANCED' | 'CREATIVE'
944
+ language: 'en-US', // BCP-47 per-call override
945
+ })
946
+ ```
947
+
948
+ Visit the [Inworld Voice Reference](https://mastra.ai/reference/voice/inworld) for more information on the Inworld voice provider.
949
+
950
+ **Speechify**:
951
+
952
+ ```typescript
953
+ // Speechify Voice Configuration
954
+ const voice = new SpeechifyVoice({
955
+ speechModel: {
956
+ name: 'speechify-voice', // Example model name
957
+ speaker: 'matthew', // Example speaker name
958
+ apiKey: process.env.SPEECHIFY_API_KEY,
959
+ language: 'en-US', // Language code
960
+ speed: 1.0, // Speech speed
961
+ },
962
+ // Speechify may not have a separate listening model
963
+ })
964
+ ```
965
+
966
+ Visit the [Speechify Voice Reference](https://mastra.ai/reference/voice/speechify) for more information on the Speechify voice provider.
967
+
968
+ **Sarvam**:
969
+
970
+ ```typescript
971
+ // Sarvam Voice Configuration
972
+ const voice = new SarvamVoice({
973
+ speechModel: {
974
+ model: 'bulbul:v3', // TTS model (bulbul:v2 or bulbul:v3)
975
+ apiKey: process.env.SARVAM_API_KEY,
976
+ language: 'en-IN', // BCP-47 language code
977
+ },
978
+ listeningModel: {
979
+ model: 'saarika:v2.5', // STT model (saarika:v2.5 or saaras:v3)
980
+ apiKey: process.env.SARVAM_API_KEY,
981
+ },
982
+ speaker: 'shubh', // Default bulbul:v3 speaker
983
+ })
984
+ ```
985
+
986
+ Visit the [Sarvam Voice Reference](https://mastra.ai/reference/voice/sarvam) for more information on the Sarvam voice provider.
987
+
988
+ **Murf**:
989
+
990
+ ```typescript
991
+ // Murf Voice Configuration
992
+ const voice = new MurfVoice({
993
+ speechModel: {
994
+ name: 'murf-voice', // Example model name
995
+ apiKey: process.env.MURF_API_KEY,
996
+ language: 'en-US', // Language code
997
+ emotion: 'happy', // Emotion setting
998
+ },
999
+ // Murf may not have a separate listening model
1000
+ })
1001
+ ```
1002
+
1003
+ Visit the [Murf Voice Reference](https://mastra.ai/reference/voice/murf) for more information on the Murf voice provider.
1004
+
1005
+ **OpenAI Realtime**:
1006
+
1007
+ ```typescript
1008
+ // OpenAI Realtime Voice Configuration
1009
+ const voice = new OpenAIRealtimeVoice({
1010
+ speechModel: {
1011
+ name: 'gpt-3.5-turbo', // Example model name
1012
+ apiKey: process.env.OPENAI_API_KEY,
1013
+ language: 'en-US', // Language code
1014
+ },
1015
+ listeningModel: {
1016
+ name: 'whisper-1', // Example model name
1017
+ apiKey: process.env.OPENAI_API_KEY,
1018
+ format: 'ogg', // Audio format
1019
+ },
1020
+ speaker: 'alloy', // Example speaker name
1021
+ })
1022
+ ```
1023
+
1024
+ For more information on the OpenAI Realtime voice provider, refer to the [OpenAI Realtime Voice Reference](https://mastra.ai/reference/voice/openai-realtime).
1025
+
1026
+ **xAI Realtime**:
1027
+
1028
+ ```typescript
1029
+ // xAI Realtime Voice Configuration
1030
+ const voice = new XAIRealtimeVoice({
1031
+ apiKey: process.env.XAI_API_KEY,
1032
+ model: 'grok-voice-think-fast-1.0',
1033
+ speaker: 'eve',
1034
+ instructions: 'You are a concise voice assistant.',
1035
+ turnDetection: {
1036
+ type: 'server_vad',
1037
+ threshold: 0.85,
1038
+ silence_duration_ms: 1000,
1039
+ prefix_padding_ms: 333,
1040
+ },
1041
+ audio: {
1042
+ input: { format: { type: 'audio/pcm', rate: 24000 } },
1043
+ output: { format: { type: 'audio/pcm', rate: 24000 } },
1044
+ },
1045
+ serverTools: [
1046
+ { type: 'web_search' },
1047
+ {
1048
+ type: 'mcp',
1049
+ server_url: 'https://mcp.example.com/mcp',
1050
+ server_label: 'business-tools',
1051
+ },
1052
+ ],
1053
+ })
1054
+ ```
1055
+
1056
+ Visit the [xAI Realtime Voice Reference](https://mastra.ai/reference/voice/xai-realtime) for more information on the xAI realtime voice provider.
1057
+
1058
+ **Google Gemini Live**:
1059
+
1060
+ ```typescript
1061
+ // Google Gemini Live Voice Configuration
1062
+ const voice = new GeminiLiveVoice({
1063
+ speechModel: {
1064
+ name: 'gemini-2.0-flash-exp', // Example model name
1065
+ apiKey: process.env.GOOGLE_API_KEY,
1066
+ },
1067
+ speaker: 'Puck', // Example speaker name
1068
+ // Google Gemini Live is a realtime bidirectional API without separate speech and listening models
1069
+ })
1070
+ ```
1071
+
1072
+ Visit the [Google Gemini Live Reference](https://mastra.ai/reference/voice/google-gemini-live) for more information on the Google Gemini Live voice provider.
1073
+
1074
+ **AWS Nova Sonic**:
1075
+
1076
+ ```typescript
1077
+ // AWS Nova Sonic Voice Configuration
1078
+ const voice = new NovaSonicVoice({
1079
+ region: 'us-east-1',
1080
+ speaker: 'matthew',
1081
+ sessionConfig: {
1082
+ inferenceConfiguration: {
1083
+ temperature: 0.7,
1084
+ maxTokens: 1024,
1085
+ },
1086
+ turnDetectionConfiguration: {
1087
+ endpointingSensitivity: 'MEDIUM',
1088
+ },
1089
+ },
1090
+ // AWS Nova Sonic is a realtime bidirectional API without separate speech and listening models
1091
+ })
1092
+ ```
1093
+
1094
+ Visit the [AWS Nova Sonic Reference](https://mastra.ai/reference/voice/aws-nova-sonic) for more information on the AWS Nova Sonic voice provider.
1095
+
1096
+ **Inworld Realtime**:
1097
+
1098
+ ```typescript
1099
+ // Inworld Realtime Voice Configuration
1100
+ const voice = new InworldRealtimeVoice({
1101
+ apiKey: process.env.INWORLD_API_KEY,
1102
+ model: 'inworld/models/gemma-4-26b-a4b-it',
1103
+ speaker: 'Sarah',
1104
+ // Typed Inworld realtime knobs (semantic VAD, playback speed, MCP tool routing, ...)
1105
+ session: {
1106
+ audio: {
1107
+ output: { speed: 1.1 },
1108
+ input: { turn_detection: { type: 'semantic_vad', eagerness: 'high' } },
1109
+ },
1110
+ },
1111
+ })
1112
+ ```
1113
+
1114
+ Visit the [Inworld Realtime Reference](https://mastra.ai/reference/voice/inworld-realtime) for more information on the Inworld Realtime voice provider.
1115
+
1116
+ **AI SDK**:
1117
+
1118
+ ```typescript
1119
+ // AI SDK Voice Configuration
1120
+ import { CompositeVoice } from '@mastra/core/voice'
1121
+ import { openai } from '@ai-sdk/openai'
1122
+ import { elevenlabs } from '@ai-sdk/elevenlabs'
1123
+
1124
+ // Use AI SDK models directly - no need to install separate packages
1125
+ const voice = new CompositeVoice({
1126
+ input: openai.transcription('whisper-1'), // AI SDK transcription
1127
+ output: elevenlabs.speech('eleven_turbo_v2'), // AI SDK speech
1128
+ })
1129
+
1130
+ // Works seamlessly with your agent
1131
+ const voiceAgent = new Agent({
1132
+ id: 'aisdk-voice-agent',
1133
+ name: 'AI SDK Voice Agent',
1134
+ instructions: 'You are a helpful assistant with voice capabilities.',
1135
+ model: 'openai/gpt-5.5',
1136
+ voice,
1137
+ })
1138
+ ```
1139
+
1140
+ ### Using Multiple Voice Providers
1141
+
1142
+ This example demonstrates how to create and use two different voice providers in Mastra: OpenAI for speech-to-text (STT) and PlayAI for text-to-speech (TTS).
1143
+
1144
+ Start by creating instances of the voice providers with any necessary configuration.
1145
+
1146
+ ```typescript
1147
+ import { OpenAIVoice } from '@mastra/voice-openai'
1148
+ import { PlayAIVoice } from '@mastra/voice-playai'
1149
+ import { CompositeVoice } from '@mastra/core/voice'
1150
+ import { playAudio, getMicrophoneStream } from '@mastra/node-audio'
1151
+
1152
+ // Initialize OpenAI voice for STT
1153
+ const input = new OpenAIVoice({
1154
+ listeningModel: {
1155
+ name: 'whisper-1',
1156
+ apiKey: process.env.OPENAI_API_KEY,
1157
+ },
1158
+ })
1159
+
1160
+ // Initialize PlayAI voice for TTS
1161
+ const output = new PlayAIVoice({
1162
+ speechModel: {
1163
+ name: 'playai-voice',
1164
+ apiKey: process.env.PLAYAI_API_KEY,
1165
+ },
1166
+ })
1167
+
1168
+ // Combine the providers using CompositeVoice
1169
+ const voice = new CompositeVoice({
1170
+ input,
1171
+ output,
1172
+ })
1173
+
1174
+ // Implement voice interactions using the combined voice provider
1175
+ const audioStream = getMicrophoneStream() // Assume this function gets audio input
1176
+ const transcript = await voice.listen(audioStream)
1177
+
1178
+ // Log the transcribed text
1179
+ console.log('Transcribed text:', transcript)
1180
+
1181
+ // Convert text to speech
1182
+ const responseAudio = await voice.speak(`You said: ${transcript}`, {
1183
+ speaker: 'default', // Optional: specify a speaker
1184
+ responseFormat: 'wav', // Optional: specify a response format
1185
+ })
1186
+
1187
+ // Play the audio response
1188
+ playAudio(responseAudio)
1189
+ ```
1190
+
1191
+ ### Using AI SDK Model Providers
1192
+
1193
+ You can also use AI SDK models directly with `CompositeVoice`:
1194
+
1195
+ ```typescript
1196
+ import { CompositeVoice } from '@mastra/core/voice'
1197
+ import { openai } from '@ai-sdk/openai'
1198
+ import { elevenlabs } from '@ai-sdk/elevenlabs'
1199
+ import { playAudio, getMicrophoneStream } from '@mastra/node-audio'
1200
+
1201
+ // Use AI SDK models directly - no provider setup needed
1202
+ const voice = new CompositeVoice({
1203
+ input: openai.transcription('whisper-1'), // AI SDK transcription
1204
+ output: elevenlabs.speech('eleven_turbo_v2'), // AI SDK speech
1205
+ })
1206
+
1207
+ // Works the same way as Mastra providers
1208
+ const audioStream = getMicrophoneStream()
1209
+ const transcript = await voice.listen(audioStream)
1210
+
1211
+ console.log('Transcribed text:', transcript)
1212
+
1213
+ // Convert text to speech
1214
+ const responseAudio = await voice.speak(`You said: ${transcript}`, {
1215
+ speaker: 'Rachel', // ElevenLabs voice
1216
+ })
1217
+
1218
+ playAudio(responseAudio)
1219
+ ```
1220
+
1221
+ You can also mix AI SDK models with Mastra providers:
1222
+
1223
+ ```typescript
1224
+ import { CompositeVoice } from '@mastra/core/voice'
1225
+ import { PlayAIVoice } from '@mastra/voice-playai'
1226
+ import { groq } from '@ai-sdk/groq'
1227
+
1228
+ const voice = new CompositeVoice({
1229
+ input: groq.transcription('whisper-large-v3'), // AI SDK for STT
1230
+ output: new PlayAIVoice(), // Mastra provider for TTS
1231
+ })
1232
+ ```
1233
+
1234
+ For more information on `CompositeVoice`, refer to the [CompositeVoice Reference](https://mastra.ai/reference/voice/composite-voice).
1235
+
1236
+ ## More resources
1237
+
1238
+ - [CompositeVoice](https://mastra.ai/reference/voice/composite-voice)
1239
+ - [MastraVoice](https://mastra.ai/reference/voice/mastra-voice)
1240
+ - [OpenAI Voice](https://mastra.ai/reference/voice/openai)
1241
+ - [OpenAI Realtime Voice](https://mastra.ai/reference/voice/openai-realtime)
1242
+ - [xAI Realtime Voice](https://mastra.ai/reference/voice/xai-realtime)
1243
+ - [Azure Voice](https://mastra.ai/reference/voice/azure)
1244
+ - [Google Voice](https://mastra.ai/reference/voice/google)
1245
+ - [Google Gemini Live Voice](https://mastra.ai/reference/voice/google-gemini-live)
1246
+ - [AWS Nova Sonic Voice](https://mastra.ai/reference/voice/aws-nova-sonic)
1247
+ - [Deepgram Voice](https://mastra.ai/reference/voice/deepgram)
1248
+ - [Inworld Voice](https://mastra.ai/reference/voice/inworld)
1249
+ - [PlayAI Voice](https://mastra.ai/reference/voice/playai)
1250
+ - [Voice Examples](https://github.com/mastra-ai/voice-examples)