@mastra/voice-openai 0.12.1 → 0.12.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (128)
  1. package/CHANGELOG.md +16 -0
  2. package/LICENSE.md +15 -0
  3. package/dist/_types/@internal_voice/dist/_types/@internal_ai-sdk-v5/dist/index.d.ts +8888 -0
  4. package/dist/_types/@internal_voice/dist/_types/@internal_core/dist/base/index.d.ts +31 -0
  5. package/dist/_types/@internal_voice/dist/_types/@internal_core/dist/logger/index.d.ts +217 -0
  6. package/dist/_types/@internal_voice/dist/_types/@internal_core/dist/request-context/index.d.ts +147 -0
  7. package/dist/_types/@internal_voice/dist/_types/@internal_core/dist/types/index.d.ts +3 -0
  8. package/dist/_types/@internal_voice/dist/_types/zod/v3/ZodError.d.ts +164 -0
  9. package/dist/_types/@internal_voice/dist/_types/zod/v3/errors.d.ts +5 -0
  10. package/dist/_types/@internal_voice/dist/_types/zod/v3/external.d.ts +6 -0
  11. package/dist/_types/@internal_voice/dist/_types/zod/v3/helpers/enumUtil.d.ts +8 -0
  12. package/dist/_types/@internal_voice/dist/_types/zod/v3/helpers/errorUtil.d.ts +9 -0
  13. package/dist/_types/@internal_voice/dist/_types/zod/v3/helpers/parseUtil.d.ts +78 -0
  14. package/dist/_types/@internal_voice/dist/_types/zod/v3/helpers/partialUtil.d.ts +8 -0
  15. package/dist/_types/@internal_voice/dist/_types/zod/v3/helpers/typeAliases.d.ts +2 -0
  16. package/dist/_types/@internal_voice/dist/_types/zod/v3/helpers/util.d.ts +85 -0
  17. package/dist/_types/@internal_voice/dist/_types/zod/v3/index.d.cts +4 -0
  18. package/dist/_types/@internal_voice/dist/_types/zod/v3/index.d.ts +4 -0
  19. package/dist/_types/@internal_voice/dist/_types/zod/v3/locales/en.d.ts +3 -0
  20. package/dist/_types/@internal_voice/dist/_types/zod/v3/standard-schema.d.ts +102 -0
  21. package/dist/_types/@internal_voice/dist/_types/zod/v3/types.d.ts +1034 -0
  22. package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/checks.d.ts +1 -0
  23. package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/coerce.d.ts +17 -0
  24. package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/compat.d.ts +50 -0
  25. package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/errors.d.ts +30 -0
  26. package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/external.d.ts +16 -0
  27. package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/from-json-schema.d.ts +12 -0
  28. package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/index.d.ts +4 -0
  29. package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/iso.d.ts +22 -0
  30. package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/parse.d.ts +31 -0
  31. package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/schemas.d.ts +767 -0
  32. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/api.d.ts +325 -0
  33. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/checks.d.ts +278 -0
  34. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/core.d.ts +70 -0
  35. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/doc.d.ts +14 -0
  36. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/errors.d.ts +221 -0
  37. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/index.d.ts +16 -0
  38. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/json-schema-generator.d.ts +65 -0
  39. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/json-schema-processors.d.ts +49 -0
  40. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/json-schema.d.ts +88 -0
  41. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/parse.d.ts +49 -0
  42. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/regexes.d.ts +85 -0
  43. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/registries.d.ts +35 -0
  44. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/schemas.d.ts +1184 -0
  45. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/standard-schema.d.ts +126 -0
  46. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/to-json-schema.d.ts +114 -0
  47. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/util.d.ts +200 -0
  48. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/versions.d.ts +5 -0
  49. package/dist/_types/@internal_voice/dist/_types/zod/v4/index.d.cts +3 -0
  50. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ar.d.ts +4 -0
  51. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/az.d.ts +4 -0
  52. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/be.d.ts +4 -0
  53. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/bg.d.ts +4 -0
  54. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ca.d.ts +4 -0
  55. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/cs.d.ts +4 -0
  56. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/da.d.ts +4 -0
  57. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/de.d.ts +4 -0
  58. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/el.d.ts +4 -0
  59. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/en.d.ts +4 -0
  60. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/eo.d.ts +4 -0
  61. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/es.d.ts +4 -0
  62. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/fa.d.ts +4 -0
  63. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/fi.d.ts +4 -0
  64. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/fr-CA.d.ts +4 -0
  65. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/fr.d.ts +4 -0
  66. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/he.d.ts +4 -0
  67. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/hr.d.ts +4 -0
  68. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/hu.d.ts +4 -0
  69. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/hy.d.ts +4 -0
  70. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/id.d.ts +4 -0
  71. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/index.d.ts +52 -0
  72. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/is.d.ts +4 -0
  73. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/it.d.ts +4 -0
  74. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ja.d.ts +4 -0
  75. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ka.d.ts +4 -0
  76. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/kh.d.ts +5 -0
  77. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/km.d.ts +4 -0
  78. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ko.d.ts +4 -0
  79. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/lt.d.ts +4 -0
  80. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/mk.d.ts +4 -0
  81. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ms.d.ts +4 -0
  82. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/nl.d.ts +4 -0
  83. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/no.d.ts +4 -0
  84. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ota.d.ts +4 -0
  85. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/pl.d.ts +4 -0
  86. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ps.d.ts +4 -0
  87. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/pt.d.ts +4 -0
  88. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ro.d.ts +4 -0
  89. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ru.d.ts +4 -0
  90. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/sl.d.ts +4 -0
  91. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/sv.d.ts +4 -0
  92. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ta.d.ts +4 -0
  93. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/th.d.ts +4 -0
  94. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/tr.d.ts +4 -0
  95. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ua.d.ts +5 -0
  96. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/uk.d.ts +4 -0
  97. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ur.d.ts +4 -0
  98. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/uz.d.ts +4 -0
  99. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/vi.d.ts +4 -0
  100. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/yo.d.ts +4 -0
  101. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/zh-CN.d.ts +4 -0
  102. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/zh-TW.d.ts +4 -0
  103. package/dist/_types/@internal_voice/dist/index.d.ts +16 -0
  104. package/dist/_types/@internal_voice/dist/voice/aisdk/index.d.ts +3 -0
  105. package/dist/_types/@internal_voice/dist/voice/aisdk/speech.d.ts +23 -0
  106. package/dist/_types/@internal_voice/dist/voice/aisdk/transcription.d.ts +22 -0
  107. package/dist/_types/@internal_voice/dist/voice/composite-voice.d.ts +72 -0
  108. package/dist/_types/@internal_voice/dist/voice/default-voice.d.ts +13 -0
  109. package/dist/_types/@internal_voice/dist/voice/index.d.ts +5 -0
  110. package/dist/_types/@internal_voice/dist/voice/voice.d.ts +172 -0
  111. package/dist/docs/SKILL.md +2 -2
  112. package/dist/docs/assets/SOURCE_MAP.json +1 -1
  113. package/dist/docs/references/docs-agents-adding-voice.md +186 -158
  114. package/dist/docs/references/docs-voice-overview.md +650 -379
  115. package/dist/docs/references/docs-voice-speech-to-text.md +27 -28
  116. package/dist/docs/references/docs-voice-text-to-speech.md +28 -29
  117. package/dist/docs/references/reference-voice-composite-voice.md +37 -37
  118. package/dist/docs/references/reference-voice-openai.md +36 -30
  119. package/dist/docs/references/reference-voice-voice.getSpeakers.md +45 -45
  120. package/dist/docs/references/reference-voice-voice.listen.md +64 -58
  121. package/dist/docs/references/reference-voice-voice.speak.md +65 -55
  122. package/dist/index.cjs +260 -2
  123. package/dist/index.cjs.map +1 -1
  124. package/dist/index.d.ts +1 -1
  125. package/dist/index.d.ts.map +1 -1
  126. package/dist/index.js +259 -1
  127. package/dist/index.js.map +1 -1
  128. package/package.json +10 -11
@@ -7,39 +7,39 @@ Mastra agents can be enhanced with voice capabilities, allowing them to speak re
7
7
  The simplest way to add voice to an agent is to use a single provider for both speaking and listening:
8
8
 
9
9
  ```typescript
10
- import { createReadStream } from "fs";
11
- import path from "path";
12
- import { Agent } from "@mastra/core/agent";
13
- import { OpenAIVoice } from "@mastra/voice-openai";
10
+ import { createReadStream } from 'fs'
11
+ import path from 'path'
12
+ import { Agent } from '@mastra/core/agent'
13
+ import { OpenAIVoice } from '@mastra/voice-openai'
14
14
 
15
15
  // Initialize the voice provider with default settings
16
- const voice = new OpenAIVoice();
16
+ const voice = new OpenAIVoice()
17
17
 
18
18
  // Create an agent with voice capabilities
19
19
  export const agent = new Agent({
20
- id: "voice-agent",
21
- name: "Voice Agent",
20
+ id: 'voice-agent',
21
+ name: 'Voice Agent',
22
22
  instructions: `You are a helpful assistant with both STT and TTS capabilities.`,
23
- model: "openai/gpt-5.1",
23
+ model: 'openai/gpt-5.5',
24
24
  voice,
25
- });
25
+ })
26
26
 
27
27
  // The agent can now use voice for interaction
28
28
  const audioStream = await agent.voice.speak("Hello, I'm your AI assistant!", {
29
- filetype: "m4a",
30
- });
29
+ filetype: 'm4a',
30
+ })
31
31
 
32
- playAudio(audioStream!);
32
+ playAudio(audioStream!)
33
33
 
34
34
  try {
35
- const transcription = await agent.voice.listen(audioStream);
36
- console.log(transcription);
35
+ const transcription = await agent.voice.listen(audioStream)
36
+ console.log(transcription)
37
37
  } catch (error) {
38
- console.error("Error transcribing audio:", error);
38
+ console.error('Error transcribing audio:', error)
39
39
  }
40
40
  ```
41
41
 
42
- ## Working with Audio Streams
42
+ ## Working with audio streams
43
43
 
44
44
  The `speak()` and `listen()` methods work with Node.js streams. Here's how to save and load audio files:
45
45
 
@@ -48,20 +48,20 @@ The `speak()` and `listen()` methods work with Node.js streams. Here's how to sa
48
48
  The `speak` method returns a stream that you can pipe to a file or speaker.
49
49
 
50
50
  ```typescript
51
- import { createWriteStream } from "fs";
52
- import path from "path";
51
+ import { createWriteStream } from 'fs'
52
+ import path from 'path'
53
53
 
54
54
  // Generate speech and save to file
55
- const audio = await agent.voice.speak("Hello, World!");
56
- const filePath = path.join(process.cwd(), "agent.mp3");
57
- const writer = createWriteStream(filePath);
55
+ const audio = await agent.voice.speak('Hello, World!')
56
+ const filePath = path.join(process.cwd(), 'agent.mp3')
57
+ const writer = createWriteStream(filePath)
58
58
 
59
- audio.pipe(writer);
59
+ audio.pipe(writer)
60
60
 
61
61
  await new Promise<void>((resolve, reject) => {
62
- writer.on("finish", () => resolve());
63
- writer.on("error", reject);
64
- });
62
+ writer.on('finish', () => resolve())
63
+ writer.on('error', reject)
64
+ })
65
65
  ```
66
66
 
67
67
  ### Transcribing Audio Input
@@ -69,88 +69,119 @@ await new Promise<void>((resolve, reject) => {
69
69
  The `listen` method expects a stream of audio data from a microphone or file.
70
70
 
71
71
  ```typescript
72
- import { createReadStream } from "fs";
73
- import path from "path";
72
+ import { createReadStream } from 'fs'
73
+ import path from 'path'
74
74
 
75
75
  // Read audio file and transcribe
76
- const audioFilePath = path.join(process.cwd(), "/agent.m4a");
77
- const audioStream = createReadStream(audioFilePath);
76
+ const audioFilePath = path.join(process.cwd(), '/agent.m4a')
77
+ const audioStream = createReadStream(audioFilePath)
78
78
 
79
79
  try {
80
- console.log("Transcribing audio file...");
80
+ console.log('Transcribing audio file...')
81
81
  const transcription = await agent.voice.listen(audioStream, {
82
- filetype: "m4a",
83
- });
84
- console.log("Transcription:", transcription);
82
+ filetype: 'm4a',
83
+ })
84
+ console.log('Transcription:', transcription)
85
85
  } catch (error) {
86
- console.error("Error transcribing audio:", error);
86
+ console.error('Error transcribing audio:', error)
87
87
  }
88
88
  ```
89
89
 
90
- ## Speech-to-Speech Voice Interactions
90
+ ## Speech-to-speech voice interactions
91
91
 
92
92
  For more dynamic and interactive voice experiences, you can use real-time voice providers that support speech-to-speech capabilities:
93
93
 
94
94
  ```typescript
95
- import { Agent } from "@mastra/core/agent";
96
- import { getMicrophoneStream } from "@mastra/node-audio";
97
- import { OpenAIRealtimeVoice } from "@mastra/voice-openai-realtime";
98
- import { search, calculate } from "../tools";
95
+ import { Agent } from '@mastra/core/agent'
96
+ import { getMicrophoneStream } from '@mastra/node-audio'
97
+ import { OpenAIRealtimeVoice } from '@mastra/voice-openai-realtime'
98
+ import { search, calculate } from '../tools'
99
99
 
100
100
  // Initialize the realtime voice provider
101
101
  const voice = new OpenAIRealtimeVoice({
102
102
  apiKey: process.env.OPENAI_API_KEY,
103
- model: "gpt-5.1-realtime",
104
- speaker: "alloy",
105
- });
103
+ model: 'gpt-5.1-realtime',
104
+ speaker: 'alloy',
105
+ })
106
106
 
107
107
  // Create an agent with speech-to-speech voice capabilities
108
108
  export const agent = new Agent({
109
- id: "speech-to-speech-agent",
110
- name: "Speech-to-Speech Agent",
109
+ id: 'speech-to-speech-agent',
110
+ name: 'Speech-to-Speech Agent',
111
111
  instructions: `You are a helpful assistant with speech-to-speech capabilities.`,
112
- model: "openai/gpt-5.1",
112
+ model: 'openai/gpt-5.5',
113
113
  tools: {
114
114
  // Tools configured on Agent are passed to voice provider
115
115
  search,
116
116
  calculate,
117
117
  },
118
118
  voice,
119
- });
119
+ })
120
120
 
121
121
  // Establish a WebSocket connection
122
- await agent.voice.connect();
122
+ await agent.voice.connect()
123
123
 
124
124
  // Start a conversation
125
- agent.voice.speak("Hello, I'm your AI assistant!");
125
+ agent.voice.speak("Hello, I'm your AI assistant!")
126
126
 
127
127
  // Stream audio from a microphone
128
- const microphoneStream = getMicrophoneStream();
129
- agent.voice.send(microphoneStream);
128
+ const microphoneStream = getMicrophoneStream()
129
+ agent.voice.send(microphoneStream)
130
130
 
131
131
  // When done with the conversation
132
- agent.voice.close();
132
+ agent.voice.close()
133
133
  ```
134
134
 
135
+ ### Per-session voice for concurrent sessions
136
+
137
+ A static `voice` instance is shared across every request. For one-shot text-to-speech this is fine, but realtime and speech-to-speech providers store one WebSocket, one set of tools, and one request context per instance. If you deploy a single agent that handles several live sessions at once, a shared instance lets one session overwrite another session's tools, instructions, and request context.
138
+
139
+ To give each session its own voice, provide `voice` as a resolver. Mastra runs the resolver on every `getVoice()` call and returns a fresh, session-owned instance:
140
+
141
+ ```typescript
142
+ import { Agent } from '@mastra/core/agent'
143
+ import { OpenAIRealtimeVoice } from '@mastra/voice-openai-realtime'
144
+
145
+ export const agent = new Agent({
146
+ id: 'support-line',
147
+ name: 'Support Line',
148
+ instructions: ({ requestContext }) => `Help user ${requestContext.get('user')}.`,
149
+ model: 'openai/gpt-5.5',
150
+ voice: ({ requestContext }) => new OpenAIRealtimeVoice({ apiKey: requestContext.get('apiKey') }),
151
+ })
152
+
153
+ // Each concurrent session resolves its own voice instance
154
+ const voice = await agent.getVoice({ requestContext })
155
+ await voice.connect()
156
+ ```
157
+
158
+ When you use a resolver:
159
+
160
+ - Each call to `getVoice()` returns a new instance, so concurrent sessions never share state.
161
+ - Mastra does not add tools or instructions to a resolver instance. Configure those inside the resolver or on the provider.
162
+ - You own the lifecycle of the returned instance, so call `disconnect()` or `close()` when the session ends.
163
+
164
+ The `agent.voice` getter has no request context, so it throws when `voice` is a resolver. Use `agent.getVoice({ requestContext })` instead.
165
+
135
166
  ### Event System
136
167
 
137
168
  The realtime voice provider emits several events you can listen for:
138
169
 
139
170
  ```typescript
140
171
  // Listen for speech audio data sent from voice provider
141
- agent.voice.on("speaking", ({ audio }) => {
172
+ agent.voice.on('speaking', ({ audio }) => {
142
173
  // audio contains ReadableStream or Int16Array audio data
143
- });
174
+ })
144
175
 
145
176
  // Listen for transcribed text sent from both voice provider and user
146
- agent.voice.on("writing", ({ text, role }) => {
147
- console.log(`${role} said: ${text}`);
148
- });
177
+ agent.voice.on('writing', ({ text, role }) => {
178
+ console.log(`${role} said: ${text}`)
179
+ })
149
180
 
150
181
  // Listen for errors
151
- agent.voice.on("error", (error) => {
152
- console.error("Voice error:", error);
153
- });
182
+ agent.voice.on('error', error => {
183
+ console.error('Voice error:', error)
184
+ })
154
185
  ```
155
186
 
156
187
  ## Examples
@@ -165,92 +196,88 @@ The following files are created:
165
196
  - **unified-response.mp3** – Unified agent's spoken response.
166
197
 
167
198
  ```typescript
168
- import "dotenv/config";
199
+ import 'dotenv/config'
169
200
 
170
- import path from "path";
171
- import { createReadStream } from "fs";
172
- import { Agent } from "@mastra/core/agent";
173
- import { CompositeVoice } from "@mastra/core/voice";
174
- import { OpenAIVoice } from "@mastra/voice-openai";
175
- import { Mastra } from "@mastra/core";
201
+ import path from 'path'
202
+ import { createReadStream } from 'fs'
203
+ import { Agent } from '@mastra/core/agent'
204
+ import { CompositeVoice } from '@mastra/core/voice'
205
+ import { OpenAIVoice } from '@mastra/voice-openai'
206
+ import { Mastra } from '@mastra/core'
176
207
 
177
208
  // Saves an audio stream to a file in the audio directory, creating the directory if it doesn't exist.
178
209
  export const saveAudioToFile = async (
179
210
  audio: NodeJS.ReadableStream,
180
211
  filename: string,
181
212
  ): Promise<void> => {
182
- const audioDir = path.join(process.cwd(), "audio");
183
- const filePath = path.join(audioDir, filename);
213
+ const audioDir = path.join(process.cwd(), 'audio')
214
+ const filePath = path.join(audioDir, filename)
184
215
 
185
- await fs.promises.mkdir(audioDir, { recursive: true });
216
+ await fs.promises.mkdir(audioDir, { recursive: true })
186
217
 
187
- const writer = createWriteStream(filePath);
188
- audio.pipe(writer);
218
+ const writer = createWriteStream(filePath)
219
+ audio.pipe(writer)
189
220
  return new Promise((resolve, reject) => {
190
- writer.on("finish", resolve);
191
- writer.on("error", reject);
192
- });
193
- };
221
+ writer.on('finish', resolve)
222
+ writer.on('error', reject)
223
+ })
224
+ }
194
225
 
195
226
  // Converts the input to text: returns a string as-is, or reads a stream to the end and decodes it as UTF-8.
196
- export const convertToText = async (
197
- input: string | NodeJS.ReadableStream,
198
- ): Promise<string> => {
199
- if (typeof input === "string") {
200
- return input;
227
+ export const convertToText = async (input: string | NodeJS.ReadableStream): Promise<string> => {
228
+ if (typeof input === 'string') {
229
+ return input
201
230
  }
202
231
 
203
- const chunks: Buffer[] = [];
232
+ const chunks: Buffer[] = []
204
233
  return new Promise((resolve, reject) => {
205
- inputData.on("data", (chunk) => chunks.push(Buffer.from(chunk)));
206
- inputData.on("error", reject);
207
- inputData.on("end", () => resolve(Buffer.concat(chunks).toString("utf-8")));
208
- });
209
- };
234
+ inputData.on('data', chunk => chunks.push(Buffer.from(chunk)))
235
+ inputData.on('error', reject)
236
+ inputData.on('end', () => resolve(Buffer.concat(chunks).toString('utf-8')))
237
+ })
238
+ }
210
239
 
211
240
  export const hybridVoiceAgent = new Agent({
212
- id: "hybrid-voice-agent",
213
- name: "Hybrid Voice Agent",
214
- model: "openai/gpt-5.1",
215
- instructions: "You can speak and listen using different providers.",
241
+ id: 'hybrid-voice-agent',
242
+ name: 'Hybrid Voice Agent',
243
+ model: 'openai/gpt-5.5',
244
+ instructions: 'You can speak and listen using different providers.',
216
245
  voice: new CompositeVoice({
217
246
  input: new OpenAIVoice(),
218
247
  output: new OpenAIVoice(),
219
248
  }),
220
- });
249
+ })
221
250
 
222
251
  export const unifiedVoiceAgent = new Agent({
223
- id: "unified-voice-agent",
224
- name: "Unified Voice Agent",
225
- instructions: "You are an agent with both STT and TTS capabilities.",
226
- model: "openai/gpt-5.1",
252
+ id: 'unified-voice-agent',
253
+ name: 'Unified Voice Agent',
254
+ instructions: 'You are an agent with both STT and TTS capabilities.',
255
+ model: 'openai/gpt-5.5',
227
256
  voice: new OpenAIVoice(),
228
- });
257
+ })
229
258
 
230
259
  export const mastra = new Mastra({
231
260
  agents: { hybridVoiceAgent, unifiedVoiceAgent },
232
- });
261
+ })
233
262
 
234
- const hybridVoiceAgent = mastra.getAgent("hybridVoiceAgent");
235
- const unifiedVoiceAgent = mastra.getAgent("unifiedVoiceAgent");
263
+ const hybridVoiceAgent = mastra.getAgent('hybridVoiceAgent')
264
+ const unifiedVoiceAgent = mastra.getAgent('unifiedVoiceAgent')
236
265
 
237
- const question = "What is the meaning of life in one sentence?";
266
+ const question = 'What is the meaning of life in one sentence?'
238
267
 
239
- const hybridSpoken = await hybridVoiceAgent.voice.speak(question);
268
+ const hybridSpoken = await hybridVoiceAgent.voice.speak(question)
240
269
 
241
- await saveAudioToFile(hybridSpoken!, "hybrid-question.mp3");
270
+ await saveAudioToFile(hybridSpoken!, 'hybrid-question.mp3')
242
271
 
243
- const audioStream = createReadStream(
244
- path.join(process.cwd(), "audio", "hybrid-question.mp3"),
245
- );
246
- const unifiedHeard = await unifiedVoiceAgent.voice.listen(audioStream);
272
+ const audioStream = createReadStream(path.join(process.cwd(), 'audio', 'hybrid-question.mp3'))
273
+ const unifiedHeard = await unifiedVoiceAgent.voice.listen(audioStream)
247
274
 
248
- const inputText = await convertToText(unifiedHeard!);
275
+ const inputText = await convertToText(unifiedHeard!)
249
276
 
250
- const unifiedResponse = await unifiedVoiceAgent.generate(inputText);
251
- const unifiedSpoken = await unifiedVoiceAgent.voice.speak(unifiedResponse.text);
277
+ const unifiedResponse = await unifiedVoiceAgent.generate(inputText)
278
+ const unifiedSpoken = await unifiedVoiceAgent.voice.speak(unifiedResponse.text)
252
279
 
253
- await saveAudioToFile(unifiedSpoken!, "unified-response.mp3");
280
+ await saveAudioToFile(unifiedSpoken!, 'unified-response.mp3')
254
281
  ```
255
282
 
256
283
  ### Using Multiple Providers
@@ -258,23 +285,23 @@ await saveAudioToFile(unifiedSpoken!, "unified-response.mp3");
258
285
  For more flexibility, you can use different providers for speaking and listening with the `CompositeVoice` class:
259
286
 
260
287
  ```typescript
261
- import { Agent } from "@mastra/core/agent";
262
- import { CompositeVoice } from "@mastra/core/voice";
263
- import { OpenAIVoice } from "@mastra/voice-openai";
264
- import { PlayAIVoice } from "@mastra/voice-playai";
288
+ import { Agent } from '@mastra/core/agent'
289
+ import { CompositeVoice } from '@mastra/core/voice'
290
+ import { OpenAIVoice } from '@mastra/voice-openai'
291
+ import { PlayAIVoice } from '@mastra/voice-playai'
265
292
 
266
293
  export const agent = new Agent({
267
- id: "voice-agent",
268
- name: "Voice Agent",
294
+ id: 'voice-agent',
295
+ name: 'Voice Agent',
269
296
  instructions: `You are a helpful assistant with both STT and TTS capabilities.`,
270
- model: "openai/gpt-5.1",
297
+ model: 'openai/gpt-5.5',
271
298
 
272
299
  // Create a composite voice using OpenAI for listening and PlayAI for speaking
273
300
  voice: new CompositeVoice({
274
301
  input: new OpenAIVoice(),
275
302
  output: new PlayAIVoice(),
276
303
  }),
277
- });
304
+ })
278
305
  ```
279
306
 
280
307
  ### Using AI SDK
@@ -282,28 +309,28 @@ export const agent = new Agent({
282
309
  Mastra supports using AI SDK's transcription and speech models directly in `CompositeVoice`, giving you access to a wide range of providers through the AI SDK ecosystem:
283
310
 
284
311
  ```typescript
285
- import { Agent } from "@mastra/core/agent";
286
- import { CompositeVoice } from "@mastra/core/voice";
287
- import { openai } from "@ai-sdk/openai";
288
- import { elevenlabs } from "@ai-sdk/elevenlabs";
289
- import { groq } from "@ai-sdk/groq";
312
+ import { Agent } from '@mastra/core/agent'
313
+ import { CompositeVoice } from '@mastra/core/voice'
314
+ import { openai } from '@ai-sdk/openai'
315
+ import { elevenlabs } from '@ai-sdk/elevenlabs'
316
+ import { groq } from '@ai-sdk/groq'
290
317
 
291
318
  export const agent = new Agent({
292
- id: "aisdk-voice-agent",
293
- name: "AI SDK Voice Agent",
319
+ id: 'aisdk-voice-agent',
320
+ name: 'AI SDK Voice Agent',
294
321
  instructions: `You are a helpful assistant with voice capabilities.`,
295
- model: "openai/gpt-5.1",
322
+ model: 'openai/gpt-5.5',
296
323
 
297
324
  // Pass AI SDK models directly to CompositeVoice
298
325
  voice: new CompositeVoice({
299
- input: openai.transcription('whisper-1'), // AI SDK transcription model
300
- output: elevenlabs.speech('eleven_turbo_v2'), // AI SDK speech model
326
+ input: openai.transcription('whisper-1'), // AI SDK transcription model
327
+ output: elevenlabs.speech('eleven_turbo_v2'), // AI SDK speech model
301
328
  }),
302
- });
329
+ })
303
330
 
304
331
  // Use voice capabilities as usual
305
- const audioStream = await agent.voice.speak("Hello!");
306
- const transcribedText = await agent.voice.listen(audioStream);
332
+ const audioStream = await agent.voice.speak('Hello!')
333
+ const transcribedText = await agent.voice.listen(audioStream)
307
334
  ```
308
335
 
309
336
  #### Mix and Match Providers
@@ -311,15 +338,15 @@ const transcribedText = await agent.voice.listen(audioStream);
311
338
  You can mix AI SDK models with Mastra voice providers:
312
339
 
313
340
  ```typescript
314
- import { CompositeVoice } from "@mastra/core/voice";
315
- import { PlayAIVoice } from "@mastra/voice-playai";
316
- import { openai } from "@ai-sdk/openai";
341
+ import { CompositeVoice } from '@mastra/core/voice'
342
+ import { PlayAIVoice } from '@mastra/voice-playai'
343
+ import { openai } from '@ai-sdk/openai'
317
344
 
318
345
  // Use AI SDK for transcription and Mastra provider for speech
319
346
  const voice = new CompositeVoice({
320
- input: openai.transcription('whisper-1'), // AI SDK
321
- output: new PlayAIVoice(), // Mastra provider
322
- });
347
+ input: openai.transcription('whisper-1'), // AI SDK
348
+ output: new PlayAIVoice(), // Mastra provider
349
+ })
323
350
  ```
324
351
 
325
352
  For the complete list of supported AI SDK providers and their capabilities:
@@ -327,27 +354,28 @@ For the complete list of supported AI SDK providers and their capabilities:
327
354
  - [Transcription](https://ai-sdk.dev/docs/providers/openai/transcription)
328
355
  - [Speech](https://ai-sdk.dev/docs/providers/elevenlabs/speech)
329
356
 
330
- ## Supported Voice Providers
357
+ ## Supported voice providers
331
358
 
332
359
  Mastra supports multiple voice providers for text-to-speech (TTS) and speech-to-text (STT) capabilities:
333
360
 
334
- | Provider | Package | Features | Reference |
335
- | --------------- | ------------------------------- | ------------------------- | ------------------------------------------------------------------ |
336
- | OpenAI | `@mastra/voice-openai` | TTS, STT | [Documentation](https://mastra.ai/reference/voice/openai) |
337
- | OpenAI Realtime | `@mastra/voice-openai-realtime` | Realtime speech-to-speech | [Documentation](https://mastra.ai/reference/voice/openai-realtime) |
338
- | ElevenLabs | `@mastra/voice-elevenlabs` | High-quality TTS | [Documentation](https://mastra.ai/reference/voice/elevenlabs) |
339
- | PlayAI | `@mastra/voice-playai` | TTS | [Documentation](https://mastra.ai/reference/voice/playai) |
340
- | Google | `@mastra/voice-google` | TTS, STT | [Documentation](https://mastra.ai/reference/voice/google) |
341
- | Deepgram | `@mastra/voice-deepgram` | STT | [Documentation](https://mastra.ai/reference/voice/deepgram) |
342
- | Murf | `@mastra/voice-murf` | TTS | [Documentation](https://mastra.ai/reference/voice/murf) |
343
- | Speechify | `@mastra/voice-speechify` | TTS | [Documentation](https://mastra.ai/reference/voice/speechify) |
344
- | Sarvam | `@mastra/voice-sarvam` | TTS, STT | [Documentation](https://mastra.ai/reference/voice/sarvam) |
345
- | Azure | `@mastra/voice-azure` | TTS, STT | [Documentation](https://mastra.ai/reference/voice/mastra-voice) |
346
- | Cloudflare | `@mastra/voice-cloudflare` | TTS | [Documentation](https://mastra.ai/reference/voice/mastra-voice) |
347
-
348
- ## Next Steps
349
-
350
- - [Voice API Reference](https://mastra.ai/reference/voice/mastra-voice) - Detailed API documentation for voice capabilities
351
- - [Text to Speech Examples](https://github.com/mastra-ai/voice-examples/tree/main/text-to-speech) - Interactive story generator and other TTS implementations
352
- - [Speech to Text Examples](https://github.com/mastra-ai/voice-examples/tree/main/speech-to-text) - Voice memo app and other STT implementations
353
- - [Speech to Speech Examples](https://github.com/mastra-ai/voice-examples/tree/main/speech-to-speech) - Real-time voice conversation with call analysis
361
+ | Provider | Package | Features | Reference |
362
+ | --------------- | ------------------------------- | ----------------------------------------- | ------------------------------------------------------------------ |
363
+ | OpenAI | `@mastra/voice-openai` | TTS, STT | [Documentation](https://mastra.ai/reference/voice/openai) |
364
+ | OpenAI Realtime | `@mastra/voice-openai-realtime` | Realtime speech-to-speech | [Documentation](https://mastra.ai/reference/voice/openai-realtime) |
365
+ | AWS Nova Sonic | `@mastra/voice-aws-nova-sonic` | Realtime speech-to-speech via AWS Bedrock | [Documentation](https://mastra.ai/reference/voice/aws-nova-sonic) |
366
+ | ElevenLabs | `@mastra/voice-elevenlabs` | High-quality TTS | [Documentation](https://mastra.ai/reference/voice/elevenlabs) |
367
+ | PlayAI | `@mastra/voice-playai` | TTS | [Documentation](https://mastra.ai/reference/voice/playai) |
368
+ | Google | `@mastra/voice-google` | TTS, STT | [Documentation](https://mastra.ai/reference/voice/google) |
369
+ | Deepgram | `@mastra/voice-deepgram` | STT | [Documentation](https://mastra.ai/reference/voice/deepgram) |
370
+ | Murf | `@mastra/voice-murf` | TTS | [Documentation](https://mastra.ai/reference/voice/murf) |
371
+ | Speechify | `@mastra/voice-speechify` | TTS | [Documentation](https://mastra.ai/reference/voice/speechify) |
372
+ | Sarvam | `@mastra/voice-sarvam` | TTS, STT | [Documentation](https://mastra.ai/reference/voice/sarvam) |
373
+ | Azure | `@mastra/voice-azure` | TTS, STT | [Documentation](https://mastra.ai/reference/voice/mastra-voice) |
374
+ | Cloudflare | `@mastra/voice-cloudflare` | TTS | [Documentation](https://mastra.ai/reference/voice/mastra-voice) |
375
+
376
+ ## Next steps
377
+
378
+ - [Voice API Reference](https://mastra.ai/reference/voice/mastra-voice): Detailed API documentation for voice capabilities
379
+ - [Text to Speech Examples](https://github.com/mastra-ai/voice-examples/tree/main/text-to-speech): Interactive story generator and other TTS implementations
380
+ - [Speech to Text Examples](https://github.com/mastra-ai/voice-examples/tree/main/speech-to-text): Voice memo app and other STT implementations
381
+ - [Speech to Speech Examples](https://github.com/mastra-ai/voice-examples/tree/main/speech-to-speech): Real-time voice conversation with call analysis