@mastra/voice-inworld 0.0.0 → 0.2.0-alpha.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +12 -0
- package/LICENSE.md +30 -0
- package/dist/docs/SKILL.md +26 -0
- package/dist/docs/assets/SOURCE_MAP.json +6 -0
- package/dist/docs/references/docs-voice-overview.md +1112 -0
- package/dist/docs/references/reference-voice-inworld.md +133 -0
- package/package.json +15 -16
package/CHANGELOG.md
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
# @mastra/voice-inworld
|
|
2
|
+
|
|
3
|
+
## 0.2.0-alpha.0
|
|
4
|
+
|
|
5
|
+
### Minor Changes
|
|
6
|
+
|
|
7
|
+
- Added Inworld AI voice integration with streaming TTS and batch STT. Supports inworld-tts-2 (default), inworld-tts-1.5-max, and inworld-tts-1.5-mini models for text-to-speech, with groq/whisper-large-v3 for speech-to-text. Includes 22+ built-in voices, configurable audio encoding, per-call `deliveryMode` and `language` overrides (deliveryMode honored only by inworld-tts-2), and progressive NDJSON audio streaming with backpressure handling. ([#14945](https://github.com/mastra-ai/mastra/pull/14945))
|
|
8
|
+
|
|
9
|
+
### Patch Changes
|
|
10
|
+
|
|
11
|
+
- Updated dependencies [[`37c0dc5`](https://github.com/mastra-ai/mastra/commit/37c0dc5697d343db98628bf867bf71ce6deec6d7), [`ef6b584`](https://github.com/mastra-ai/mastra/commit/ef6b5847ac33c0a7e80af3a86e8801e2933dd3ee), [`4dd900d`](https://github.com/mastra-ai/mastra/commit/4dd900d75dfe9be89f8c15188b368a8622aa1e18), [`4ff5bdf`](https://github.com/mastra-ai/mastra/commit/4ff5bdfe170cba6dfb5260c6af0f4ba668430772), [`bbcd93c`](https://github.com/mastra-ai/mastra/commit/bbcd93cf7d8aa1007d6d84bfd033b8015c912087), [`308bd07`](https://github.com/mastra-ai/mastra/commit/308bd074f35cef0c75d82fc1eb19382fe04ecf6f)]:
|
|
12
|
+
- @mastra/core@1.33.0-alpha.11
|
package/LICENSE.md
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
Portions of this software are licensed as follows:
|
|
2
|
+
|
|
3
|
+
- All content that resides under any directory named "ee/" within this
|
|
4
|
+
repository, including but not limited to:
|
|
5
|
+
- `packages/core/src/auth/ee/`
|
|
6
|
+
- `packages/server/src/server/auth/ee/`
|
|
7
|
+
is licensed under the license defined in `ee/LICENSE`.
|
|
8
|
+
|
|
9
|
+
- All third-party components incorporated into the Mastra Software are
|
|
10
|
+
licensed under the original license provided by the owner of the
|
|
11
|
+
applicable component.
|
|
12
|
+
|
|
13
|
+
- Content outside of the above-mentioned directories or restrictions is
|
|
14
|
+
available under the "Apache License 2.0" as defined below.
|
|
15
|
+
|
|
16
|
+
# Apache License 2.0
|
|
17
|
+
|
|
18
|
+
Copyright (c) 2025 Kepler Software, Inc.
|
|
19
|
+
|
|
20
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
21
|
+
you may not use this file except in compliance with the License.
|
|
22
|
+
You may obtain a copy of the License at
|
|
23
|
+
|
|
24
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
|
25
|
+
|
|
26
|
+
Unless required by applicable law or agreed to in writing, software
|
|
27
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
28
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
29
|
+
See the License for the specific language governing permissions and
|
|
30
|
+
limitations under the License.
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: mastra-voice-inworld
|
|
3
|
+
description: Documentation for @mastra/voice-inworld. Use when working with @mastra/voice-inworld APIs, configuration, or implementation.
|
|
4
|
+
metadata:
|
|
5
|
+
package: "@mastra/voice-inworld"
|
|
6
|
+
version: "0.2.0-alpha.0"
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## When to use
|
|
10
|
+
|
|
11
|
+
Use this skill whenever you are working with @mastra/voice-inworld and need package-specific knowledge about its APIs, configuration, or implementation.
|
|
12
|
+
|
|
13
|
+
## How to use
|
|
14
|
+
|
|
15
|
+
Read the individual reference documents for detailed explanations and code examples.
|
|
16
|
+
|
|
17
|
+
### Docs
|
|
18
|
+
|
|
19
|
+
- [Voice in Mastra](references/docs-voice-overview.md) - Overview of voice capabilities in Mastra, including text-to-speech, speech-to-text, and real-time speech-to-speech interactions.
|
|
20
|
+
|
|
21
|
+
### Reference
|
|
22
|
+
|
|
23
|
+
- [Reference: Inworld](references/reference-voice-inworld.md) - Documentation for the InworldVoice class, providing streaming text-to-speech and batch speech-to-text capabilities using Inworld AI.
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
Read [assets/SOURCE_MAP.json](assets/SOURCE_MAP.json) for source code references.
|
|
@@ -0,0 +1,1112 @@
|
|
|
1
|
+
# Voice in Mastra
|
|
2
|
+
|
|
3
|
+
Mastra's Voice system provides a unified interface for voice interactions, enabling text-to-speech (TTS), speech-to-text (STT), and real-time speech-to-speech (STS) capabilities in your applications.
|
|
4
|
+
|
|
5
|
+
## Adding voice to agents
|
|
6
|
+
|
|
7
|
+
To learn how to integrate voice capabilities into your agents, check out the [Adding Voice to Agents](https://mastra.ai/docs/agents/adding-voice) documentation. This section covers how to use both single and multiple voice providers, as well as real-time interactions.
|
|
8
|
+
|
|
9
|
+
```typescript
|
|
10
|
+
import { Agent } from '@mastra/core/agent'
|
|
11
|
+
import { OpenAIVoice } from '@mastra/voice-openai'
|
|
12
|
+
|
|
13
|
+
// Initialize OpenAI voice for TTS
|
|
14
|
+
|
|
15
|
+
const voiceAgent = new Agent({
|
|
16
|
+
id: 'voice-agent',
|
|
17
|
+
name: 'Voice Agent',
|
|
18
|
+
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
19
|
+
model: 'openai/gpt-5.4',
|
|
20
|
+
voice: new OpenAIVoice(),
|
|
21
|
+
})
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
You can then use the following voice capabilities:
|
|
25
|
+
|
|
26
|
+
### Text to Speech (TTS)
|
|
27
|
+
|
|
28
|
+
Turn your agent's responses into natural-sounding speech using Mastra's TTS capabilities. Choose from multiple providers like OpenAI, ElevenLabs, and more.
|
|
29
|
+
|
|
30
|
+
For detailed configuration options and advanced features, check out our [Text-to-Speech guide](https://mastra.ai/docs/voice/text-to-speech).
|
|
31
|
+
|
|
32
|
+
**OpenAI**:
|
|
33
|
+
|
|
34
|
+
```typescript
|
|
35
|
+
import { Agent } from '@mastra/core/agent'
|
|
36
|
+
import { OpenAIVoice } from '@mastra/voice-openai'
|
|
37
|
+
import { playAudio } from '@mastra/node-audio'
|
|
38
|
+
|
|
39
|
+
const voiceAgent = new Agent({
|
|
40
|
+
id: 'voice-agent',
|
|
41
|
+
name: 'Voice Agent',
|
|
42
|
+
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
43
|
+
model: 'openai/gpt-5.4',
|
|
44
|
+
voice: new OpenAIVoice(),
|
|
45
|
+
})
|
|
46
|
+
|
|
47
|
+
const { text } = await voiceAgent.generate('What color is the sky?')
|
|
48
|
+
|
|
49
|
+
// Convert text to speech to an Audio Stream
|
|
50
|
+
const audioStream = await voiceAgent.voice.speak(text, {
|
|
51
|
+
speaker: 'default', // Optional: specify a speaker
|
|
52
|
+
responseFormat: 'wav', // Optional: specify a response format
|
|
53
|
+
})
|
|
54
|
+
|
|
55
|
+
playAudio(audioStream)
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
Visit the [OpenAI Voice Reference](https://mastra.ai/reference/voice/openai) for more information on the OpenAI voice provider.
|
|
59
|
+
|
|
60
|
+
**Azure**:
|
|
61
|
+
|
|
62
|
+
```typescript
|
|
63
|
+
import { Agent } from '@mastra/core/agent'
|
|
64
|
+
import { AzureVoice } from '@mastra/voice-azure'
|
|
65
|
+
import { playAudio } from '@mastra/node-audio'
|
|
66
|
+
|
|
67
|
+
const voiceAgent = new Agent({
|
|
68
|
+
id: 'voice-agent',
|
|
69
|
+
name: 'Voice Agent',
|
|
70
|
+
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
71
|
+
model: 'openai/gpt-5.4',
|
|
72
|
+
voice: new AzureVoice(),
|
|
73
|
+
})
|
|
74
|
+
|
|
75
|
+
const { text } = await voiceAgent.generate('What color is the sky?')
|
|
76
|
+
|
|
77
|
+
// Convert text to speech to an Audio Stream
|
|
78
|
+
const audioStream = await voiceAgent.voice.speak(text, {
|
|
79
|
+
speaker: 'en-US-JennyNeural', // Optional: specify a speaker
|
|
80
|
+
})
|
|
81
|
+
|
|
82
|
+
playAudio(audioStream)
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
Visit the [Azure Voice Reference](https://mastra.ai/reference/voice/azure) for more information on the Azure voice provider.
|
|
86
|
+
|
|
87
|
+
**ElevenLabs**:
|
|
88
|
+
|
|
89
|
+
```typescript
|
|
90
|
+
import { Agent } from '@mastra/core/agent'
|
|
91
|
+
import { ElevenLabsVoice } from '@mastra/voice-elevenlabs'
|
|
92
|
+
import { playAudio } from '@mastra/node-audio'
|
|
93
|
+
|
|
94
|
+
const voiceAgent = new Agent({
|
|
95
|
+
id: 'voice-agent',
|
|
96
|
+
name: 'Voice Agent',
|
|
97
|
+
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
98
|
+
model: 'openai/gpt-5.4',
|
|
99
|
+
voice: new ElevenLabsVoice(),
|
|
100
|
+
})
|
|
101
|
+
|
|
102
|
+
const { text } = await voiceAgent.generate('What color is the sky?')
|
|
103
|
+
|
|
104
|
+
// Convert text to speech to an Audio Stream
|
|
105
|
+
const audioStream = await voiceAgent.voice.speak(text, {
|
|
106
|
+
speaker: 'default', // Optional: specify a speaker
|
|
107
|
+
})
|
|
108
|
+
|
|
109
|
+
playAudio(audioStream)
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
Visit the [ElevenLabs Voice Reference](https://mastra.ai/reference/voice/elevenlabs) for more information on the ElevenLabs voice provider.
|
|
113
|
+
|
|
114
|
+
**PlayAI**:
|
|
115
|
+
|
|
116
|
+
```typescript
|
|
117
|
+
import { Agent } from '@mastra/core/agent'
|
|
118
|
+
import { PlayAIVoice } from '@mastra/voice-playai'
|
|
119
|
+
import { playAudio } from '@mastra/node-audio'
|
|
120
|
+
|
|
121
|
+
const voiceAgent = new Agent({
|
|
122
|
+
id: 'voice-agent',
|
|
123
|
+
name: 'Voice Agent',
|
|
124
|
+
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
125
|
+
model: 'openai/gpt-5.4',
|
|
126
|
+
voice: new PlayAIVoice(),
|
|
127
|
+
})
|
|
128
|
+
|
|
129
|
+
const { text } = await voiceAgent.generate('What color is the sky?')
|
|
130
|
+
|
|
131
|
+
// Convert text to speech to an Audio Stream
|
|
132
|
+
const audioStream = await voiceAgent.voice.speak(text, {
|
|
133
|
+
speaker: 'default', // Optional: specify a speaker
|
|
134
|
+
})
|
|
135
|
+
|
|
136
|
+
playAudio(audioStream)
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
Visit the [PlayAI Voice Reference](https://mastra.ai/reference/voice/playai) for more information on the PlayAI voice provider.
|
|
140
|
+
|
|
141
|
+
**Google**:
|
|
142
|
+
|
|
143
|
+
```typescript
|
|
144
|
+
import { Agent } from '@mastra/core/agent'
|
|
145
|
+
import { GoogleVoice } from '@mastra/voice-google'
|
|
146
|
+
import { playAudio } from '@mastra/node-audio'
|
|
147
|
+
|
|
148
|
+
const voiceAgent = new Agent({
|
|
149
|
+
id: 'voice-agent',
|
|
150
|
+
name: 'Voice Agent',
|
|
151
|
+
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
152
|
+
model: 'openai/gpt-5.4',
|
|
153
|
+
voice: new GoogleVoice(),
|
|
154
|
+
})
|
|
155
|
+
|
|
156
|
+
const { text } = await voiceAgent.generate('What color is the sky?')
|
|
157
|
+
|
|
158
|
+
// Convert text to speech to an Audio Stream
|
|
159
|
+
const audioStream = await voiceAgent.voice.speak(text, {
|
|
160
|
+
speaker: 'en-US-Studio-O', // Optional: specify a speaker
|
|
161
|
+
})
|
|
162
|
+
|
|
163
|
+
playAudio(audioStream)
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
Visit the [Google Voice Reference](https://mastra.ai/reference/voice/google) for more information on the Google voice provider.
|
|
167
|
+
|
|
168
|
+
**Cloudflare**:
|
|
169
|
+
|
|
170
|
+
```typescript
|
|
171
|
+
import { Agent } from '@mastra/core/agent'
|
|
172
|
+
import { CloudflareVoice } from '@mastra/voice-cloudflare'
|
|
173
|
+
import { playAudio } from '@mastra/node-audio'
|
|
174
|
+
|
|
175
|
+
const voiceAgent = new Agent({
|
|
176
|
+
id: 'voice-agent',
|
|
177
|
+
name: 'Voice Agent',
|
|
178
|
+
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
179
|
+
model: 'openai/gpt-5.4',
|
|
180
|
+
voice: new CloudflareVoice(),
|
|
181
|
+
})
|
|
182
|
+
|
|
183
|
+
const { text } = await voiceAgent.generate('What color is the sky?')
|
|
184
|
+
|
|
185
|
+
// Convert text to speech to an Audio Stream
|
|
186
|
+
const audioStream = await voiceAgent.voice.speak(text, {
|
|
187
|
+
speaker: 'default', // Optional: specify a speaker
|
|
188
|
+
})
|
|
189
|
+
|
|
190
|
+
playAudio(audioStream)
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
Visit the [Cloudflare Voice Reference](https://mastra.ai/reference/voice/cloudflare) for more information on the Cloudflare voice provider.
|
|
194
|
+
|
|
195
|
+
**Deepgram**:
|
|
196
|
+
|
|
197
|
+
```typescript
|
|
198
|
+
import { Agent } from '@mastra/core/agent'
|
|
199
|
+
import { DeepgramVoice } from '@mastra/voice-deepgram'
|
|
200
|
+
import { playAudio } from '@mastra/node-audio'
|
|
201
|
+
|
|
202
|
+
const voiceAgent = new Agent({
|
|
203
|
+
id: 'voice-agent',
|
|
204
|
+
name: 'Voice Agent',
|
|
205
|
+
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
206
|
+
model: 'openai/gpt-5.4',
|
|
207
|
+
voice: new DeepgramVoice(),
|
|
208
|
+
})
|
|
209
|
+
|
|
210
|
+
const { text } = await voiceAgent.generate('What color is the sky?')
|
|
211
|
+
|
|
212
|
+
// Convert text to speech to an Audio Stream
|
|
213
|
+
const audioStream = await voiceAgent.voice.speak(text, {
|
|
214
|
+
speaker: 'aura-english-us', // Optional: specify a speaker
|
|
215
|
+
})
|
|
216
|
+
|
|
217
|
+
playAudio(audioStream)
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
Visit the [Deepgram Voice Reference](https://mastra.ai/reference/voice/deepgram) for more information on the Deepgram voice provider.
|
|
221
|
+
|
|
222
|
+
**Inworld**:
|
|
223
|
+
|
|
224
|
+
```typescript
|
|
225
|
+
import { Agent } from '@mastra/core/agent'
|
|
226
|
+
import { InworldVoice } from '@mastra/voice-inworld'
|
|
227
|
+
import { playAudio } from '@mastra/node-audio'
|
|
228
|
+
|
|
229
|
+
const voiceAgent = new Agent({
|
|
230
|
+
id: 'voice-agent',
|
|
231
|
+
name: 'Voice Agent',
|
|
232
|
+
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
233
|
+
model: 'openai/gpt-5.4',
|
|
234
|
+
voice: new InworldVoice(),
|
|
235
|
+
})
|
|
236
|
+
|
|
237
|
+
const { text } = await voiceAgent.generate('What color is the sky?')
|
|
238
|
+
|
|
239
|
+
// Convert text to speech to an Audio Stream
|
|
240
|
+
const audioStream = await voiceAgent.voice.speak(text, {
|
|
241
|
+
speaker: 'Dennis', // Optional: specify a speaker
|
|
242
|
+
})
|
|
243
|
+
|
|
244
|
+
playAudio(audioStream)
|
|
245
|
+
```
|
|
246
|
+
|
|
247
|
+
Visit the [Inworld Voice Reference](https://mastra.ai/reference/voice/inworld) for more information on the Inworld voice provider.
|
|
248
|
+
|
|
249
|
+
**Speechify**:
|
|
250
|
+
|
|
251
|
+
```typescript
|
|
252
|
+
import { Agent } from '@mastra/core/agent'
|
|
253
|
+
import { SpeechifyVoice } from '@mastra/voice-speechify'
|
|
254
|
+
import { playAudio } from '@mastra/node-audio'
|
|
255
|
+
|
|
256
|
+
const voiceAgent = new Agent({
|
|
257
|
+
id: 'voice-agent',
|
|
258
|
+
name: 'Voice Agent',
|
|
259
|
+
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
260
|
+
model: 'openai/gpt-5.4',
|
|
261
|
+
voice: new SpeechifyVoice(),
|
|
262
|
+
})
|
|
263
|
+
|
|
264
|
+
const { text } = await voiceAgent.generate('What color is the sky?')
|
|
265
|
+
|
|
266
|
+
// Convert text to speech to an Audio Stream
|
|
267
|
+
const audioStream = await voiceAgent.voice.speak(text, {
|
|
268
|
+
speaker: 'matthew', // Optional: specify a speaker
|
|
269
|
+
})
|
|
270
|
+
|
|
271
|
+
playAudio(audioStream)
|
|
272
|
+
```
|
|
273
|
+
|
|
274
|
+
Visit the [Speechify Voice Reference](https://mastra.ai/reference/voice/speechify) for more information on the Speechify voice provider.
|
|
275
|
+
|
|
276
|
+
**Sarvam**:
|
|
277
|
+
|
|
278
|
+
```typescript
|
|
279
|
+
import { Agent } from '@mastra/core/agent'
|
|
280
|
+
import { SarvamVoice } from '@mastra/voice-sarvam'
|
|
281
|
+
import { playAudio } from '@mastra/node-audio'
|
|
282
|
+
|
|
283
|
+
const voiceAgent = new Agent({
|
|
284
|
+
id: 'voice-agent',
|
|
285
|
+
name: 'Voice Agent',
|
|
286
|
+
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
287
|
+
model: 'openai/gpt-5.4',
|
|
288
|
+
voice: new SarvamVoice(),
|
|
289
|
+
})
|
|
290
|
+
|
|
291
|
+
const { text } = await voiceAgent.generate('What color is the sky?')
|
|
292
|
+
|
|
293
|
+
// Convert text to speech to an Audio Stream
|
|
294
|
+
const audioStream = await voiceAgent.voice.speak(text, {
|
|
295
|
+
speaker: 'shubh', // Optional: specify a bulbul:v3 speaker
|
|
296
|
+
})
|
|
297
|
+
|
|
298
|
+
playAudio(audioStream)
|
|
299
|
+
```
|
|
300
|
+
|
|
301
|
+
Visit the [Sarvam Voice Reference](https://mastra.ai/reference/voice/sarvam) for more information on the Sarvam voice provider.
|
|
302
|
+
|
|
303
|
+
**Murf**:
|
|
304
|
+
|
|
305
|
+
```typescript
|
|
306
|
+
import { Agent } from '@mastra/core/agent'
|
|
307
|
+
import { MurfVoice } from '@mastra/voice-murf'
|
|
308
|
+
import { playAudio } from '@mastra/node-audio'
|
|
309
|
+
|
|
310
|
+
const voiceAgent = new Agent({
|
|
311
|
+
id: 'voice-agent',
|
|
312
|
+
name: 'Voice Agent',
|
|
313
|
+
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
314
|
+
model: 'openai/gpt-5.4',
|
|
315
|
+
voice: new MurfVoice(),
|
|
316
|
+
})
|
|
317
|
+
|
|
318
|
+
const { text } = await voiceAgent.generate('What color is the sky?')
|
|
319
|
+
|
|
320
|
+
// Convert text to speech to an Audio Stream
|
|
321
|
+
const audioStream = await voiceAgent.voice.speak(text, {
|
|
322
|
+
speaker: 'default', // Optional: specify a speaker
|
|
323
|
+
})
|
|
324
|
+
|
|
325
|
+
playAudio(audioStream)
|
|
326
|
+
```
|
|
327
|
+
|
|
328
|
+
Visit the [Murf Voice Reference](https://mastra.ai/reference/voice/murf) for more information on the Murf voice provider.
|
|
329
|
+
|
|
330
|
+
### Speech to Text (STT)
|
|
331
|
+
|
|
332
|
+
Transcribe spoken content using various providers like OpenAI, ElevenLabs, and more. For detailed configuration options and more, check out [Speech to Text](https://mastra.ai/docs/voice/speech-to-text).
|
|
333
|
+
|
|
334
|
+
You can download a sample audio file from [here](https://github.com/mastra-ai/realtime-voice-demo/raw/refs/heads/main/how_can_i_help_you.mp3).
|
|
335
|
+
|
|
336
|
+
[Sample audio: how_can_i_help_you.mp3](https://github.com/mastra-ai/realtime-voice-demo/raw/refs/heads/main/how_can_i_help_you.mp3)
|
|
337
|
+
|
|
338
|
+
**OpenAI**:
|
|
339
|
+
|
|
340
|
+
```typescript
|
|
341
|
+
import { Agent } from '@mastra/core/agent'
|
|
342
|
+
import { OpenAIVoice } from '@mastra/voice-openai'
|
|
343
|
+
import { createReadStream } from 'fs'
|
|
344
|
+
|
|
345
|
+
const voiceAgent = new Agent({
|
|
346
|
+
id: 'voice-agent',
|
|
347
|
+
name: 'Voice Agent',
|
|
348
|
+
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
349
|
+
model: 'openai/gpt-5.4',
|
|
350
|
+
voice: new OpenAIVoice(),
|
|
351
|
+
})
|
|
352
|
+
|
|
353
|
+
// Open the downloaded sample audio file from the local filesystem
|
|
354
|
+
const audioStream = await createReadStream('./how_can_i_help_you.mp3')
|
|
355
|
+
|
|
356
|
+
// Convert audio to text
|
|
357
|
+
const transcript = await voiceAgent.voice.listen(audioStream)
|
|
358
|
+
console.log(`User said: ${transcript}`)
|
|
359
|
+
|
|
360
|
+
// Generate a response based on the transcript
|
|
361
|
+
const { text } = await voiceAgent.generate(transcript)
|
|
362
|
+
```
|
|
363
|
+
|
|
364
|
+
Visit the [OpenAI Voice Reference](https://mastra.ai/reference/voice/openai) for more information on the OpenAI voice provider.
|
|
365
|
+
|
|
366
|
+
**Azure**:
|
|
367
|
+
|
|
368
|
+
```typescript
|
|
369
|
+
import { createReadStream } from 'fs'
|
|
370
|
+
import { Agent } from '@mastra/core/agent'
|
|
371
|
+
import { AzureVoice } from '@mastra/voice-azure'
|
|
372
|
+
import { createReadStream } from 'fs'
|
|
373
|
+
|
|
374
|
+
const voiceAgent = new Agent({
|
|
375
|
+
id: 'voice-agent',
|
|
376
|
+
name: 'Voice Agent',
|
|
377
|
+
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
378
|
+
model: 'openai/gpt-5.4',
|
|
379
|
+
voice: new AzureVoice(),
|
|
380
|
+
})
|
|
381
|
+
|
|
382
|
+
// Open the downloaded sample audio file from the local filesystem
|
|
383
|
+
const audioStream = await createReadStream('./how_can_i_help_you.mp3')
|
|
384
|
+
|
|
385
|
+
// Convert audio to text
|
|
386
|
+
const transcript = await voiceAgent.voice.listen(audioStream)
|
|
387
|
+
console.log(`User said: ${transcript}`)
|
|
388
|
+
|
|
389
|
+
// Generate a response based on the transcript
|
|
390
|
+
const { text } = await voiceAgent.generate(transcript)
|
|
391
|
+
```
|
|
392
|
+
|
|
393
|
+
Visit the [Azure Voice Reference](https://mastra.ai/reference/voice/azure) for more information on the Azure voice provider.
|
|
394
|
+
|
|
395
|
+
**ElevenLabs**:
|
|
396
|
+
|
|
397
|
+
```typescript
|
|
398
|
+
import { Agent } from '@mastra/core/agent'
|
|
399
|
+
import { ElevenLabsVoice } from '@mastra/voice-elevenlabs'
|
|
400
|
+
import { createReadStream } from 'fs'
|
|
401
|
+
|
|
402
|
+
const voiceAgent = new Agent({
|
|
403
|
+
id: 'voice-agent',
|
|
404
|
+
name: 'Voice Agent',
|
|
405
|
+
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
406
|
+
model: 'openai/gpt-5.4',
|
|
407
|
+
voice: new ElevenLabsVoice(),
|
|
408
|
+
})
|
|
409
|
+
|
|
410
|
+
// Open the downloaded sample audio file from the local filesystem
|
|
411
|
+
const audioStream = await createReadStream('./how_can_i_help_you.mp3')
|
|
412
|
+
|
|
413
|
+
// Convert audio to text
|
|
414
|
+
const transcript = await voiceAgent.voice.listen(audioStream)
|
|
415
|
+
console.log(`User said: ${transcript}`)
|
|
416
|
+
|
|
417
|
+
// Generate a response based on the transcript
|
|
418
|
+
const { text } = await voiceAgent.generate(transcript)
|
|
419
|
+
```
|
|
420
|
+
|
|
421
|
+
Visit the [ElevenLabs Voice Reference](https://mastra.ai/reference/voice/elevenlabs) for more information on the ElevenLabs voice provider.
|
|
422
|
+
|
|
423
|
+
**Google**:
|
|
424
|
+
|
|
425
|
+
```typescript
|
|
426
|
+
import { Agent } from '@mastra/core/agent'
|
|
427
|
+
import { GoogleVoice } from '@mastra/voice-google'
|
|
428
|
+
import { createReadStream } from 'fs'
|
|
429
|
+
|
|
430
|
+
const voiceAgent = new Agent({
|
|
431
|
+
id: 'voice-agent',
|
|
432
|
+
name: 'Voice Agent',
|
|
433
|
+
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
434
|
+
model: 'openai/gpt-5.4',
|
|
435
|
+
voice: new GoogleVoice(),
|
|
436
|
+
})
|
|
437
|
+
|
|
438
|
+
// Open the downloaded sample audio file from the local filesystem
|
|
439
|
+
const audioStream = await createReadStream('./how_can_i_help_you.mp3')
|
|
440
|
+
|
|
441
|
+
// Convert audio to text
|
|
442
|
+
const transcript = await voiceAgent.voice.listen(audioStream)
|
|
443
|
+
console.log(`User said: ${transcript}`)
|
|
444
|
+
|
|
445
|
+
// Generate a response based on the transcript
|
|
446
|
+
const { text } = await voiceAgent.generate(transcript)
|
|
447
|
+
```
|
|
448
|
+
|
|
449
|
+
Visit the [Google Voice Reference](https://mastra.ai/reference/voice/google) for more information on the Google voice provider.
|
|
450
|
+
|
|
451
|
+
**Cloudflare**:
|
|
452
|
+
|
|
453
|
+
```typescript
|
|
454
|
+
import { Agent } from '@mastra/core/agent'
|
|
455
|
+
import { CloudflareVoice } from '@mastra/voice-cloudflare'
|
|
456
|
+
import { createReadStream } from 'fs'
|
|
457
|
+
|
|
458
|
+
const voiceAgent = new Agent({
|
|
459
|
+
id: 'voice-agent',
|
|
460
|
+
name: 'Voice Agent',
|
|
461
|
+
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
462
|
+
model: 'openai/gpt-5.4',
|
|
463
|
+
voice: new CloudflareVoice(),
|
|
464
|
+
})
|
|
465
|
+
|
|
466
|
+
// Open the downloaded sample audio file from the local filesystem
|
|
467
|
+
const audioStream = await createReadStream('./how_can_i_help_you.mp3')
|
|
468
|
+
|
|
469
|
+
// Convert audio to text
|
|
470
|
+
const transcript = await voiceAgent.voice.listen(audioStream)
|
|
471
|
+
console.log(`User said: ${transcript}`)
|
|
472
|
+
|
|
473
|
+
// Generate a response based on the transcript
|
|
474
|
+
const { text } = await voiceAgent.generate(transcript)
|
|
475
|
+
```
|
|
476
|
+
|
|
477
|
+
Visit the [Cloudflare Voice Reference](https://mastra.ai/reference/voice/cloudflare) for more information on the Cloudflare voice provider.
|
|
478
|
+
|
|
479
|
+
**Deepgram**:
|
|
480
|
+
|
|
481
|
+
```typescript
|
|
482
|
+
import { Agent } from '@mastra/core/agent'
|
|
483
|
+
import { DeepgramVoice } from '@mastra/voice-deepgram'
|
|
484
|
+
import { createReadStream } from 'fs'
|
|
485
|
+
|
|
486
|
+
const voiceAgent = new Agent({
|
|
487
|
+
id: 'voice-agent',
|
|
488
|
+
name: 'Voice Agent',
|
|
489
|
+
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
490
|
+
model: 'openai/gpt-5.4',
|
|
491
|
+
voice: new DeepgramVoice(),
|
|
492
|
+
})
|
|
493
|
+
|
|
494
|
+
// Open the downloaded sample audio file from the local filesystem
|
|
495
|
+
const audioStream = await createReadStream('./how_can_i_help_you.mp3')
|
|
496
|
+
|
|
497
|
+
// Convert audio to text
|
|
498
|
+
const transcript = await voiceAgent.voice.listen(audioStream)
|
|
499
|
+
console.log(`User said: ${transcript}`)
|
|
500
|
+
|
|
501
|
+
// Generate a response based on the transcript
|
|
502
|
+
const { text } = await voiceAgent.generate(transcript)
|
|
503
|
+
```
|
|
504
|
+
|
|
505
|
+
Visit the [Deepgram Voice Reference](https://mastra.ai/reference/voice/deepgram) for more information on the Deepgram voice provider.
|
|
506
|
+
|
|
507
|
+
**Inworld**:
|
|
508
|
+
|
|
509
|
+
```typescript
|
|
510
|
+
import { Agent } from '@mastra/core/agent'
|
|
511
|
+
import { InworldVoice } from '@mastra/voice-inworld'
|
|
512
|
+
import { createReadStream } from 'fs'
|
|
513
|
+
|
|
514
|
+
const voiceAgent = new Agent({
|
|
515
|
+
id: 'voice-agent',
|
|
516
|
+
name: 'Voice Agent',
|
|
517
|
+
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
518
|
+
model: 'openai/gpt-5.4',
|
|
519
|
+
voice: new InworldVoice(),
|
|
520
|
+
})
|
|
521
|
+
|
|
522
|
+
// Open the downloaded sample audio file from the local filesystem
|
|
523
|
+
const audioStream = await createReadStream('./how_can_i_help_you.mp3')
|
|
524
|
+
|
|
525
|
+
// Convert audio to text
|
|
526
|
+
const transcript = await voiceAgent.voice.listen(audioStream)
|
|
527
|
+
console.log(`User said: ${transcript}`)
|
|
528
|
+
|
|
529
|
+
// Generate a response based on the transcript
|
|
530
|
+
const { text } = await voiceAgent.generate(transcript)
|
|
531
|
+
```
|
|
532
|
+
|
|
533
|
+
Visit the [Inworld Voice Reference](https://mastra.ai/reference/voice/inworld) for more information on the Inworld voice provider.
|
|
534
|
+
|
|
535
|
+
**Sarvam**:
|
|
536
|
+
|
|
537
|
+
```typescript
|
|
538
|
+
import { Agent } from '@mastra/core/agent'
|
|
539
|
+
import { SarvamVoice } from '@mastra/voice-sarvam'
|
|
540
|
+
import { createReadStream } from 'fs'
|
|
541
|
+
|
|
542
|
+
const voiceAgent = new Agent({
|
|
543
|
+
id: 'voice-agent',
|
|
544
|
+
name: 'Voice Agent',
|
|
545
|
+
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
546
|
+
model: 'openai/gpt-5.4',
|
|
547
|
+
voice: new SarvamVoice(),
|
|
548
|
+
})
|
|
549
|
+
|
|
550
|
+
// Open the downloaded sample audio file from the local filesystem
|
|
551
|
+
const audioStream = await createReadStream('./how_can_i_help_you.mp3')
|
|
552
|
+
|
|
553
|
+
// Convert audio to text
|
|
554
|
+
const transcript = await voiceAgent.voice.listen(audioStream)
|
|
555
|
+
console.log(`User said: ${transcript}`)
|
|
556
|
+
|
|
557
|
+
// Generate a response based on the transcript
|
|
558
|
+
const { text } = await voiceAgent.generate(transcript)
|
|
559
|
+
```
|
|
560
|
+
|
|
561
|
+
Visit the [Sarvam Voice Reference](https://mastra.ai/reference/voice/sarvam) for more information on the Sarvam voice provider.
|
|
562
|
+
|
|
563
|
+
### Speech to Speech (STS)
|
|
564
|
+
|
|
565
|
+
Create conversational experiences with speech-to-speech capabilities. The unified API enables real-time voice interactions between users and AI agents. For detailed configuration options and advanced features, check out [Speech to Speech](https://mastra.ai/docs/voice/speech-to-speech).
|
|
566
|
+
|
|
567
|
+
**OpenAI**:
|
|
568
|
+
|
|
569
|
+
```typescript
|
|
570
|
+
import { Agent } from '@mastra/core/agent'
|
|
571
|
+
import { playAudio, getMicrophoneStream } from '@mastra/node-audio'
|
|
572
|
+
import { OpenAIRealtimeVoice } from '@mastra/voice-openai-realtime'
|
|
573
|
+
|
|
574
|
+
const voiceAgent = new Agent({
|
|
575
|
+
id: 'voice-agent',
|
|
576
|
+
name: 'Voice Agent',
|
|
577
|
+
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
578
|
+
model: 'openai/gpt-5.4',
|
|
579
|
+
voice: new OpenAIRealtimeVoice(),
|
|
580
|
+
})
|
|
581
|
+
|
|
582
|
+
// Listen for agent audio responses
|
|
583
|
+
voiceAgent.voice.on('speaker', ({ audio }) => {
|
|
584
|
+
playAudio(audio)
|
|
585
|
+
})
|
|
586
|
+
|
|
587
|
+
// Initiate the conversation
|
|
588
|
+
await voiceAgent.voice.speak('How can I help you today?')
|
|
589
|
+
|
|
590
|
+
// Send continuous audio from the microphone
|
|
591
|
+
const micStream = getMicrophoneStream()
|
|
592
|
+
await voiceAgent.voice.send(micStream)
|
|
593
|
+
```
|
|
594
|
+
|
|
595
|
+
Visit the [OpenAI Realtime Voice Reference](https://mastra.ai/reference/voice/openai-realtime) for more information on the OpenAI Realtime voice provider.
|
|
596
|
+
|
|
597
|
+
**Google**:
|
|
598
|
+
|
|
599
|
+
```typescript
|
|
600
|
+
import { Agent } from '@mastra/core/agent'
|
|
601
|
+
import { playAudio, getMicrophoneStream } from '@mastra/node-audio'
|
|
602
|
+
import { GeminiLiveVoice } from '@mastra/voice-google-gemini-live'
|
|
603
|
+
|
|
604
|
+
const voiceAgent = new Agent({
|
|
605
|
+
id: 'voice-agent',
|
|
606
|
+
name: 'Voice Agent',
|
|
607
|
+
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
608
|
+
model: 'openai/gpt-5.4',
|
|
609
|
+
voice: new GeminiLiveVoice({
|
|
610
|
+
// Live API mode
|
|
611
|
+
apiKey: process.env.GOOGLE_API_KEY,
|
|
612
|
+
model: 'gemini-2.0-flash-exp',
|
|
613
|
+
speaker: 'Puck',
|
|
614
|
+
debug: true,
|
|
615
|
+
// Vertex AI alternative:
|
|
616
|
+
// vertexAI: true,
|
|
617
|
+
// project: 'your-gcp-project',
|
|
618
|
+
// location: 'us-central1',
|
|
619
|
+
// serviceAccountKeyFile: '/path/to/service-account.json',
|
|
620
|
+
}),
|
|
621
|
+
})
|
|
622
|
+
|
|
623
|
+
// Connect before using speak/send
|
|
624
|
+
await voiceAgent.voice.connect()
|
|
625
|
+
|
|
626
|
+
// Listen for agent audio responses
|
|
627
|
+
voiceAgent.voice.on('speaker', ({ audio }) => {
|
|
628
|
+
playAudio(audio)
|
|
629
|
+
})
|
|
630
|
+
|
|
631
|
+
// Listen for text responses and transcriptions
|
|
632
|
+
voiceAgent.voice.on('writing', ({ text, role }) => {
|
|
633
|
+
console.log(`${role}: ${text}`)
|
|
634
|
+
})
|
|
635
|
+
|
|
636
|
+
// Initiate the conversation
|
|
637
|
+
await voiceAgent.voice.speak('How can I help you today?')
|
|
638
|
+
|
|
639
|
+
// Send continuous audio from the microphone
|
|
640
|
+
const micStream = getMicrophoneStream()
|
|
641
|
+
await voiceAgent.voice.send(micStream)
|
|
642
|
+
```
|
|
643
|
+
|
|
644
|
+
Visit the [Google Gemini Live Reference](https://mastra.ai/reference/voice/google-gemini-live) for more information on the Google Gemini Live voice provider.
|
|
645
|
+
|
|
646
|
+
**AWS Nova Sonic**:
|
|
647
|
+
|
|
648
|
+
```typescript
|
|
649
|
+
import { Agent } from '@mastra/core/agent'
|
|
650
|
+
import { playAudio, getMicrophoneStream } from '@mastra/node-audio'
|
|
651
|
+
import { NovaSonicVoice } from '@mastra/voice-aws-nova-sonic'
|
|
652
|
+
|
|
653
|
+
const voiceAgent = new Agent({
|
|
654
|
+
id: 'voice-agent',
|
|
655
|
+
name: 'Voice Agent',
|
|
656
|
+
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
657
|
+
model: 'openai/gpt-5.4',
|
|
658
|
+
voice: new NovaSonicVoice({
|
|
659
|
+
region: 'us-east-1',
|
|
660
|
+
speaker: 'matthew',
|
|
661
|
+
// Static credentials are optional. The default AWS credential
|
|
662
|
+
// provider chain is used when none are passed.
|
|
663
|
+
}),
|
|
664
|
+
})
|
|
665
|
+
|
|
666
|
+
// Connect before using speak/send
|
|
667
|
+
await voiceAgent.voice.connect()
|
|
668
|
+
|
|
669
|
+
// Listen for assistant audio (Int16Array PCM)
|
|
670
|
+
voiceAgent.voice.on('speaking', ({ audioData }) => {
|
|
671
|
+
if (audioData) playAudio(audioData)
|
|
672
|
+
})
|
|
673
|
+
|
|
674
|
+
// Listen for transcribed text
|
|
675
|
+
voiceAgent.voice.on('writing', ({ text, role }) => {
|
|
676
|
+
console.log(`${role}: ${text}`)
|
|
677
|
+
})
|
|
678
|
+
|
|
679
|
+
// Initiate the conversation
|
|
680
|
+
await voiceAgent.voice.speak('How can I help you today?')
|
|
681
|
+
|
|
682
|
+
// Send continuous audio from the microphone
|
|
683
|
+
const micStream = getMicrophoneStream()
|
|
684
|
+
await voiceAgent.voice.send(micStream)
|
|
685
|
+
```
|
|
686
|
+
|
|
687
|
+
Visit the [AWS Nova Sonic Reference](https://mastra.ai/reference/voice/aws-nova-sonic) for more information on the AWS Nova Sonic voice provider.
|
|
688
|
+
|
|
689
|
+
## Voice configuration
|
|
690
|
+
|
|
691
|
+
Each voice provider can be configured with different models and options. Below are the detailed configuration options for all supported providers:
|
|
692
|
+
|
|
693
|
+
**OpenAI**:
|
|
694
|
+
|
|
695
|
+
```typescript
|
|
696
|
+
// OpenAI Voice Configuration
|
|
697
|
+
const voice = new OpenAIVoice({
|
|
698
|
+
speechModel: {
|
|
699
|
+
name: 'tts-1', // Example TTS model name
|
|
700
|
+
apiKey: process.env.OPENAI_API_KEY,
|
|
701
|
+
language: 'en-US', // Language code
|
|
702
|
+
voiceType: 'neural', // Type of voice model
|
|
703
|
+
},
|
|
704
|
+
listeningModel: {
|
|
705
|
+
name: 'whisper-1', // Example model name
|
|
706
|
+
apiKey: process.env.OPENAI_API_KEY,
|
|
707
|
+
language: 'en-US', // Language code
|
|
708
|
+
format: 'wav', // Audio format
|
|
709
|
+
},
|
|
710
|
+
speaker: 'alloy', // Example speaker name
|
|
711
|
+
})
|
|
712
|
+
```
|
|
713
|
+
|
|
714
|
+
Visit the [OpenAI Voice Reference](https://mastra.ai/reference/voice/openai) for more information on the OpenAI voice provider.
|
|
715
|
+
|
|
716
|
+
**Azure**:
|
|
717
|
+
|
|
718
|
+
```typescript
|
|
719
|
+
// Azure Voice Configuration
|
|
720
|
+
const voice = new AzureVoice({
|
|
721
|
+
speechModel: {
|
|
722
|
+
name: 'en-US-JennyNeural', // Example model name
|
|
723
|
+
apiKey: process.env.AZURE_SPEECH_KEY,
|
|
724
|
+
region: process.env.AZURE_SPEECH_REGION,
|
|
725
|
+
language: 'en-US', // Language code
|
|
726
|
+
style: 'cheerful', // Voice style
|
|
727
|
+
pitch: '+0Hz', // Pitch adjustment
|
|
728
|
+
rate: '1.0', // Speech rate
|
|
729
|
+
},
|
|
730
|
+
listeningModel: {
|
|
731
|
+
name: 'en-US', // Example model name
|
|
732
|
+
apiKey: process.env.AZURE_SPEECH_KEY,
|
|
733
|
+
region: process.env.AZURE_SPEECH_REGION,
|
|
734
|
+
format: 'simple', // Output format
|
|
735
|
+
},
|
|
736
|
+
})
|
|
737
|
+
```
|
|
738
|
+
|
|
739
|
+
Visit the [Azure Voice Reference](https://mastra.ai/reference/voice/azure) for more information on the Azure voice provider.
|
|
740
|
+
|
|
741
|
+
**ElevenLabs**:
|
|
742
|
+
|
|
743
|
+
```typescript
|
|
744
|
+
// ElevenLabs Voice Configuration
|
|
745
|
+
const voice = new ElevenLabsVoice({
|
|
746
|
+
speechModel: {
|
|
747
|
+
voiceId: 'your-voice-id', // Example voice ID
|
|
748
|
+
model: 'eleven_multilingual_v2', // Example model name
|
|
749
|
+
apiKey: process.env.ELEVENLABS_API_KEY,
|
|
750
|
+
language: 'en', // Language code
|
|
751
|
+
emotion: 'neutral', // Emotion setting
|
|
752
|
+
},
|
|
753
|
+
// ElevenLabs may not have a separate listening model
|
|
754
|
+
})
|
|
755
|
+
```
|
|
756
|
+
|
|
757
|
+
Visit the [ElevenLabs Voice Reference](https://mastra.ai/reference/voice/elevenlabs) for more information on the ElevenLabs voice provider.
|
|
758
|
+
|
|
759
|
+
**PlayAI**:
|
|
760
|
+
|
|
761
|
+
```typescript
|
|
762
|
+
// PlayAI Voice Configuration
|
|
763
|
+
const voice = new PlayAIVoice({
|
|
764
|
+
speechModel: {
|
|
765
|
+
name: 'playai-voice', // Example model name
|
|
766
|
+
speaker: 'emma', // Example speaker name
|
|
767
|
+
apiKey: process.env.PLAYAI_API_KEY,
|
|
768
|
+
language: 'en-US', // Language code
|
|
769
|
+
speed: 1.0, // Speech speed
|
|
770
|
+
},
|
|
771
|
+
// PlayAI may not have a separate listening model
|
|
772
|
+
})
|
|
773
|
+
```
|
|
774
|
+
|
|
775
|
+
Visit the [PlayAI Voice Reference](https://mastra.ai/reference/voice/playai) for more information on the PlayAI voice provider.
|
|
776
|
+
|
|
777
|
+
**Google**:
|
|
778
|
+
|
|
779
|
+
```typescript
|
|
780
|
+
// Google Voice Configuration
|
|
781
|
+
const voice = new GoogleVoice({
|
|
782
|
+
speechModel: {
|
|
783
|
+
name: 'en-US-Studio-O', // Example model name
|
|
784
|
+
apiKey: process.env.GOOGLE_API_KEY,
|
|
785
|
+
languageCode: 'en-US', // Language code
|
|
786
|
+
gender: 'FEMALE', // Voice gender
|
|
787
|
+
speakingRate: 1.0, // Speaking rate
|
|
788
|
+
},
|
|
789
|
+
listeningModel: {
|
|
790
|
+
name: 'en-US', // Example model name
|
|
791
|
+
sampleRateHertz: 16000, // Sample rate
|
|
792
|
+
},
|
|
793
|
+
})
|
|
794
|
+
```
|
|
795
|
+
|
|
796
|
+
Visit the [Google Voice Reference](https://mastra.ai/reference/voice/google) for more information on the Google voice provider.
|
|
797
|
+
|
|
798
|
+
**Cloudflare**:
|
|
799
|
+
|
|
800
|
+
```typescript
|
|
801
|
+
// Cloudflare Voice Configuration
|
|
802
|
+
const voice = new CloudflareVoice({
|
|
803
|
+
speechModel: {
|
|
804
|
+
name: 'cloudflare-voice', // Example model name
|
|
805
|
+
accountId: process.env.CLOUDFLARE_ACCOUNT_ID,
|
|
806
|
+
apiToken: process.env.CLOUDFLARE_API_TOKEN,
|
|
807
|
+
language: 'en-US', // Language code
|
|
808
|
+
format: 'mp3', // Audio format
|
|
809
|
+
},
|
|
810
|
+
// Cloudflare may not have a separate listening model
|
|
811
|
+
})
|
|
812
|
+
```
|
|
813
|
+
|
|
814
|
+
Visit the [Cloudflare Voice Reference](https://mastra.ai/reference/voice/cloudflare) for more information on the Cloudflare voice provider.
|
|
815
|
+
|
|
816
|
+
**Deepgram**:
|
|
817
|
+
|
|
818
|
+
```typescript
|
|
819
|
+
// Deepgram Voice Configuration
|
|
820
|
+
const voice = new DeepgramVoice({
|
|
821
|
+
speechModel: {
|
|
822
|
+
name: 'nova-2', // Example model name
|
|
823
|
+
speaker: 'aura-english-us', // Example speaker name
|
|
824
|
+
apiKey: process.env.DEEPGRAM_API_KEY,
|
|
825
|
+
language: 'en-US', // Language code
|
|
826
|
+
tone: 'formal', // Tone setting
|
|
827
|
+
},
|
|
828
|
+
listeningModel: {
|
|
829
|
+
name: 'nova-2', // Example model name
|
|
830
|
+
format: 'flac', // Audio format
|
|
831
|
+
},
|
|
832
|
+
})
|
|
833
|
+
```
|
|
834
|
+
|
|
835
|
+
Visit the [Deepgram Voice Reference](https://mastra.ai/reference/voice/deepgram) for more information on the Deepgram voice provider.
|
|
836
|
+
|
|
837
|
+
**Inworld**:
|
|
838
|
+
|
|
839
|
+
```typescript
|
|
840
|
+
// Inworld Voice Configuration
|
|
841
|
+
const voice = new InworldVoice({
|
|
842
|
+
speechModel: {
|
|
843
|
+
name: 'inworld-tts-2',
|
|
844
|
+
apiKey: process.env.INWORLD_API_KEY,
|
|
845
|
+
},
|
|
846
|
+
listeningModel: {
|
|
847
|
+
name: 'groq/whisper-large-v3',
|
|
848
|
+
apiKey: process.env.INWORLD_API_KEY,
|
|
849
|
+
},
|
|
850
|
+
speaker: 'Dennis',
|
|
851
|
+
audioEncoding: 'MP3',
|
|
852
|
+
sampleRateHertz: 48000,
|
|
853
|
+
language: 'en-US',
|
|
854
|
+
})
|
|
855
|
+
|
|
856
|
+
// Per-call options: `deliveryMode` is honored only by `inworld-tts-2`.
|
|
857
|
+
const audioStream = await voice.speak('Hello!', {
|
|
858
|
+
deliveryMode: 'BALANCED', // 'STABLE' | 'BALANCED' | 'CREATIVE'
|
|
859
|
+
language: 'en-US', // BCP-47 per-call override
|
|
860
|
+
})
|
|
861
|
+
```
|
|
862
|
+
|
|
863
|
+
Visit the [Inworld Voice Reference](https://mastra.ai/reference/voice/inworld) for more information on the Inworld voice provider.
|
|
864
|
+
|
|
865
|
+
**Speechify**:
|
|
866
|
+
|
|
867
|
+
```typescript
|
|
868
|
+
// Speechify Voice Configuration
|
|
869
|
+
const voice = new SpeechifyVoice({
|
|
870
|
+
speechModel: {
|
|
871
|
+
name: 'speechify-voice', // Example model name
|
|
872
|
+
speaker: 'matthew', // Example speaker name
|
|
873
|
+
apiKey: process.env.SPEECHIFY_API_KEY,
|
|
874
|
+
language: 'en-US', // Language code
|
|
875
|
+
speed: 1.0, // Speech speed
|
|
876
|
+
},
|
|
877
|
+
// Speechify may not have a separate listening model
|
|
878
|
+
})
|
|
879
|
+
```
|
|
880
|
+
|
|
881
|
+
Visit the [Speechify Voice Reference](https://mastra.ai/reference/voice/speechify) for more information on the Speechify voice provider.
|
|
882
|
+
|
|
883
|
+
**Sarvam**:
|
|
884
|
+
|
|
885
|
+
```typescript
|
|
886
|
+
// Sarvam Voice Configuration
|
|
887
|
+
const voice = new SarvamVoice({
|
|
888
|
+
speechModel: {
|
|
889
|
+
model: 'bulbul:v3', // TTS model (bulbul:v2 or bulbul:v3)
|
|
890
|
+
apiKey: process.env.SARVAM_API_KEY,
|
|
891
|
+
language: 'en-IN', // BCP-47 language code
|
|
892
|
+
},
|
|
893
|
+
listeningModel: {
|
|
894
|
+
model: 'saarika:v2.5', // STT model (saarika:v2.5 or saaras:v3)
|
|
895
|
+
apiKey: process.env.SARVAM_API_KEY,
|
|
896
|
+
},
|
|
897
|
+
speaker: 'shubh', // Default bulbul:v3 speaker
|
|
898
|
+
})
|
|
899
|
+
```
|
|
900
|
+
|
|
901
|
+
Visit the [Sarvam Voice Reference](https://mastra.ai/reference/voice/sarvam) for more information on the Sarvam voice provider.
|
|
902
|
+
|
|
903
|
+
**Murf**:
|
|
904
|
+
|
|
905
|
+
```typescript
|
|
906
|
+
// Murf Voice Configuration
|
|
907
|
+
const voice = new MurfVoice({
|
|
908
|
+
speechModel: {
|
|
909
|
+
name: 'murf-voice', // Example model name
|
|
910
|
+
apiKey: process.env.MURF_API_KEY,
|
|
911
|
+
language: 'en-US', // Language code
|
|
912
|
+
emotion: 'happy', // Emotion setting
|
|
913
|
+
},
|
|
914
|
+
// Murf may not have a separate listening model
|
|
915
|
+
})
|
|
916
|
+
```
|
|
917
|
+
|
|
918
|
+
Visit the [Murf Voice Reference](https://mastra.ai/reference/voice/murf) for more information on the Murf voice provider.
|
|
919
|
+
|
|
920
|
+
**OpenAI Realtime**:
|
|
921
|
+
|
|
922
|
+
```typescript
|
|
923
|
+
// OpenAI Realtime Voice Configuration
|
|
924
|
+
const voice = new OpenAIRealtimeVoice({
|
|
925
|
+
speechModel: {
|
|
926
|
+
name: 'gpt-3.5-turbo', // Example model name
|
|
927
|
+
apiKey: process.env.OPENAI_API_KEY,
|
|
928
|
+
language: 'en-US', // Language code
|
|
929
|
+
},
|
|
930
|
+
listeningModel: {
|
|
931
|
+
name: 'whisper-1', // Example model name
|
|
932
|
+
apiKey: process.env.OPENAI_API_KEY,
|
|
933
|
+
format: 'ogg', // Audio format
|
|
934
|
+
},
|
|
935
|
+
speaker: 'alloy', // Example speaker name
|
|
936
|
+
})
|
|
937
|
+
```
|
|
938
|
+
|
|
939
|
+
For more information on the OpenAI Realtime voice provider, refer to the [OpenAI Realtime Voice Reference](https://mastra.ai/reference/voice/openai-realtime).
|
|
940
|
+
|
|
941
|
+
**Google Gemini Live**:
|
|
942
|
+
|
|
943
|
+
```typescript
|
|
944
|
+
// Google Gemini Live Voice Configuration
|
|
945
|
+
const voice = new GeminiLiveVoice({
|
|
946
|
+
speechModel: {
|
|
947
|
+
name: 'gemini-2.0-flash-exp', // Example model name
|
|
948
|
+
apiKey: process.env.GOOGLE_API_KEY,
|
|
949
|
+
},
|
|
950
|
+
speaker: 'Puck', // Example speaker name
|
|
951
|
+
// Google Gemini Live is a realtime bidirectional API without separate speech and listening models
|
|
952
|
+
})
|
|
953
|
+
```
|
|
954
|
+
|
|
955
|
+
Visit the [Google Gemini Live Reference](https://mastra.ai/reference/voice/google-gemini-live) for more information on the Google Gemini Live voice provider.
|
|
956
|
+
|
|
957
|
+
**AWS Nova Sonic**:
|
|
958
|
+
|
|
959
|
+
```typescript
|
|
960
|
+
// AWS Nova Sonic Voice Configuration
|
|
961
|
+
const voice = new NovaSonicVoice({
|
|
962
|
+
region: 'us-east-1',
|
|
963
|
+
speaker: 'matthew',
|
|
964
|
+
sessionConfig: {
|
|
965
|
+
inferenceConfiguration: {
|
|
966
|
+
temperature: 0.7,
|
|
967
|
+
maxTokens: 1024,
|
|
968
|
+
},
|
|
969
|
+
turnDetectionConfiguration: {
|
|
970
|
+
endpointingSensitivity: 'MEDIUM',
|
|
971
|
+
},
|
|
972
|
+
},
|
|
973
|
+
// AWS Nova Sonic is a realtime bidirectional API without separate speech and listening models
|
|
974
|
+
})
|
|
975
|
+
```
|
|
976
|
+
|
|
977
|
+
Visit the [AWS Nova Sonic Reference](https://mastra.ai/reference/voice/aws-nova-sonic) for more information on the AWS Nova Sonic voice provider.
|
|
978
|
+
|
|
979
|
+
**AI SDK**:
|
|
980
|
+
|
|
981
|
+
```typescript
|
|
982
|
+
// AI SDK Voice Configuration
|
|
983
|
+
import { CompositeVoice } from '@mastra/core/voice'
|
|
984
|
+
import { openai } from '@ai-sdk/openai'
|
|
985
|
+
import { elevenlabs } from '@ai-sdk/elevenlabs'
|
|
986
|
+
|
|
987
|
+
// Use AI SDK models directly - no need to install separate packages
|
|
988
|
+
const voice = new CompositeVoice({
|
|
989
|
+
input: openai.transcription('whisper-1'), // AI SDK transcription
|
|
990
|
+
output: elevenlabs.speech('eleven_turbo_v2'), // AI SDK speech
|
|
991
|
+
})
|
|
992
|
+
|
|
993
|
+
// Works seamlessly with your agent
|
|
994
|
+
const voiceAgent = new Agent({
|
|
995
|
+
id: 'aisdk-voice-agent',
|
|
996
|
+
name: 'AI SDK Voice Agent',
|
|
997
|
+
instructions: 'You are a helpful assistant with voice capabilities.',
|
|
998
|
+
model: 'openai/gpt-5.4',
|
|
999
|
+
voice,
|
|
1000
|
+
})
|
|
1001
|
+
```
|
|
1002
|
+
|
|
1003
|
+
### Using Multiple Voice Providers
|
|
1004
|
+
|
|
1005
|
+
This example demonstrates how to create and use two different voice providers in Mastra: OpenAI for speech-to-text (STT) and PlayAI for text-to-speech (TTS).
|
|
1006
|
+
|
|
1007
|
+
Start by creating instances of the voice providers with any necessary configuration.
|
|
1008
|
+
|
|
1009
|
+
```typescript
|
|
1010
|
+
import { OpenAIVoice } from '@mastra/voice-openai'
|
|
1011
|
+
import { PlayAIVoice } from '@mastra/voice-playai'
|
|
1012
|
+
import { CompositeVoice } from '@mastra/core/voice'
|
|
1013
|
+
import { playAudio, getMicrophoneStream } from '@mastra/node-audio'
|
|
1014
|
+
|
|
1015
|
+
// Initialize OpenAI voice for STT
|
|
1016
|
+
const input = new OpenAIVoice({
|
|
1017
|
+
listeningModel: {
|
|
1018
|
+
name: 'whisper-1',
|
|
1019
|
+
apiKey: process.env.OPENAI_API_KEY,
|
|
1020
|
+
},
|
|
1021
|
+
})
|
|
1022
|
+
|
|
1023
|
+
// Initialize PlayAI voice for TTS
|
|
1024
|
+
const output = new PlayAIVoice({
|
|
1025
|
+
speechModel: {
|
|
1026
|
+
name: 'playai-voice',
|
|
1027
|
+
apiKey: process.env.PLAYAI_API_KEY,
|
|
1028
|
+
},
|
|
1029
|
+
})
|
|
1030
|
+
|
|
1031
|
+
// Combine the providers using CompositeVoice
|
|
1032
|
+
const voice = new CompositeVoice({
|
|
1033
|
+
input,
|
|
1034
|
+
output,
|
|
1035
|
+
})
|
|
1036
|
+
|
|
1037
|
+
// Implement voice interactions using the combined voice provider
|
|
1038
|
+
const audioStream = getMicrophoneStream() // Assume this function gets audio input
|
|
1039
|
+
const transcript = await voice.listen(audioStream)
|
|
1040
|
+
|
|
1041
|
+
// Log the transcribed text
|
|
1042
|
+
console.log('Transcribed text:', transcript)
|
|
1043
|
+
|
|
1044
|
+
// Convert text to speech
|
|
1045
|
+
const responseAudio = await voice.speak(`You said: ${transcript}`, {
|
|
1046
|
+
speaker: 'default', // Optional: specify a speaker
|
|
1047
|
+
responseFormat: 'wav', // Optional: specify a response format
|
|
1048
|
+
})
|
|
1049
|
+
|
|
1050
|
+
// Play the audio response
|
|
1051
|
+
playAudio(responseAudio)
|
|
1052
|
+
```
|
|
1053
|
+
|
|
1054
|
+
### Using AI SDK Model Providers
|
|
1055
|
+
|
|
1056
|
+
You can also use AI SDK models directly with `CompositeVoice`:
|
|
1057
|
+
|
|
1058
|
+
```typescript
|
|
1059
|
+
import { CompositeVoice } from '@mastra/core/voice'
|
|
1060
|
+
import { openai } from '@ai-sdk/openai'
|
|
1061
|
+
import { elevenlabs } from '@ai-sdk/elevenlabs'
|
|
1062
|
+
import { playAudio, getMicrophoneStream } from '@mastra/node-audio'
|
|
1063
|
+
|
|
1064
|
+
// Use AI SDK models directly - no provider setup needed
|
|
1065
|
+
const voice = new CompositeVoice({
|
|
1066
|
+
input: openai.transcription('whisper-1'), // AI SDK transcription
|
|
1067
|
+
output: elevenlabs.speech('eleven_turbo_v2'), // AI SDK speech
|
|
1068
|
+
})
|
|
1069
|
+
|
|
1070
|
+
// Works the same way as Mastra providers
|
|
1071
|
+
const audioStream = getMicrophoneStream()
|
|
1072
|
+
const transcript = await voice.listen(audioStream)
|
|
1073
|
+
|
|
1074
|
+
console.log('Transcribed text:', transcript)
|
|
1075
|
+
|
|
1076
|
+
// Convert text to speech
|
|
1077
|
+
const responseAudio = await voice.speak(`You said: ${transcript}`, {
|
|
1078
|
+
speaker: 'Rachel', // ElevenLabs voice
|
|
1079
|
+
})
|
|
1080
|
+
|
|
1081
|
+
playAudio(responseAudio)
|
|
1082
|
+
```
|
|
1083
|
+
|
|
1084
|
+
You can also mix AI SDK models with Mastra providers:
|
|
1085
|
+
|
|
1086
|
+
```typescript
|
|
1087
|
+
import { CompositeVoice } from '@mastra/core/voice'
|
|
1088
|
+
import { PlayAIVoice } from '@mastra/voice-playai'
|
|
1089
|
+
import { groq } from '@ai-sdk/groq'
|
|
1090
|
+
|
|
1091
|
+
const voice = new CompositeVoice({
|
|
1092
|
+
input: groq.transcription('whisper-large-v3'), // AI SDK for STT
|
|
1093
|
+
output: new PlayAIVoice(), // Mastra provider for TTS
|
|
1094
|
+
})
|
|
1095
|
+
```
|
|
1096
|
+
|
|
1097
|
+
For more information on `CompositeVoice`, refer to the [CompositeVoice Reference](https://mastra.ai/reference/voice/composite-voice).
|
|
1098
|
+
|
|
1099
|
+
## More resources
|
|
1100
|
+
|
|
1101
|
+
- [CompositeVoice](https://mastra.ai/reference/voice/composite-voice)
|
|
1102
|
+
- [MastraVoice](https://mastra.ai/reference/voice/mastra-voice)
|
|
1103
|
+
- [OpenAI Voice](https://mastra.ai/reference/voice/openai)
|
|
1104
|
+
- [OpenAI Realtime Voice](https://mastra.ai/reference/voice/openai-realtime)
|
|
1105
|
+
- [Azure Voice](https://mastra.ai/reference/voice/azure)
|
|
1106
|
+
- [Google Voice](https://mastra.ai/reference/voice/google)
|
|
1107
|
+
- [Google Gemini Live Voice](https://mastra.ai/reference/voice/google-gemini-live)
|
|
1108
|
+
- [AWS Nova Sonic Voice](https://mastra.ai/reference/voice/aws-nova-sonic)
|
|
1109
|
+
- [Deepgram Voice](https://mastra.ai/reference/voice/deepgram)
|
|
1110
|
+
- [Inworld Voice](https://mastra.ai/reference/voice/inworld)
|
|
1111
|
+
- [PlayAI Voice](https://mastra.ai/reference/voice/playai)
|
|
1112
|
+
- [Voice Examples](https://github.com/mastra-ai/voice-examples)
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
# Inworld
|
|
2
|
+
|
|
3
|
+
The Inworld voice implementation in Mastra provides streaming text-to-speech (TTS) and batch speech-to-text (STT) capabilities using Inworld AI's API. It supports multiple TTS and STT models, configurable audio encodings, and progressive audio streaming.
|
|
4
|
+
|
|
5
|
+
## Usage example
|
|
6
|
+
|
|
7
|
+
```typescript
|
|
8
|
+
import { InworldVoice } from '@mastra/voice-inworld'
|
|
9
|
+
|
|
10
|
+
// Initialize with default configuration (uses INWORLD_API_KEY environment variable)
|
|
11
|
+
const voice = new InworldVoice()
|
|
12
|
+
|
|
13
|
+
// Alternatively, initialize with custom configuration (use this instead of the default form above)
|
|
14
|
+
const voice = new InworldVoice({
|
|
15
|
+
speechModel: {
|
|
16
|
+
name: 'inworld-tts-2',
|
|
17
|
+
apiKey: 'your-api-key',
|
|
18
|
+
},
|
|
19
|
+
listeningModel: {
|
|
20
|
+
name: 'groq/whisper-large-v3',
|
|
21
|
+
apiKey: 'your-api-key',
|
|
22
|
+
},
|
|
23
|
+
speaker: 'Dennis',
|
|
24
|
+
})
|
|
25
|
+
|
|
26
|
+
// Text-to-Speech (streaming)
|
|
27
|
+
const audioStream = await voice.speak('Hello, world!')
|
|
28
|
+
|
|
29
|
+
// Speech-to-Text
|
|
30
|
+
const transcript = await voice.listen(audioStream)
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## Constructor parameters
|
|
34
|
+
|
|
35
|
+
**speechModel** (`InworldVoiceConfig`): Configuration for text-to-speech functionality. (Default: `{ name: 'inworld-tts-2' }`)
|
|
36
|
+
|
|
37
|
+
**speechModel.name** (`'inworld-tts-2' | 'inworld-tts-1.5-max' | 'inworld-tts-1.5-mini'`): The Inworld TTS model to use.
|
|
38
|
+
|
|
39
|
+
**speechModel.apiKey** (`string`): Inworld API key. Falls back to the `INWORLD_API_KEY` environment variable.
|
|
40
|
+
|
|
41
|
+
**listeningModel** (`InworldListeningConfig`): Configuration for speech-to-text functionality. (Default: `{ name: 'groq/whisper-large-v3' }`)
|
|
42
|
+
|
|
43
|
+
**listeningModel.name** (`'groq/whisper-large-v3'`): The Inworld STT model to use.
|
|
44
|
+
|
|
45
|
+
**listeningModel.apiKey** (`string`): Inworld API key. Falls back to the `INWORLD_API_KEY` environment variable.
|
|
46
|
+
|
|
47
|
+
**speaker** (`string`): Default voice ID to use for text-to-speech. (Default: `'Dennis'`)
|
|
48
|
+
|
|
49
|
+
**audioEncoding** (`'LINEAR16' | 'MP3' | 'OGG_OPUS' | 'ALAW' | 'MULAW' | 'FLAC' | 'PCM' | 'WAV'`): Default audio encoding for TTS output. (Default: `'MP3'`)
|
|
50
|
+
|
|
51
|
+
**sampleRateHertz** (`number`): Default sample rate for TTS output. (Default: `48000`)
|
|
52
|
+
|
|
53
|
+
**language** (`string`): Default BCP-47 language code for STT. (Default: `'en-US'`)
|
|
54
|
+
|
|
55
|
+
## Methods
|
|
56
|
+
|
|
57
|
+
### `speak(input, options?)`
|
|
58
|
+
|
|
59
|
+
Converts text to speech using Inworld's streaming TTS endpoint. Returns a readable stream that emits audio chunks progressively as they arrive.
|
|
60
|
+
|
|
61
|
+
```typescript
|
|
62
|
+
const audioStream = await voice.speak('Hello, world!', {
|
|
63
|
+
speaker: 'Olivia',
|
|
64
|
+
audioEncoding: 'WAV',
|
|
65
|
+
sampleRateHertz: 24000,
|
|
66
|
+
speakingRate: 1.2,
|
|
67
|
+
temperature: 0.8,
|
|
68
|
+
})
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
**input** (`string | NodeJS.ReadableStream`): Text to convert to speech. If a stream is provided, it will be converted to text first.
|
|
72
|
+
|
|
73
|
+
**options** (`InworldSpeakOptions`): Additional options for speech synthesis.
|
|
74
|
+
|
|
75
|
+
**options.speaker** (`string`): Override the default speaker for this request.
|
|
76
|
+
|
|
77
|
+
**options.audioEncoding** (`AudioEncoding`): Override the default audio encoding.
|
|
78
|
+
|
|
79
|
+
**options.sampleRateHertz** (`number`): Override the default sample rate.
|
|
80
|
+
|
|
81
|
+
**options.speakingRate** (`number`): Adjust the speaking rate.
|
|
82
|
+
|
|
83
|
+
**options.temperature** (`number`): Controls voice variability. Honored on `inworld-tts-1.5-*` models; ignored by `inworld-tts-2`.
|
|
84
|
+
|
|
85
|
+
**options.deliveryMode** (`'STABLE' | 'BALANCED' | 'CREATIVE'`): Steering control for delivery style. Only honored by `inworld-tts-2`.
|
|
86
|
+
|
|
87
|
+
**options.language** (`string`): BCP-47 language code for this request. Auto-detected when omitted.
|
|
88
|
+
|
|
89
|
+
**Returns:** `Promise<NodeJS.ReadableStream>`
|
|
90
|
+
|
|
91
|
+
### `listen(input, options?)`
|
|
92
|
+
|
|
93
|
+
Converts speech to text using Inworld's batch STT endpoint.
|
|
94
|
+
|
|
95
|
+
```typescript
|
|
96
|
+
const transcript = await voice.listen(audioStream, {
|
|
97
|
+
audioEncoding: 'MP3',
|
|
98
|
+
sampleRateHertz: 44100,
|
|
99
|
+
language: 'ja-JP',
|
|
100
|
+
})
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
**input** (`NodeJS.ReadableStream`): Audio stream to transcribe.
|
|
104
|
+
|
|
105
|
+
**options** (`InworldListenOptions`): Additional options for transcription.
|
|
106
|
+
|
|
107
|
+
**options.audioEncoding** (`'LINEAR16' | 'MP3' | 'OGG_OPUS' | 'FLAC' | 'AUTO_DETECT'`): Audio encoding of the input stream.
|
|
108
|
+
|
|
109
|
+
**options.sampleRateHertz** (`number`): Sample rate of the input audio.
|
|
110
|
+
|
|
111
|
+
**options.language** (`string`): BCP-47 language code for transcription.
|
|
112
|
+
|
|
113
|
+
**options.numberOfChannels** (`number`): Number of audio channels in the input.
|
|
114
|
+
|
|
115
|
+
**Returns:** `Promise<string>`
|
|
116
|
+
|
|
117
|
+
### `getSpeakers()`
|
|
118
|
+
|
|
119
|
+
Returns a list of available voices from the Inworld API.
|
|
120
|
+
|
|
121
|
+
```typescript
|
|
122
|
+
const speakers = await voice.getSpeakers()
|
|
123
|
+
// [{ voiceId: 'Dennis', name: 'Dennis', language: 'en', description: '...', tags: ['friendly'], source: 'SYSTEM' }, ...]
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
**Returns:** `Promise<Array<{ voiceId: string; name: string; language: string; description: string; tags: string[]; source: string }>>`
|
|
127
|
+
|
|
128
|
+
## Notes
|
|
129
|
+
|
|
130
|
+
- The TTS endpoint uses progressive NDJSON streaming, so audio playback can begin before the full response is received.
|
|
131
|
+
- An API key can be provided via the `speechModel` or `listeningModel` config, or the `INWORLD_API_KEY` environment variable. TTS and STT keys are resolved independently: passing distinct `speechModel.apiKey` and `listeningModel.apiKey` values lets each service use its own credential. If only one is provided, it is reused for both services as a fallback before the env var.
|
|
132
|
+
- `inworld-tts-2` is the default flagship model. Use `deliveryMode` (`STABLE` | `BALANCED` | `CREATIVE`) to steer delivery style on this model. The `temperature` option is ignored on `inworld-tts-2`.
|
|
133
|
+
- The `inworld-tts-1.5-mini` model offers lower latency at the cost of reduced voice quality compared to `inworld-tts-1.5-max`.
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@mastra/voice-inworld",
|
|
3
|
-
"version": "0.0.0",
|
|
3
|
+
"version": "0.2.0-alpha.0",
|
|
4
4
|
"description": "Mastra Inworld AI voice integration — streaming TTS and batch STT",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"files": [
|
|
@@ -22,26 +22,19 @@
|
|
|
22
22
|
},
|
|
23
23
|
"./package.json": "./package.json"
|
|
24
24
|
},
|
|
25
|
-
"scripts": {
|
|
26
|
-
"build": "tsup --silent --config tsup.config.ts",
|
|
27
|
-
"prepack": "pnpx tsx ../../scripts/generate-package-docs.ts",
|
|
28
|
-
"build:watch": "tsup --watch --silent --config tsup.config.ts",
|
|
29
|
-
"test": "vitest run",
|
|
30
|
-
"lint": "eslint ."
|
|
31
|
-
},
|
|
32
25
|
"license": "Apache-2.0",
|
|
33
26
|
"dependencies": {},
|
|
34
27
|
"devDependencies": {
|
|
35
|
-
"@internal/lint": "workspace:*",
|
|
36
|
-
"@internal/types-builder": "workspace:*",
|
|
37
|
-
"@mastra/core": "workspace:*",
|
|
38
28
|
"@types/node": "22.19.15",
|
|
39
|
-
"@vitest/coverage-v8": "
|
|
40
|
-
"@vitest/ui": "
|
|
29
|
+
"@vitest/coverage-v8": "4.1.5",
|
|
30
|
+
"@vitest/ui": "4.1.5",
|
|
41
31
|
"eslint": "^10.2.1",
|
|
42
32
|
"tsup": "^8.5.1",
|
|
43
|
-
"typescript": "
|
|
44
|
-
"vitest": "
|
|
33
|
+
"typescript": "^6.0.3",
|
|
34
|
+
"vitest": "4.1.5",
|
|
35
|
+
"@internal/lint": "0.0.92",
|
|
36
|
+
"@internal/types-builder": "0.0.67",
|
|
37
|
+
"@mastra/core": "1.33.0-alpha.13"
|
|
45
38
|
},
|
|
46
39
|
"peerDependencies": {
|
|
47
40
|
"@mastra/core": ">=1.0.0-0 <2.0.0-0",
|
|
@@ -58,5 +51,11 @@
|
|
|
58
51
|
},
|
|
59
52
|
"engines": {
|
|
60
53
|
"node": ">=22.13.0"
|
|
54
|
+
},
|
|
55
|
+
"scripts": {
|
|
56
|
+
"build": "tsup --silent --config tsup.config.ts",
|
|
57
|
+
"build:watch": "tsup --watch --silent --config tsup.config.ts",
|
|
58
|
+
"test": "vitest run",
|
|
59
|
+
"lint": "eslint ."
|
|
61
60
|
}
|
|
62
|
-
}
|
|
61
|
+
}
|