@mastra/voice-openai 0.12.1 → 0.12.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +16 -0
- package/LICENSE.md +15 -0
- package/dist/_types/@internal_voice/dist/_types/@internal_ai-sdk-v5/dist/index.d.ts +8888 -0
- package/dist/_types/@internal_voice/dist/_types/@internal_core/dist/base/index.d.ts +31 -0
- package/dist/_types/@internal_voice/dist/_types/@internal_core/dist/logger/index.d.ts +217 -0
- package/dist/_types/@internal_voice/dist/_types/@internal_core/dist/request-context/index.d.ts +147 -0
- package/dist/_types/@internal_voice/dist/_types/@internal_core/dist/types/index.d.ts +3 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v3/ZodError.d.ts +164 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v3/errors.d.ts +5 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v3/external.d.ts +6 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v3/helpers/enumUtil.d.ts +8 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v3/helpers/errorUtil.d.ts +9 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v3/helpers/parseUtil.d.ts +78 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v3/helpers/partialUtil.d.ts +8 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v3/helpers/typeAliases.d.ts +2 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v3/helpers/util.d.ts +85 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v3/index.d.cts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v3/index.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v3/locales/en.d.ts +3 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v3/standard-schema.d.ts +102 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v3/types.d.ts +1034 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/checks.d.ts +1 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/coerce.d.ts +17 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/compat.d.ts +50 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/errors.d.ts +30 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/external.d.ts +16 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/from-json-schema.d.ts +12 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/index.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/iso.d.ts +22 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/parse.d.ts +31 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/schemas.d.ts +767 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/core/api.d.ts +325 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/core/checks.d.ts +278 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/core/core.d.ts +70 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/core/doc.d.ts +14 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/core/errors.d.ts +221 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/core/index.d.ts +16 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/core/json-schema-generator.d.ts +65 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/core/json-schema-processors.d.ts +49 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/core/json-schema.d.ts +88 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/core/parse.d.ts +49 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/core/regexes.d.ts +85 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/core/registries.d.ts +35 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/core/schemas.d.ts +1184 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/core/standard-schema.d.ts +126 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/core/to-json-schema.d.ts +114 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/core/util.d.ts +200 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/core/versions.d.ts +5 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/index.d.cts +3 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ar.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/az.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/be.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/bg.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ca.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/cs.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/da.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/de.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/el.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/en.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/eo.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/es.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/fa.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/fi.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/fr-CA.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/fr.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/he.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/hr.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/hu.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/hy.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/id.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/index.d.ts +52 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/is.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/it.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ja.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ka.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/kh.d.ts +5 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/km.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ko.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/lt.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/mk.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ms.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/nl.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/no.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ota.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/pl.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ps.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/pt.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ro.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ru.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/sl.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/sv.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ta.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/th.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/tr.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ua.d.ts +5 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/uk.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ur.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/uz.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/vi.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/yo.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/zh-CN.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/zh-TW.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/index.d.ts +16 -0
- package/dist/_types/@internal_voice/dist/voice/aisdk/index.d.ts +3 -0
- package/dist/_types/@internal_voice/dist/voice/aisdk/speech.d.ts +23 -0
- package/dist/_types/@internal_voice/dist/voice/aisdk/transcription.d.ts +22 -0
- package/dist/_types/@internal_voice/dist/voice/composite-voice.d.ts +72 -0
- package/dist/_types/@internal_voice/dist/voice/default-voice.d.ts +13 -0
- package/dist/_types/@internal_voice/dist/voice/index.d.ts +5 -0
- package/dist/_types/@internal_voice/dist/voice/voice.d.ts +172 -0
- package/dist/docs/SKILL.md +2 -2
- package/dist/docs/assets/SOURCE_MAP.json +1 -1
- package/dist/docs/references/docs-agents-adding-voice.md +186 -158
- package/dist/docs/references/docs-voice-overview.md +650 -379
- package/dist/docs/references/docs-voice-speech-to-text.md +27 -28
- package/dist/docs/references/docs-voice-text-to-speech.md +28 -29
- package/dist/docs/references/reference-voice-composite-voice.md +37 -37
- package/dist/docs/references/reference-voice-openai.md +36 -30
- package/dist/docs/references/reference-voice-voice.getSpeakers.md +45 -45
- package/dist/docs/references/reference-voice-voice.listen.md +64 -58
- package/dist/docs/references/reference-voice-voice.speak.md +65 -55
- package/dist/index.cjs +260 -2
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +1 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +259 -1
- package/dist/index.js.map +1 -1
- package/package.json +10 -11
|
@@ -4,11 +4,11 @@ The `listen()` method is a core function available in all Mastra voice providers
|
|
|
4
4
|
|
|
5
5
|
## Parameters
|
|
6
6
|
|
|
7
|
-
**audioStream
|
|
7
|
+
**audioStream** (`NodeJS.ReadableStream`): Audio stream to transcribe. This can be a file stream or a microphone stream.
|
|
8
8
|
|
|
9
|
-
**options
|
|
9
|
+
**options** (`object`): Provider-specific options for speech recognition
|
|
10
10
|
|
|
11
|
-
## Return
|
|
11
|
+
## Return value
|
|
12
12
|
|
|
13
13
|
Returns one of the following:
|
|
14
14
|
|
|
@@ -16,81 +16,87 @@ Returns one of the following:
|
|
|
16
16
|
- `Promise<NodeJS.ReadableStream>`: A promise that resolves to a stream of transcribed text (for streaming transcription)
|
|
17
17
|
- `Promise<void>`: For real-time providers that emit 'writing' events instead of returning text directly
|
|
18
18
|
|
|
19
|
-
## Provider-
|
|
19
|
+
## Provider-specific options
|
|
20
20
|
|
|
21
21
|
Each voice provider may support additional options specific to their implementation. Here are some examples:
|
|
22
22
|
|
|
23
23
|
### OpenAI
|
|
24
24
|
|
|
25
|
-
**options
|
|
25
|
+
**options** (`Options`): Configuration options.
|
|
26
26
|
|
|
27
|
-
**options.
|
|
27
|
+
**options.filetype** (`string`): Audio file format (e.g., 'mp3', 'wav', 'm4a')
|
|
28
28
|
|
|
29
|
-
**options.
|
|
29
|
+
**options.prompt** (`string`): Text to guide the model's transcription
|
|
30
|
+
|
|
31
|
+
**options.language** (`string`): Language code (e.g., 'en', 'fr', 'de')
|
|
30
32
|
|
|
31
33
|
### Google
|
|
32
34
|
|
|
33
|
-
**options
|
|
35
|
+
**options** (`Options`): Configuration options.
|
|
36
|
+
|
|
37
|
+
**options.stream** (`boolean`): Whether to use streaming recognition
|
|
34
38
|
|
|
35
|
-
**options.config
|
|
39
|
+
**options.config** (`object`): Recognition configuration from Google Cloud Speech-to-Text API
|
|
36
40
|
|
|
37
41
|
### Deepgram
|
|
38
42
|
|
|
39
|
-
**options
|
|
43
|
+
**options** (`Options`): Configuration options.
|
|
40
44
|
|
|
41
|
-
**options.
|
|
45
|
+
**options.model** (`string`): Deepgram model to use for transcription
|
|
42
46
|
|
|
43
|
-
|
|
47
|
+
**options.language** (`string`): Language code for transcription
|
|
48
|
+
|
|
49
|
+
## Usage example
|
|
44
50
|
|
|
45
51
|
```typescript
|
|
46
|
-
import { OpenAIVoice } from
|
|
47
|
-
import { getMicrophoneStream } from
|
|
48
|
-
import { createReadStream } from
|
|
49
|
-
import path from
|
|
52
|
+
import { OpenAIVoice } from '@mastra/voice-openai'
|
|
53
|
+
import { getMicrophoneStream } from '@mastra/node-audio'
|
|
54
|
+
import { createReadStream } from 'fs'
|
|
55
|
+
import path from 'path'
|
|
50
56
|
|
|
51
57
|
// Initialize a voice provider
|
|
52
58
|
const voice = new OpenAIVoice({
|
|
53
59
|
listeningModel: {
|
|
54
|
-
name:
|
|
60
|
+
name: 'whisper-1',
|
|
55
61
|
apiKey: process.env.OPENAI_API_KEY,
|
|
56
62
|
},
|
|
57
|
-
})
|
|
63
|
+
})
|
|
58
64
|
|
|
59
65
|
// Basic usage with a file stream
|
|
60
|
-
const audioFilePath = path.join(process.cwd(),
|
|
61
|
-
const audioStream = createReadStream(audioFilePath)
|
|
66
|
+
const audioFilePath = path.join(process.cwd(), 'audio.mp3')
|
|
67
|
+
const audioStream = createReadStream(audioFilePath)
|
|
62
68
|
const transcript = await voice.listen(audioStream, {
|
|
63
|
-
filetype:
|
|
64
|
-
})
|
|
65
|
-
console.log(
|
|
69
|
+
filetype: 'mp3',
|
|
70
|
+
})
|
|
71
|
+
console.log('Transcribed text:', transcript)
|
|
66
72
|
|
|
67
73
|
// Using a microphone stream
|
|
68
|
-
const microphoneStream = getMicrophoneStream()
|
|
69
|
-
const transcription = await voice.listen(microphoneStream)
|
|
74
|
+
const microphoneStream = getMicrophoneStream() // Assume this function gets audio input
|
|
75
|
+
const transcription = await voice.listen(microphoneStream)
|
|
70
76
|
|
|
71
77
|
// With provider-specific options
|
|
72
78
|
const transcriptWithOptions = await voice.listen(audioStream, {
|
|
73
|
-
language:
|
|
74
|
-
prompt:
|
|
75
|
-
})
|
|
79
|
+
language: 'en',
|
|
80
|
+
prompt: 'This is a conversation about artificial intelligence.',
|
|
81
|
+
})
|
|
76
82
|
```
|
|
77
83
|
|
|
78
|
-
## Using with CompositeVoice
|
|
84
|
+
## Using with `CompositeVoice`
|
|
79
85
|
|
|
80
86
|
When using `CompositeVoice`, the `listen()` method delegates to the configured listening provider:
|
|
81
87
|
|
|
82
88
|
```typescript
|
|
83
|
-
import { CompositeVoice } from
|
|
84
|
-
import { OpenAIVoice } from
|
|
85
|
-
import { PlayAIVoice } from
|
|
89
|
+
import { CompositeVoice } from '@mastra/core/voice'
|
|
90
|
+
import { OpenAIVoice } from '@mastra/voice-openai'
|
|
91
|
+
import { PlayAIVoice } from '@mastra/voice-playai'
|
|
86
92
|
|
|
87
93
|
const voice = new CompositeVoice({
|
|
88
94
|
input: new OpenAIVoice(),
|
|
89
95
|
output: new PlayAIVoice(),
|
|
90
|
-
})
|
|
96
|
+
})
|
|
91
97
|
|
|
92
98
|
// This will use the OpenAIVoice provider
|
|
93
|
-
const transcript = await voice.listen(audioStream)
|
|
99
|
+
const transcript = await voice.listen(audioStream)
|
|
94
100
|
```
|
|
95
101
|
|
|
96
102
|
### Using AI SDK Model Providers
|
|
@@ -98,18 +104,18 @@ const transcript = await voice.listen(audioStream);
|
|
|
98
104
|
You can also use AI SDK transcription models directly with `CompositeVoice`:
|
|
99
105
|
|
|
100
106
|
```typescript
|
|
101
|
-
import { CompositeVoice } from
|
|
102
|
-
import { openai } from
|
|
103
|
-
import { groq } from
|
|
107
|
+
import { CompositeVoice } from '@mastra/core/voice'
|
|
108
|
+
import { openai } from '@ai-sdk/openai'
|
|
109
|
+
import { groq } from '@ai-sdk/groq'
|
|
104
110
|
|
|
105
111
|
// Use AI SDK transcription models
|
|
106
112
|
const voice = new CompositeVoice({
|
|
107
|
-
input: openai.transcription('whisper-1'),
|
|
108
|
-
output: new PlayAIVoice(),
|
|
109
|
-
})
|
|
113
|
+
input: openai.transcription('whisper-1'), // AI SDK model
|
|
114
|
+
output: new PlayAIVoice(), // Mastra provider
|
|
115
|
+
})
|
|
110
116
|
|
|
111
117
|
// Works the same way
|
|
112
|
-
const transcript = await voice.listen(audioStream)
|
|
118
|
+
const transcript = await voice.listen(audioStream)
|
|
113
119
|
|
|
114
120
|
// Provider-specific options can be passed through
|
|
115
121
|
const transcriptWithOptions = await voice.listen(audioStream, {
|
|
@@ -117,14 +123,14 @@ const transcriptWithOptions = await voice.listen(audioStream, {
|
|
|
117
123
|
openai: {
|
|
118
124
|
language: 'en',
|
|
119
125
|
prompt: 'This is about AI',
|
|
120
|
-
}
|
|
121
|
-
}
|
|
122
|
-
})
|
|
126
|
+
},
|
|
127
|
+
},
|
|
128
|
+
})
|
|
123
129
|
```
|
|
124
130
|
|
|
125
131
|
See the [CompositeVoice reference](https://mastra.ai/reference/voice/composite-voice) for more details on AI SDK integration.
|
|
126
132
|
|
|
127
|
-
## Realtime
|
|
133
|
+
## Realtime voice providers
|
|
128
134
|
|
|
129
135
|
When using realtime voice providers like `OpenAIRealtimeVoice`, the `listen()` method behaves differently:
|
|
130
136
|
|
|
@@ -132,20 +138,20 @@ When using realtime voice providers like `OpenAIRealtimeVoice`, the `listen()` m
|
|
|
132
138
|
- You need to register an event listener to receive the transcription
|
|
133
139
|
|
|
134
140
|
```typescript
|
|
135
|
-
import { OpenAIRealtimeVoice } from
|
|
136
|
-
import { getMicrophoneStream } from
|
|
141
|
+
import { OpenAIRealtimeVoice } from '@mastra/voice-openai-realtime'
|
|
142
|
+
import { getMicrophoneStream } from '@mastra/node-audio'
|
|
137
143
|
|
|
138
|
-
const voice = new OpenAIRealtimeVoice()
|
|
139
|
-
await voice.connect()
|
|
144
|
+
const voice = new OpenAIRealtimeVoice()
|
|
145
|
+
await voice.connect()
|
|
140
146
|
|
|
141
147
|
// Register event listener for transcription
|
|
142
|
-
voice.on(
|
|
143
|
-
console.log(`${role}: ${text}`)
|
|
144
|
-
})
|
|
148
|
+
voice.on('writing', ({ text, role }) => {
|
|
149
|
+
console.log(`${role}: ${text}`)
|
|
150
|
+
})
|
|
145
151
|
|
|
146
152
|
// This will emit 'writing' events instead of returning text
|
|
147
|
-
const microphoneStream = getMicrophoneStream()
|
|
148
|
-
await voice.listen(microphoneStream)
|
|
153
|
+
const microphoneStream = getMicrophoneStream()
|
|
154
|
+
await voice.listen(microphoneStream)
|
|
149
155
|
```
|
|
150
156
|
|
|
151
157
|
## Notes
|
|
@@ -157,8 +163,8 @@ await voice.listen(microphoneStream);
|
|
|
157
163
|
- Some providers support streaming transcription, where text is returned as it's transcribed
|
|
158
164
|
- For best performance, consider closing or ending the audio stream when you're done with it
|
|
159
165
|
|
|
160
|
-
## Related
|
|
166
|
+
## Related methods
|
|
161
167
|
|
|
162
|
-
- [voice.speak()](https://mastra.ai/reference/voice/voice.speak)
|
|
163
|
-
- [voice.send()](https://mastra.ai/reference/voice/voice.send)
|
|
164
|
-
- [voice.on()](https://mastra.ai/reference/voice/voice.on)
|
|
168
|
+
- [voice.speak()](https://mastra.ai/reference/voice/voice.speak): Converts text to speech
|
|
169
|
+
- [voice.send()](https://mastra.ai/reference/voice/voice.send): Sends audio data to the voice provider in real-time
|
|
170
|
+
- [voice.on()](https://mastra.ai/reference/voice/voice.on): Registers an event listener for voice events
|
|
@@ -4,88 +4,98 @@ The `speak()` method is a core function available in all Mastra voice providers
|
|
|
4
4
|
|
|
5
5
|
## Parameters
|
|
6
6
|
|
|
7
|
-
**input
|
|
7
|
+
**input** (`string | NodeJS.ReadableStream`): Text to convert to speech. Can be a string or a readable stream of text.
|
|
8
8
|
|
|
9
|
-
**options
|
|
9
|
+
**options** (`object`): Options for speech synthesis
|
|
10
10
|
|
|
11
|
-
**options.speaker
|
|
11
|
+
**options.speaker** (`string`): Voice ID to use for this specific request. Overrides the default speaker set in the constructor.
|
|
12
12
|
|
|
13
|
-
## Return
|
|
13
|
+
## Return value
|
|
14
14
|
|
|
15
15
|
Returns a `Promise<NodeJS.ReadableStream | void>` where:
|
|
16
16
|
|
|
17
17
|
- `NodeJS.ReadableStream`: A stream of audio data that can be played or saved
|
|
18
18
|
- `void`: When using a realtime voice provider that emits audio through events instead of returning it directly
|
|
19
19
|
|
|
20
|
-
## Provider-
|
|
20
|
+
## Provider-specific options
|
|
21
21
|
|
|
22
22
|
Each voice provider may support additional options specific to their implementation. Here are some examples:
|
|
23
23
|
|
|
24
24
|
### OpenAI
|
|
25
25
|
|
|
26
|
-
**options
|
|
26
|
+
**options** (`Options`): Configuration options.
|
|
27
|
+
|
|
28
|
+
**options.speed** (`number`): Speech speed multiplier. Values between 0.25 and 4.0 are supported.
|
|
27
29
|
|
|
28
30
|
### ElevenLabs
|
|
29
31
|
|
|
30
|
-
**options
|
|
32
|
+
**options** (`Options`): Configuration options.
|
|
33
|
+
|
|
34
|
+
**options.stability** (`number`): Voice stability. Higher values result in more stable, less expressive speech.
|
|
31
35
|
|
|
32
|
-
**options.similarity\_boost
|
|
36
|
+
**options.similarity\_boost** (`number`): Voice clarity and similarity to the original voice.
|
|
33
37
|
|
|
34
38
|
### Google
|
|
35
39
|
|
|
36
|
-
**options
|
|
40
|
+
**options** (`Options`): Configuration options.
|
|
41
|
+
|
|
42
|
+
**options.languageCode** (`string`): Language code for the voice (e.g., 'en-US').
|
|
37
43
|
|
|
38
|
-
**options.audioConfig
|
|
44
|
+
**options.audioConfig** (`object`): Audio configuration options from Google Cloud Text-to-Speech API.
|
|
39
45
|
|
|
40
46
|
### Murf
|
|
41
47
|
|
|
42
|
-
**options
|
|
48
|
+
**options** (`Options`): Configuration options.
|
|
49
|
+
|
|
50
|
+
**options.properties** (`object`): Murf voice properties (speech rate, pitch, and output audio format).
|
|
51
|
+
|
|
52
|
+
**options.properties.rate** (`number`): Speech rate multiplier.
|
|
43
53
|
|
|
44
|
-
**options.properties.pitch
|
|
54
|
+
**options.properties.pitch** (`number`): Voice pitch adjustment.
|
|
45
55
|
|
|
46
|
-
**options.properties.format
|
|
56
|
+
**options.properties.format** (`'MP3' | 'WAV' | 'FLAC' | 'ALAW' | 'ULAW'`): Output audio format.
|
|
47
57
|
|
|
48
|
-
## Usage
|
|
58
|
+
## Usage example
|
|
49
59
|
|
|
50
60
|
```typescript
|
|
51
|
-
import { OpenAIVoice } from
|
|
61
|
+
import { OpenAIVoice } from '@mastra/voice-openai'
|
|
52
62
|
// Initialize a voice provider
|
|
53
63
|
const voice = new OpenAIVoice({
|
|
54
|
-
speaker:
|
|
55
|
-
})
|
|
64
|
+
speaker: 'alloy', // Default voice
|
|
65
|
+
})
|
|
56
66
|
// Basic usage with default settings
|
|
57
|
-
const audioStream = await voice.speak(
|
|
67
|
+
const audioStream = await voice.speak('Hello, world!')
|
|
58
68
|
// Using a different voice for this specific request
|
|
59
|
-
const audioStreamWithDifferentVoice = await voice.speak(
|
|
60
|
-
speaker:
|
|
61
|
-
})
|
|
69
|
+
const audioStreamWithDifferentVoice = await voice.speak('Hello again!', {
|
|
70
|
+
speaker: 'nova',
|
|
71
|
+
})
|
|
62
72
|
// Using provider-specific options
|
|
63
|
-
const audioStreamWithOptions = await voice.speak(
|
|
64
|
-
speaker:
|
|
73
|
+
const audioStreamWithOptions = await voice.speak('Hello with options!', {
|
|
74
|
+
speaker: 'echo',
|
|
65
75
|
speed: 1.2, // OpenAI-specific option
|
|
66
|
-
})
|
|
76
|
+
})
|
|
67
77
|
// Using a text stream as input
|
|
68
|
-
import { Readable } from
|
|
69
|
-
const textStream = Readable.from([
|
|
70
|
-
const audioStreamFromTextStream = await voice.speak(textStream)
|
|
78
|
+
import { Readable } from 'stream'
|
|
79
|
+
const textStream = Readable.from(['Hello', ' from', ' a', ' stream!'])
|
|
80
|
+
const audioStreamFromTextStream = await voice.speak(textStream)
|
|
71
81
|
```
|
|
72
82
|
|
|
73
|
-
## Using with CompositeVoice
|
|
83
|
+
## Using with `CompositeVoice`
|
|
74
84
|
|
|
75
85
|
When using `CompositeVoice`, the `speak()` method delegates to the configured speaking provider:
|
|
76
86
|
|
|
77
87
|
```typescript
|
|
78
|
-
import { CompositeVoice } from
|
|
79
|
-
import { OpenAIVoice } from
|
|
80
|
-
import { PlayAIVoice } from
|
|
88
|
+
import { CompositeVoice } from '@mastra/core/voice'
|
|
89
|
+
import { OpenAIVoice } from '@mastra/voice-openai'
|
|
90
|
+
import { PlayAIVoice } from '@mastra/voice-playai'
|
|
81
91
|
|
|
82
92
|
const voice = new CompositeVoice({
|
|
83
93
|
output: new PlayAIVoice(),
|
|
84
94
|
input: new OpenAIVoice(),
|
|
85
|
-
})
|
|
95
|
+
})
|
|
86
96
|
|
|
87
97
|
// This will use the PlayAIVoice provider
|
|
88
|
-
const audioStream = await voice.speak(
|
|
98
|
+
const audioStream = await voice.speak('Hello, world!')
|
|
89
99
|
```
|
|
90
100
|
|
|
91
101
|
### Using AI SDK Model Providers
|
|
@@ -93,34 +103,34 @@ const audioStream = await voice.speak("Hello, world!");
|
|
|
93
103
|
You can also use AI SDK speech models directly with `CompositeVoice`:
|
|
94
104
|
|
|
95
105
|
```typescript
|
|
96
|
-
import { CompositeVoice } from
|
|
97
|
-
import { openai } from
|
|
98
|
-
import { elevenlabs } from
|
|
106
|
+
import { CompositeVoice } from '@mastra/core/voice'
|
|
107
|
+
import { openai } from '@ai-sdk/openai'
|
|
108
|
+
import { elevenlabs } from '@ai-sdk/elevenlabs'
|
|
99
109
|
|
|
100
110
|
// Use AI SDK speech models
|
|
101
111
|
const voice = new CompositeVoice({
|
|
102
|
-
output: elevenlabs.speech('eleven_turbo_v2'),
|
|
103
|
-
input: openai.transcription('whisper-1'),
|
|
104
|
-
})
|
|
112
|
+
output: elevenlabs.speech('eleven_turbo_v2'), // AI SDK model
|
|
113
|
+
input: openai.transcription('whisper-1'), // AI SDK model
|
|
114
|
+
})
|
|
105
115
|
|
|
106
116
|
// Works the same way
|
|
107
|
-
const audioStream = await voice.speak(
|
|
117
|
+
const audioStream = await voice.speak('Hello from AI SDK!')
|
|
108
118
|
|
|
109
119
|
// Provider-specific options can be passed through
|
|
110
|
-
const audioWithOptions = await voice.speak(
|
|
111
|
-
speaker: 'Rachel',
|
|
120
|
+
const audioWithOptions = await voice.speak('Hello with options!', {
|
|
121
|
+
speaker: 'Rachel', // ElevenLabs voice
|
|
112
122
|
providerOptions: {
|
|
113
123
|
elevenlabs: {
|
|
114
124
|
stability: 0.5,
|
|
115
125
|
similarity_boost: 0.75,
|
|
116
|
-
}
|
|
117
|
-
}
|
|
118
|
-
})
|
|
126
|
+
},
|
|
127
|
+
},
|
|
128
|
+
})
|
|
119
129
|
```
|
|
120
130
|
|
|
121
131
|
See the [CompositeVoice reference](https://mastra.ai/reference/voice/composite-voice) for more details on AI SDK integration.
|
|
122
132
|
|
|
123
|
-
## Realtime
|
|
133
|
+
## Realtime voice providers
|
|
124
134
|
|
|
125
135
|
When using realtime voice providers like `OpenAIRealtimeVoice`, the `speak()` method behaves differently:
|
|
126
136
|
|
|
@@ -128,24 +138,24 @@ When using realtime voice providers like `OpenAIRealtimeVoice`, the `speak()` me
|
|
|
128
138
|
- You need to register an event listener to receive the audio chunks
|
|
129
139
|
|
|
130
140
|
```typescript
|
|
131
|
-
import { OpenAIRealtimeVoice } from
|
|
132
|
-
import Speaker from
|
|
141
|
+
import { OpenAIRealtimeVoice } from '@mastra/voice-openai-realtime'
|
|
142
|
+
import Speaker from '@mastra/node-speaker'
|
|
133
143
|
|
|
134
144
|
const speaker = new Speaker({
|
|
135
145
|
sampleRate: 24100, // Audio sample rate in Hz - standard for high-quality audio on MacBook Pro
|
|
136
146
|
channels: 1, // Mono audio output (as opposed to stereo which would be 2)
|
|
137
147
|
bitDepth: 16, // Bit depth for audio quality - CD quality standard (16-bit resolution)
|
|
138
|
-
})
|
|
148
|
+
})
|
|
139
149
|
|
|
140
|
-
const voice = new OpenAIRealtimeVoice()
|
|
141
|
-
await voice.connect()
|
|
150
|
+
const voice = new OpenAIRealtimeVoice()
|
|
151
|
+
await voice.connect()
|
|
142
152
|
// Register event listener for audio chunks
|
|
143
|
-
voice.on(
|
|
153
|
+
voice.on('speaker', stream => {
|
|
144
154
|
// Handle audio chunk (e.g., play it or save it)
|
|
145
|
-
stream.pipe(speaker)
|
|
146
|
-
})
|
|
155
|
+
stream.pipe(speaker)
|
|
156
|
+
})
|
|
147
157
|
// This will emit 'speaker' events instead of returning a stream
|
|
148
|
-
await voice.speak(
|
|
158
|
+
await voice.speak('Hello, this is realtime speech!')
|
|
149
159
|
```
|
|
150
160
|
|
|
151
161
|
## Notes
|