@mastra/voice-openai 0.12.1 → 0.12.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +16 -0
- package/LICENSE.md +15 -0
- package/dist/_types/@internal_voice/dist/_types/@internal_ai-sdk-v5/dist/index.d.ts +8888 -0
- package/dist/_types/@internal_voice/dist/_types/@internal_core/dist/base/index.d.ts +31 -0
- package/dist/_types/@internal_voice/dist/_types/@internal_core/dist/logger/index.d.ts +217 -0
- package/dist/_types/@internal_voice/dist/_types/@internal_core/dist/request-context/index.d.ts +147 -0
- package/dist/_types/@internal_voice/dist/_types/@internal_core/dist/types/index.d.ts +3 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v3/ZodError.d.ts +164 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v3/errors.d.ts +5 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v3/external.d.ts +6 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v3/helpers/enumUtil.d.ts +8 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v3/helpers/errorUtil.d.ts +9 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v3/helpers/parseUtil.d.ts +78 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v3/helpers/partialUtil.d.ts +8 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v3/helpers/typeAliases.d.ts +2 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v3/helpers/util.d.ts +85 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v3/index.d.cts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v3/index.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v3/locales/en.d.ts +3 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v3/standard-schema.d.ts +102 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v3/types.d.ts +1034 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/checks.d.ts +1 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/coerce.d.ts +17 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/compat.d.ts +50 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/errors.d.ts +30 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/external.d.ts +16 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/from-json-schema.d.ts +12 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/index.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/iso.d.ts +22 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/parse.d.ts +31 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/schemas.d.ts +767 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/core/api.d.ts +325 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/core/checks.d.ts +278 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/core/core.d.ts +70 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/core/doc.d.ts +14 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/core/errors.d.ts +221 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/core/index.d.ts +16 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/core/json-schema-generator.d.ts +65 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/core/json-schema-processors.d.ts +49 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/core/json-schema.d.ts +88 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/core/parse.d.ts +49 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/core/regexes.d.ts +85 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/core/registries.d.ts +35 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/core/schemas.d.ts +1184 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/core/standard-schema.d.ts +126 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/core/to-json-schema.d.ts +114 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/core/util.d.ts +200 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/core/versions.d.ts +5 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/index.d.cts +3 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ar.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/az.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/be.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/bg.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ca.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/cs.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/da.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/de.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/el.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/en.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/eo.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/es.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/fa.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/fi.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/fr-CA.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/fr.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/he.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/hr.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/hu.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/hy.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/id.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/index.d.ts +52 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/is.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/it.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ja.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ka.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/kh.d.ts +5 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/km.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ko.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/lt.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/mk.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ms.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/nl.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/no.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ota.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/pl.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ps.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/pt.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ro.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ru.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/sl.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/sv.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ta.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/th.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/tr.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ua.d.ts +5 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/uk.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ur.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/uz.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/vi.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/yo.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/zh-CN.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/zh-TW.d.ts +4 -0
- package/dist/_types/@internal_voice/dist/index.d.ts +16 -0
- package/dist/_types/@internal_voice/dist/voice/aisdk/index.d.ts +3 -0
- package/dist/_types/@internal_voice/dist/voice/aisdk/speech.d.ts +23 -0
- package/dist/_types/@internal_voice/dist/voice/aisdk/transcription.d.ts +22 -0
- package/dist/_types/@internal_voice/dist/voice/composite-voice.d.ts +72 -0
- package/dist/_types/@internal_voice/dist/voice/default-voice.d.ts +13 -0
- package/dist/_types/@internal_voice/dist/voice/index.d.ts +5 -0
- package/dist/_types/@internal_voice/dist/voice/voice.d.ts +172 -0
- package/dist/docs/SKILL.md +2 -2
- package/dist/docs/assets/SOURCE_MAP.json +1 -1
- package/dist/docs/references/docs-agents-adding-voice.md +186 -158
- package/dist/docs/references/docs-voice-overview.md +650 -379
- package/dist/docs/references/docs-voice-speech-to-text.md +27 -28
- package/dist/docs/references/docs-voice-text-to-speech.md +28 -29
- package/dist/docs/references/reference-voice-composite-voice.md +37 -37
- package/dist/docs/references/reference-voice-openai.md +36 -30
- package/dist/docs/references/reference-voice-voice.getSpeakers.md +45 -45
- package/dist/docs/references/reference-voice-voice.listen.md +64 -58
- package/dist/docs/references/reference-voice-voice.speak.md +65 -55
- package/dist/index.cjs +260 -2
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +1 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +259 -1
- package/dist/index.js.map +1 -1
- package/package.json +10 -11
|
@@ -15,26 +15,26 @@ To use STT in Mastra, you need to provide a `listeningModel` when initializing t
|
|
|
15
15
|
```typescript
|
|
16
16
|
const voice = new OpenAIVoice({
|
|
17
17
|
listeningModel: {
|
|
18
|
-
name:
|
|
18
|
+
name: 'whisper-1',
|
|
19
19
|
apiKey: process.env.OPENAI_API_KEY,
|
|
20
20
|
},
|
|
21
|
-
})
|
|
21
|
+
})
|
|
22
22
|
|
|
23
23
|
// If using default settings the configuration can be simplified to:
|
|
24
|
-
const voice = new OpenAIVoice()
|
|
24
|
+
const voice = new OpenAIVoice()
|
|
25
25
|
```
|
|
26
26
|
|
|
27
|
-
## Available
|
|
27
|
+
## Available providers
|
|
28
28
|
|
|
29
29
|
Mastra supports several Speech-to-Text providers, each with their own capabilities and strengths:
|
|
30
30
|
|
|
31
|
-
- [**OpenAI**](https://mastra.ai/reference/voice/openai)
|
|
32
|
-
- [**Azure**](https://mastra.ai/reference/voice/azure)
|
|
33
|
-
- [**ElevenLabs**](https://mastra.ai/reference/voice/elevenlabs)
|
|
34
|
-
- [**Google**](https://mastra.ai/reference/voice/google)
|
|
35
|
-
- [**Cloudflare**](https://mastra.ai/reference/voice/cloudflare)
|
|
36
|
-
- [**Deepgram**](https://mastra.ai/reference/voice/deepgram)
|
|
37
|
-
- [**Sarvam**](https://mastra.ai/reference/voice/sarvam)
|
|
31
|
+
- [**OpenAI**](https://mastra.ai/reference/voice/openai): High-accuracy transcription with Whisper models
|
|
32
|
+
- [**Azure**](https://mastra.ai/reference/voice/azure): Microsoft's speech recognition with enterprise-grade reliability
|
|
33
|
+
- [**ElevenLabs**](https://mastra.ai/reference/voice/elevenlabs): Advanced speech recognition with support for multiple languages
|
|
34
|
+
- [**Google**](https://mastra.ai/reference/voice/google): Google's speech recognition with extensive language support
|
|
35
|
+
- [**Cloudflare**](https://mastra.ai/reference/voice/cloudflare): Edge-optimized speech recognition for low-latency applications
|
|
36
|
+
- [**Deepgram**](https://mastra.ai/reference/voice/deepgram): AI-powered speech recognition with high accuracy for various accents
|
|
37
|
+
- [**Sarvam**](https://mastra.ai/reference/voice/sarvam): Specialized in Indic languages and accents
|
|
38
38
|
|
|
39
39
|
Each provider is implemented as a separate package that you can install as needed:
|
|
40
40
|
|
|
@@ -42,39 +42,38 @@ Each provider is implemented as a separate package that you can install as neede
|
|
|
42
42
|
pnpm add @mastra/voice-openai@latest # Example for OpenAI
|
|
43
43
|
```
|
|
44
44
|
|
|
45
|
-
## Using the
|
|
45
|
+
## Using the listen method
|
|
46
46
|
|
|
47
47
|
The primary method for STT is the `listen()` method, which converts spoken audio into text. Here's how to use it:
|
|
48
48
|
|
|
49
49
|
```typescript
|
|
50
|
-
import { Agent } from
|
|
51
|
-
import { OpenAIVoice } from
|
|
52
|
-
import { getMicrophoneStream } from
|
|
50
|
+
import { Agent } from '@mastra/core/agent'
|
|
51
|
+
import { OpenAIVoice } from '@mastra/voice-openai'
|
|
52
|
+
import { getMicrophoneStream } from '@mastra/node-audio'
|
|
53
53
|
|
|
54
|
-
const voice = new OpenAIVoice()
|
|
54
|
+
const voice = new OpenAIVoice()
|
|
55
55
|
|
|
56
56
|
const agent = new Agent({
|
|
57
|
-
id:
|
|
58
|
-
name:
|
|
59
|
-
instructions:
|
|
60
|
-
|
|
61
|
-
model: "openai/gpt-5.1",
|
|
57
|
+
id: 'voice-agent',
|
|
58
|
+
name: 'Voice Agent',
|
|
59
|
+
instructions: 'You are a voice assistant that provides recommendations based on user input.',
|
|
60
|
+
model: 'openai/gpt-5.5',
|
|
62
61
|
voice,
|
|
63
|
-
})
|
|
62
|
+
})
|
|
64
63
|
|
|
65
|
-
const audioStream = getMicrophoneStream()
|
|
64
|
+
const audioStream = getMicrophoneStream() // Assume this function gets audio input
|
|
66
65
|
|
|
67
66
|
const transcript = await agent.voice.listen(audioStream, {
|
|
68
|
-
filetype:
|
|
69
|
-
})
|
|
67
|
+
filetype: 'm4a', // Optional: specify the audio file type
|
|
68
|
+
})
|
|
70
69
|
|
|
71
|
-
console.log(`User said: ${transcript}`)
|
|
70
|
+
console.log(`User said: ${transcript}`)
|
|
72
71
|
|
|
73
72
|
const { text } = await agent.generate(
|
|
74
73
|
`Based on what the user said, provide them a recommendation: ${transcript}`,
|
|
75
|
-
)
|
|
74
|
+
)
|
|
76
75
|
|
|
77
|
-
console.log(`Recommendation: ${text}`)
|
|
76
|
+
console.log(`Recommendation: ${text}`)
|
|
78
77
|
```
|
|
79
78
|
|
|
80
79
|
Check out the [Adding Voice to Agents](https://mastra.ai/docs/agents/adding-voice) documentation to learn how to use STT in an agent.
|
|
@@ -19,30 +19,30 @@ The **`speaker`** option allows you to select different voices for speech synthe
|
|
|
19
19
|
```typescript
|
|
20
20
|
const voice = new OpenAIVoice({
|
|
21
21
|
speechModel: {
|
|
22
|
-
name:
|
|
22
|
+
name: 'tts-1-hd',
|
|
23
23
|
apiKey: process.env.OPENAI_API_KEY,
|
|
24
24
|
},
|
|
25
|
-
speaker:
|
|
26
|
-
})
|
|
25
|
+
speaker: 'alloy',
|
|
26
|
+
})
|
|
27
27
|
|
|
28
28
|
// If using default settings the configuration can be simplified to:
|
|
29
|
-
const voice = new OpenAIVoice()
|
|
29
|
+
const voice = new OpenAIVoice()
|
|
30
30
|
```
|
|
31
31
|
|
|
32
|
-
## Available
|
|
32
|
+
## Available providers
|
|
33
33
|
|
|
34
34
|
Mastra supports a wide range of Text-to-Speech providers, each with their own unique capabilities and voice options. You can choose the provider that best suits your application's needs:
|
|
35
35
|
|
|
36
|
-
- [**OpenAI**](https://mastra.ai/reference/voice/openai)
|
|
37
|
-
- [**Azure**](https://mastra.ai/reference/voice/azure)
|
|
38
|
-
- [**ElevenLabs**](https://mastra.ai/reference/voice/elevenlabs)
|
|
39
|
-
- [**PlayAI**](https://mastra.ai/reference/voice/playai)
|
|
40
|
-
- [**Google**](https://mastra.ai/reference/voice/google)
|
|
41
|
-
- [**Cloudflare**](https://mastra.ai/reference/voice/cloudflare)
|
|
42
|
-
- [**Deepgram**](https://mastra.ai/reference/voice/deepgram)
|
|
43
|
-
- [**Speechify**](https://mastra.ai/reference/voice/speechify)
|
|
44
|
-
- [**Sarvam**](https://mastra.ai/reference/voice/sarvam)
|
|
45
|
-
- [**Murf**](https://mastra.ai/reference/voice/murf)
|
|
36
|
+
- [**OpenAI**](https://mastra.ai/reference/voice/openai): High-quality voices with natural intonation and expression
|
|
37
|
+
- [**Azure**](https://mastra.ai/reference/voice/azure): Microsoft's speech service with a wide range of voices and languages
|
|
38
|
+
- [**ElevenLabs**](https://mastra.ai/reference/voice/elevenlabs): Ultra-realistic voices with emotion and fine-grained control
|
|
39
|
+
- [**PlayAI**](https://mastra.ai/reference/voice/playai): Specialized in natural-sounding voices with various styles
|
|
40
|
+
- [**Google**](https://mastra.ai/reference/voice/google): Google's speech synthesis with multilingual support
|
|
41
|
+
- [**Cloudflare**](https://mastra.ai/reference/voice/cloudflare): Edge-optimized speech synthesis for low-latency applications
|
|
42
|
+
- [**Deepgram**](https://mastra.ai/reference/voice/deepgram): AI-powered speech technology with high accuracy
|
|
43
|
+
- [**Speechify**](https://mastra.ai/reference/voice/speechify): Text-to-speech optimized for readability and accessibility
|
|
44
|
+
- [**Sarvam**](https://mastra.ai/reference/voice/sarvam): Specialized in Indic languages and accents
|
|
45
|
+
- [**Murf**](https://mastra.ai/reference/voice/murf): Studio-quality voice overs with customizable parameters
|
|
46
46
|
|
|
47
47
|
Each provider is implemented as a separate package that you can install as needed:
|
|
48
48
|
|
|
@@ -50,35 +50,34 @@ Each provider is implemented as a separate package that you can install as neede
|
|
|
50
50
|
pnpm add @mastra/voice-openai@latest # Example for OpenAI
|
|
51
51
|
```
|
|
52
52
|
|
|
53
|
-
## Using the
|
|
53
|
+
## Using the speak method
|
|
54
54
|
|
|
55
55
|
The primary method for TTS is the `speak()` method, which converts text to speech. This method can accept options that allow you to specify the speaker and other provider-specific options. Here's how to use it:
|
|
56
56
|
|
|
57
57
|
```typescript
|
|
58
|
-
import { Agent } from
|
|
59
|
-
import { OpenAIVoice } from
|
|
58
|
+
import { Agent } from '@mastra/core/agent'
|
|
59
|
+
import { OpenAIVoice } from '@mastra/voice-openai'
|
|
60
60
|
|
|
61
|
-
const voice = new OpenAIVoice()
|
|
61
|
+
const voice = new OpenAIVoice()
|
|
62
62
|
|
|
63
63
|
const agent = new Agent({
|
|
64
|
-
id:
|
|
65
|
-
name:
|
|
66
|
-
instructions:
|
|
67
|
-
|
|
68
|
-
model: "openai/gpt-5.1",
|
|
64
|
+
id: 'voice-agent',
|
|
65
|
+
name: 'Voice Agent',
|
|
66
|
+
instructions: 'You are a voice assistant that can help users with their tasks.',
|
|
67
|
+
model: 'openai/gpt-5.5',
|
|
69
68
|
voice,
|
|
70
|
-
})
|
|
69
|
+
})
|
|
71
70
|
|
|
72
|
-
const { text } = await agent.generate(
|
|
71
|
+
const { text } = await agent.generate('What color is the sky?')
|
|
73
72
|
|
|
74
73
|
// Convert the text to speech, returned as an audio stream
|
|
75
74
|
const readableStream = await voice.speak(text, {
|
|
76
|
-
speaker:
|
|
75
|
+
speaker: 'default', // Optional: specify a speaker
|
|
77
76
|
properties: {
|
|
78
77
|
speed: 1.0, // Optional: adjust speech speed
|
|
79
|
-
pitch:
|
|
78
|
+
pitch: 'default', // Optional: specify pitch if supported
|
|
80
79
|
},
|
|
81
|
-
})
|
|
80
|
+
})
|
|
82
81
|
```
|
|
83
82
|
|
|
84
83
|
Check out the [Adding Voice to Agents](https://mastra.ai/docs/agents/adding-voice) documentation to learn how to use TTS in an agent.
|
|
@@ -4,25 +4,25 @@ The CompositeVoice class allows you to combine different voice providers for tex
|
|
|
4
4
|
|
|
5
5
|
CompositeVoice supports both Mastra voice providers and AI SDK model providers
|
|
6
6
|
|
|
7
|
-
## Constructor
|
|
7
|
+
## Constructor parameters
|
|
8
8
|
|
|
9
|
-
**config
|
|
9
|
+
**config** (`object`): Configuration object for the composite voice service
|
|
10
10
|
|
|
11
|
-
**config.input
|
|
11
|
+
**config.input** (`MastraVoice | TranscriptionModel`): Voice provider or AI SDK transcription model to use for speech-to-text operations. AI SDK models are automatically wrapped.
|
|
12
12
|
|
|
13
|
-
**config.output
|
|
13
|
+
**config.output** (`MastraVoice | SpeechModel`): Voice provider or AI SDK speech model to use for text-to-speech operations. AI SDK models are automatically wrapped.
|
|
14
14
|
|
|
15
|
-
**config.realtime
|
|
15
|
+
**config.realtime** (`MastraVoice`): Voice provider to use for real-time speech-to-speech operations
|
|
16
16
|
|
|
17
17
|
## Methods
|
|
18
18
|
|
|
19
|
-
### speak()
|
|
19
|
+
### `speak()`
|
|
20
20
|
|
|
21
21
|
Converts text to speech using the configured speaking provider.
|
|
22
22
|
|
|
23
|
-
**input
|
|
23
|
+
**input** (`string | NodeJS.ReadableStream`): Text to convert to speech
|
|
24
24
|
|
|
25
|
-
**options
|
|
25
|
+
**options** (`object`): Provider-specific options passed to the speaking provider
|
|
26
26
|
|
|
27
27
|
Notes:
|
|
28
28
|
|
|
@@ -30,13 +30,13 @@ Notes:
|
|
|
30
30
|
- Options are passed through to the configured speaking provider
|
|
31
31
|
- Returns a stream of audio data
|
|
32
32
|
|
|
33
|
-
### listen()
|
|
33
|
+
### `listen()`
|
|
34
34
|
|
|
35
35
|
Converts speech to text using the configured listening provider.
|
|
36
36
|
|
|
37
|
-
**audioStream
|
|
37
|
+
**audioStream** (`NodeJS.ReadableStream`): Audio stream to convert to text
|
|
38
38
|
|
|
39
|
-
**options
|
|
39
|
+
**options** (`object`): Provider-specific options passed to the listening provider
|
|
40
40
|
|
|
41
41
|
Notes:
|
|
42
42
|
|
|
@@ -44,13 +44,13 @@ Notes:
|
|
|
44
44
|
- Options are passed through to the configured listening provider
|
|
45
45
|
- Returns either a string or a stream of transcribed text, depending on the provider
|
|
46
46
|
|
|
47
|
-
### getSpeakers()
|
|
47
|
+
### `getSpeakers()`
|
|
48
48
|
|
|
49
49
|
Returns a list of available voices from the speaking provider, where each node contains:
|
|
50
50
|
|
|
51
|
-
**voiceId
|
|
51
|
+
**voiceId** (`string`): Unique identifier for the voice
|
|
52
52
|
|
|
53
|
-
**key
|
|
53
|
+
**key** (`value`): Additional voice properties that vary by provider (e.g., name, language)
|
|
54
54
|
|
|
55
55
|
Notes:
|
|
56
56
|
|
|
@@ -59,30 +59,30 @@ Notes:
|
|
|
59
59
|
- Each voice object will have at least a voiceId property
|
|
60
60
|
- Additional voice properties depend on the speaking provider
|
|
61
61
|
|
|
62
|
-
## Usage
|
|
62
|
+
## Usage examples
|
|
63
63
|
|
|
64
64
|
### Using Mastra Voice Providers
|
|
65
65
|
|
|
66
66
|
```typescript
|
|
67
|
-
import { CompositeVoice } from
|
|
68
|
-
import { OpenAIVoice } from
|
|
69
|
-
import { PlayAIVoice } from
|
|
67
|
+
import { CompositeVoice } from '@mastra/core/voice'
|
|
68
|
+
import { OpenAIVoice } from '@mastra/voice-openai'
|
|
69
|
+
import { PlayAIVoice } from '@mastra/voice-playai'
|
|
70
70
|
|
|
71
71
|
// Create voice providers
|
|
72
|
-
const openai = new OpenAIVoice()
|
|
73
|
-
const playai = new PlayAIVoice()
|
|
72
|
+
const openai = new OpenAIVoice()
|
|
73
|
+
const playai = new PlayAIVoice()
|
|
74
74
|
|
|
75
75
|
// Use OpenAI for listening (speech-to-text) and PlayAI for speaking (text-to-speech)
|
|
76
76
|
const voice = new CompositeVoice({
|
|
77
77
|
input: openai,
|
|
78
78
|
output: playai,
|
|
79
|
-
})
|
|
79
|
+
})
|
|
80
80
|
|
|
81
81
|
// Convert speech to text using OpenAI
|
|
82
|
-
const text = await voice.listen(audioStream)
|
|
82
|
+
const text = await voice.listen(audioStream)
|
|
83
83
|
|
|
84
84
|
// Convert text to speech using PlayAI
|
|
85
|
-
const audio = await voice.speak(
|
|
85
|
+
const audio = await voice.speak('Hello, world!')
|
|
86
86
|
```
|
|
87
87
|
|
|
88
88
|
### Using AI SDK Model Providers
|
|
@@ -90,19 +90,19 @@ const audio = await voice.speak("Hello, world!");
|
|
|
90
90
|
You can pass AI SDK transcription and speech models directly to CompositeVoice:
|
|
91
91
|
|
|
92
92
|
```typescript
|
|
93
|
-
import { CompositeVoice } from
|
|
94
|
-
import { openai } from
|
|
95
|
-
import { elevenlabs } from
|
|
93
|
+
import { CompositeVoice } from '@mastra/core/voice'
|
|
94
|
+
import { openai } from '@ai-sdk/openai'
|
|
95
|
+
import { elevenlabs } from '@ai-sdk/elevenlabs'
|
|
96
96
|
|
|
97
97
|
// Use AI SDK models directly - they will be auto-wrapped
|
|
98
98
|
const voice = new CompositeVoice({
|
|
99
|
-
input: openai.transcription('whisper-1'),
|
|
100
|
-
output: elevenlabs.speech('eleven_turbo_v2'),
|
|
101
|
-
})
|
|
99
|
+
input: openai.transcription('whisper-1'), // AI SDK transcription
|
|
100
|
+
output: elevenlabs.speech('eleven_turbo_v2'), // AI SDK speech
|
|
101
|
+
})
|
|
102
102
|
|
|
103
103
|
// Works the same way as with Mastra providers
|
|
104
|
-
const text = await voice.listen(audioStream)
|
|
105
|
-
const audio = await voice.speak(
|
|
104
|
+
const text = await voice.listen(audioStream)
|
|
105
|
+
const audio = await voice.speak('Hello from AI SDK!')
|
|
106
106
|
```
|
|
107
107
|
|
|
108
108
|
### Mix and Match
|
|
@@ -110,12 +110,12 @@ const audio = await voice.speak("Hello from AI SDK!");
|
|
|
110
110
|
You can combine Mastra providers with AI SDK models:
|
|
111
111
|
|
|
112
112
|
```typescript
|
|
113
|
-
import { CompositeVoice } from
|
|
114
|
-
import { PlayAIVoice } from
|
|
115
|
-
import { groq } from
|
|
113
|
+
import { CompositeVoice } from '@mastra/core/voice'
|
|
114
|
+
import { PlayAIVoice } from '@mastra/voice-playai'
|
|
115
|
+
import { groq } from '@ai-sdk/groq'
|
|
116
116
|
|
|
117
117
|
const voice = new CompositeVoice({
|
|
118
|
-
input: groq.transcription('whisper-large-v3'),
|
|
119
|
-
output: new PlayAIVoice(),
|
|
120
|
-
})
|
|
118
|
+
input: groq.transcription('whisper-large-v3'), // AI SDK for STT
|
|
119
|
+
output: new PlayAIVoice(), // Mastra for TTS
|
|
120
|
+
})
|
|
121
121
|
```
|
|
@@ -2,84 +2,90 @@
|
|
|
2
2
|
|
|
3
3
|
The OpenAIVoice class in Mastra provides text-to-speech and speech-to-text capabilities using OpenAI's models.
|
|
4
4
|
|
|
5
|
-
## Usage
|
|
5
|
+
## Usage example
|
|
6
6
|
|
|
7
7
|
```typescript
|
|
8
|
-
import { OpenAIVoice } from
|
|
8
|
+
import { OpenAIVoice } from '@mastra/voice-openai'
|
|
9
9
|
|
|
10
10
|
// Initialize with default configuration using environment variables
|
|
11
|
-
const voice = new OpenAIVoice()
|
|
11
|
+
const voice = new OpenAIVoice()
|
|
12
12
|
|
|
13
13
|
// Or initialize with specific configuration
|
|
14
14
|
const voiceWithConfig = new OpenAIVoice({
|
|
15
15
|
speechModel: {
|
|
16
|
-
name:
|
|
17
|
-
apiKey:
|
|
16
|
+
name: 'tts-1-hd',
|
|
17
|
+
apiKey: 'your-openai-api-key',
|
|
18
18
|
},
|
|
19
19
|
listeningModel: {
|
|
20
|
-
name:
|
|
21
|
-
apiKey:
|
|
20
|
+
name: 'whisper-1',
|
|
21
|
+
apiKey: 'your-openai-api-key',
|
|
22
22
|
},
|
|
23
|
-
speaker:
|
|
24
|
-
})
|
|
23
|
+
speaker: 'alloy', // Default voice
|
|
24
|
+
})
|
|
25
25
|
|
|
26
26
|
// Convert text to speech
|
|
27
|
-
const audioStream = await voice.speak(
|
|
28
|
-
speaker:
|
|
27
|
+
const audioStream = await voice.speak('Hello, how can I help you?', {
|
|
28
|
+
speaker: 'nova', // Override default voice
|
|
29
29
|
speed: 1.2, // Adjust speech speed
|
|
30
|
-
})
|
|
30
|
+
})
|
|
31
31
|
|
|
32
32
|
// Convert speech to text
|
|
33
33
|
const text = await voice.listen(audioStream, {
|
|
34
|
-
filetype:
|
|
35
|
-
})
|
|
34
|
+
filetype: 'mp3',
|
|
35
|
+
})
|
|
36
36
|
```
|
|
37
37
|
|
|
38
38
|
## Configuration
|
|
39
39
|
|
|
40
|
-
### Constructor
|
|
40
|
+
### Constructor options
|
|
41
41
|
|
|
42
|
-
**speechModel
|
|
42
|
+
**speechModel** (`OpenAIConfig`): Configuration for text-to-speech synthesis. (Default: `{ name: 'tts-1' }`)
|
|
43
43
|
|
|
44
|
-
**
|
|
44
|
+
**speechModel.name** (`'tts-1' | 'tts-1-hd' | 'whisper-1'`): Model name. Use 'tts-1-hd' for higher quality audio.
|
|
45
45
|
|
|
46
|
-
**
|
|
46
|
+
**speechModel.apiKey** (`string`): OpenAI API key. Falls back to OPENAI\_API\_KEY environment variable.
|
|
47
47
|
|
|
48
|
-
|
|
48
|
+
**listeningModel** (`OpenAIConfig`): Configuration for speech-to-text recognition. (Default: `{ name: 'whisper-1' }`)
|
|
49
49
|
|
|
50
|
-
**name
|
|
50
|
+
**listeningModel.name** (`'tts-1' | 'tts-1-hd' | 'whisper-1'`): Model name. Use 'whisper-1' for speech-to-text transcription.
|
|
51
51
|
|
|
52
|
-
**apiKey
|
|
52
|
+
**listeningModel.apiKey** (`string`): OpenAI API key. Falls back to OPENAI\_API\_KEY environment variable.
|
|
53
|
+
|
|
54
|
+
**speaker** (`OpenAIVoiceId`): Default voice ID for speech synthesis. (Default: `'alloy'`)
|
|
53
55
|
|
|
54
56
|
## Methods
|
|
55
57
|
|
|
56
|
-
### speak()
|
|
58
|
+
### `speak()`
|
|
57
59
|
|
|
58
60
|
Converts text to speech using OpenAI's text-to-speech models.
|
|
59
61
|
|
|
60
|
-
**input
|
|
62
|
+
**input** (`string | NodeJS.ReadableStream`): Text or text stream to convert to speech.
|
|
63
|
+
|
|
64
|
+
**options** (`Options`): Configuration options.
|
|
61
65
|
|
|
62
|
-
**options.speaker
|
|
66
|
+
**options.speaker** (`OpenAIVoiceId`): Voice ID to use for speech synthesis.
|
|
63
67
|
|
|
64
|
-
**options.speed
|
|
68
|
+
**options.speed** (`number`): Speech speed multiplier.
|
|
65
69
|
|
|
66
70
|
Returns: `Promise<NodeJS.ReadableStream>`
|
|
67
71
|
|
|
68
|
-
### listen()
|
|
72
|
+
### `listen()`
|
|
69
73
|
|
|
70
74
|
Transcribes audio using OpenAI's Whisper model.
|
|
71
75
|
|
|
72
|
-
**audioStream
|
|
76
|
+
**audioStream** (`NodeJS.ReadableStream`): Audio stream to transcribe.
|
|
77
|
+
|
|
78
|
+
**options** (`Options`): Configuration options.
|
|
73
79
|
|
|
74
|
-
**options.filetype
|
|
80
|
+
**options.filetype** (`string`): Audio format of the input stream.
|
|
75
81
|
|
|
76
82
|
Returns: `Promise<string>`
|
|
77
83
|
|
|
78
|
-
### getSpeakers()
|
|
84
|
+
### `getSpeakers()`
|
|
79
85
|
|
|
80
86
|
Returns an array of available voice options, where each node contains:
|
|
81
87
|
|
|
82
|
-
**voiceId
|
|
88
|
+
**voiceId** (`string`): Unique identifier for the voice
|
|
83
89
|
|
|
84
90
|
## Notes
|
|
85
91
|
|
|
@@ -2,123 +2,123 @@
|
|
|
2
2
|
|
|
3
3
|
The `getSpeakers()` method retrieves a list of available voice options (speakers) from the voice provider. This allows applications to present users with voice choices or programmatically select the most appropriate voice for different contexts.
|
|
4
4
|
|
|
5
|
-
## Usage
|
|
5
|
+
## Usage example
|
|
6
6
|
|
|
7
7
|
```typescript
|
|
8
|
-
import { OpenAIVoice } from
|
|
9
|
-
import { ElevenLabsVoice } from
|
|
8
|
+
import { OpenAIVoice } from '@mastra/voice-openai'
|
|
9
|
+
import { ElevenLabsVoice } from '@mastra/voice-elevenlabs'
|
|
10
10
|
|
|
11
11
|
// Initialize voice providers
|
|
12
|
-
const openaiVoice = new OpenAIVoice()
|
|
12
|
+
const openaiVoice = new OpenAIVoice()
|
|
13
13
|
const elevenLabsVoice = new ElevenLabsVoice({
|
|
14
14
|
apiKey: process.env.ELEVENLABS_API_KEY,
|
|
15
|
-
})
|
|
15
|
+
})
|
|
16
16
|
|
|
17
17
|
// Get available speakers from OpenAI
|
|
18
|
-
const openaiSpeakers = await openaiVoice.getSpeakers()
|
|
19
|
-
console.log(
|
|
18
|
+
const openaiSpeakers = await openaiVoice.getSpeakers()
|
|
19
|
+
console.log('OpenAI voices:', openaiSpeakers)
|
|
20
20
|
// Example output: [{ voiceId: "alloy" }, { voiceId: "echo" }, { voiceId: "fable" }, ...]
|
|
21
21
|
|
|
22
22
|
// Get available speakers from ElevenLabs
|
|
23
|
-
const elevenLabsSpeakers = await elevenLabsVoice.getSpeakers()
|
|
24
|
-
console.log(
|
|
23
|
+
const elevenLabsSpeakers = await elevenLabsVoice.getSpeakers()
|
|
24
|
+
console.log('ElevenLabs voices:', elevenLabsSpeakers)
|
|
25
25
|
// Example output: [{ voiceId: "21m00Tcm4TlvDq8ikWAM", name: "Rachel" }, ...]
|
|
26
26
|
|
|
27
27
|
// Use a specific voice for speech
|
|
28
|
-
const text =
|
|
29
|
-
await openaiVoice.speak(text, { speaker: openaiSpeakers[2].voiceId })
|
|
30
|
-
await elevenLabsVoice.speak(text, { speaker: elevenLabsSpeakers[0].voiceId })
|
|
28
|
+
const text = 'Hello, this is a test of different voices.'
|
|
29
|
+
await openaiVoice.speak(text, { speaker: openaiSpeakers[2].voiceId })
|
|
30
|
+
await elevenLabsVoice.speak(text, { speaker: elevenLabsSpeakers[0].voiceId })
|
|
31
31
|
```
|
|
32
32
|
|
|
33
33
|
## Parameters
|
|
34
34
|
|
|
35
|
-
This method
|
|
35
|
+
This method doesn't accept any parameters.
|
|
36
36
|
|
|
37
|
-
## Return
|
|
37
|
+
## Return value
|
|
38
38
|
|
|
39
|
-
**Promise\<Array<{ voiceId: string } & TSpeakerMetadata
|
|
39
|
+
**Promise\<Array<{ voiceId: string } & TSpeakerMetadata>>** (`Promise`): A promise that resolves to an array of voice options, where each option contains at least a `voiceId` property and may include additional provider-specific metadata.
|
|
40
40
|
|
|
41
|
-
## Provider-
|
|
41
|
+
## Provider-specific metadata
|
|
42
42
|
|
|
43
43
|
Different voice providers return different metadata for their voices:
|
|
44
44
|
|
|
45
45
|
**OpenAI**:
|
|
46
46
|
|
|
47
|
-
**voiceId
|
|
47
|
+
**voiceId** (`string`): Unique identifier for the voice (e.g., 'alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer')
|
|
48
48
|
|
|
49
49
|
**OpenAI Realtime**:
|
|
50
50
|
|
|
51
|
-
**voiceId
|
|
51
|
+
**voiceId** (`string`): Unique identifier for the voice (e.g., 'alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer')
|
|
52
52
|
|
|
53
53
|
**Deepgram**:
|
|
54
54
|
|
|
55
|
-
**voiceId
|
|
55
|
+
**voiceId** (`string`): Unique identifier for the voice
|
|
56
56
|
|
|
57
|
-
**language
|
|
57
|
+
**language** (`string`): Language code embedded in the voice ID (e.g., 'en')
|
|
58
58
|
|
|
59
59
|
**ElevenLabs**:
|
|
60
60
|
|
|
61
|
-
**voiceId
|
|
61
|
+
**voiceId** (`string`): Unique identifier for the voice
|
|
62
62
|
|
|
63
|
-
**name
|
|
63
|
+
**name** (`string`): Human-readable name of the voice
|
|
64
64
|
|
|
65
|
-
**category
|
|
65
|
+
**category** (`string`): Category of the voice (e.g., 'premade', 'cloned')
|
|
66
66
|
|
|
67
67
|
**Google**:
|
|
68
68
|
|
|
69
|
-
**voiceId
|
|
69
|
+
**voiceId** (`string`): Unique identifier for the voice
|
|
70
70
|
|
|
71
|
-
**languageCodes
|
|
71
|
+
**languageCodes** (`string[]`): Array of language codes supported by the voice (e.g., \['en-US'])
|
|
72
72
|
|
|
73
73
|
**Azure**:
|
|
74
74
|
|
|
75
|
-
**voiceId
|
|
75
|
+
**voiceId** (`string`): Unique identifier for the voice
|
|
76
76
|
|
|
77
|
-
**language
|
|
77
|
+
**language** (`string`): Language code extracted from the voice ID (e.g., 'en')
|
|
78
78
|
|
|
79
|
-
**region
|
|
79
|
+
**region** (`string`): Region code extracted from the voice ID (e.g., 'US')
|
|
80
80
|
|
|
81
81
|
**Murf**:
|
|
82
82
|
|
|
83
|
-
**voiceId
|
|
83
|
+
**voiceId** (`string`): Unique identifier for the voice
|
|
84
84
|
|
|
85
|
-
**name
|
|
85
|
+
**name** (`string`): Name of the voice (same as voiceId)
|
|
86
86
|
|
|
87
|
-
**language
|
|
87
|
+
**language** (`string`): Language code extracted from the voice ID (e.g., 'en')
|
|
88
88
|
|
|
89
|
-
**gender
|
|
89
|
+
**gender** (`string`): Gender of the voice (always 'neutral' in the current implementation)
|
|
90
90
|
|
|
91
91
|
**PlayAI**:
|
|
92
92
|
|
|
93
|
-
**voiceId
|
|
93
|
+
**voiceId** (`string`): Unique identifier for the voice (S3 URL to manifest.json)
|
|
94
94
|
|
|
95
|
-
**name
|
|
95
|
+
**name** (`string`): Human-readable name of the voice (e.g., 'Angelo', 'Arsenio')
|
|
96
96
|
|
|
97
|
-
**accent
|
|
97
|
+
**accent** (`string`): Accent of the voice (e.g., 'US', 'Irish', 'US African American')
|
|
98
98
|
|
|
99
|
-
**gender
|
|
99
|
+
**gender** (`string`): Gender of the voice ('M' or 'F')
|
|
100
100
|
|
|
101
|
-
**age
|
|
101
|
+
**age** (`string`): Age category of the voice (e.g., 'Young', 'Middle')
|
|
102
102
|
|
|
103
|
-
**style
|
|
103
|
+
**style** (`string`): Speaking style of the voice (e.g., 'Conversational')
|
|
104
104
|
|
|
105
105
|
**Speechify**:
|
|
106
106
|
|
|
107
|
-
**voiceId
|
|
107
|
+
**voiceId** (`string`): Unique identifier for the voice
|
|
108
108
|
|
|
109
|
-
**name
|
|
109
|
+
**name** (`string`): Human-readable name of the voice
|
|
110
110
|
|
|
111
|
-
**language
|
|
111
|
+
**language** (`string`): Language code of the voice (e.g., 'en-US')
|
|
112
112
|
|
|
113
113
|
**Sarvam**:
|
|
114
114
|
|
|
115
|
-
**voiceId
|
|
115
|
+
**voiceId** (`string`): Unique identifier for the voice
|
|
116
116
|
|
|
117
|
-
**name
|
|
117
|
+
**name** (`string`): Human-readable name of the voice
|
|
118
118
|
|
|
119
|
-
**language
|
|
119
|
+
**language** (`string`): Language of the voice (e.g., 'english', 'hindi')
|
|
120
120
|
|
|
121
|
-
**gender
|
|
121
|
+
**gender** (`string`): Gender of the voice ('male' or 'female')
|
|
122
122
|
|
|
123
123
|
## Notes
|
|
124
124
|
|