@mastra/voice-openai 0.12.1 → 0.12.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (128) hide show
  1. package/CHANGELOG.md +16 -0
  2. package/LICENSE.md +15 -0
  3. package/dist/_types/@internal_voice/dist/_types/@internal_ai-sdk-v5/dist/index.d.ts +8888 -0
  4. package/dist/_types/@internal_voice/dist/_types/@internal_core/dist/base/index.d.ts +31 -0
  5. package/dist/_types/@internal_voice/dist/_types/@internal_core/dist/logger/index.d.ts +217 -0
  6. package/dist/_types/@internal_voice/dist/_types/@internal_core/dist/request-context/index.d.ts +147 -0
  7. package/dist/_types/@internal_voice/dist/_types/@internal_core/dist/types/index.d.ts +3 -0
  8. package/dist/_types/@internal_voice/dist/_types/zod/v3/ZodError.d.ts +164 -0
  9. package/dist/_types/@internal_voice/dist/_types/zod/v3/errors.d.ts +5 -0
  10. package/dist/_types/@internal_voice/dist/_types/zod/v3/external.d.ts +6 -0
  11. package/dist/_types/@internal_voice/dist/_types/zod/v3/helpers/enumUtil.d.ts +8 -0
  12. package/dist/_types/@internal_voice/dist/_types/zod/v3/helpers/errorUtil.d.ts +9 -0
  13. package/dist/_types/@internal_voice/dist/_types/zod/v3/helpers/parseUtil.d.ts +78 -0
  14. package/dist/_types/@internal_voice/dist/_types/zod/v3/helpers/partialUtil.d.ts +8 -0
  15. package/dist/_types/@internal_voice/dist/_types/zod/v3/helpers/typeAliases.d.ts +2 -0
  16. package/dist/_types/@internal_voice/dist/_types/zod/v3/helpers/util.d.ts +85 -0
  17. package/dist/_types/@internal_voice/dist/_types/zod/v3/index.d.cts +4 -0
  18. package/dist/_types/@internal_voice/dist/_types/zod/v3/index.d.ts +4 -0
  19. package/dist/_types/@internal_voice/dist/_types/zod/v3/locales/en.d.ts +3 -0
  20. package/dist/_types/@internal_voice/dist/_types/zod/v3/standard-schema.d.ts +102 -0
  21. package/dist/_types/@internal_voice/dist/_types/zod/v3/types.d.ts +1034 -0
  22. package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/checks.d.ts +1 -0
  23. package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/coerce.d.ts +17 -0
  24. package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/compat.d.ts +50 -0
  25. package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/errors.d.ts +30 -0
  26. package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/external.d.ts +16 -0
  27. package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/from-json-schema.d.ts +12 -0
  28. package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/index.d.ts +4 -0
  29. package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/iso.d.ts +22 -0
  30. package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/parse.d.ts +31 -0
  31. package/dist/_types/@internal_voice/dist/_types/zod/v4/classic/schemas.d.ts +767 -0
  32. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/api.d.ts +325 -0
  33. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/checks.d.ts +278 -0
  34. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/core.d.ts +70 -0
  35. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/doc.d.ts +14 -0
  36. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/errors.d.ts +221 -0
  37. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/index.d.ts +16 -0
  38. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/json-schema-generator.d.ts +65 -0
  39. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/json-schema-processors.d.ts +49 -0
  40. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/json-schema.d.ts +88 -0
  41. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/parse.d.ts +49 -0
  42. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/regexes.d.ts +85 -0
  43. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/registries.d.ts +35 -0
  44. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/schemas.d.ts +1184 -0
  45. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/standard-schema.d.ts +126 -0
  46. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/to-json-schema.d.ts +114 -0
  47. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/util.d.ts +200 -0
  48. package/dist/_types/@internal_voice/dist/_types/zod/v4/core/versions.d.ts +5 -0
  49. package/dist/_types/@internal_voice/dist/_types/zod/v4/index.d.cts +3 -0
  50. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ar.d.ts +4 -0
  51. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/az.d.ts +4 -0
  52. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/be.d.ts +4 -0
  53. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/bg.d.ts +4 -0
  54. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ca.d.ts +4 -0
  55. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/cs.d.ts +4 -0
  56. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/da.d.ts +4 -0
  57. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/de.d.ts +4 -0
  58. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/el.d.ts +4 -0
  59. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/en.d.ts +4 -0
  60. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/eo.d.ts +4 -0
  61. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/es.d.ts +4 -0
  62. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/fa.d.ts +4 -0
  63. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/fi.d.ts +4 -0
  64. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/fr-CA.d.ts +4 -0
  65. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/fr.d.ts +4 -0
  66. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/he.d.ts +4 -0
  67. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/hr.d.ts +4 -0
  68. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/hu.d.ts +4 -0
  69. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/hy.d.ts +4 -0
  70. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/id.d.ts +4 -0
  71. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/index.d.ts +52 -0
  72. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/is.d.ts +4 -0
  73. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/it.d.ts +4 -0
  74. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ja.d.ts +4 -0
  75. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ka.d.ts +4 -0
  76. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/kh.d.ts +5 -0
  77. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/km.d.ts +4 -0
  78. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ko.d.ts +4 -0
  79. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/lt.d.ts +4 -0
  80. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/mk.d.ts +4 -0
  81. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ms.d.ts +4 -0
  82. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/nl.d.ts +4 -0
  83. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/no.d.ts +4 -0
  84. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ota.d.ts +4 -0
  85. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/pl.d.ts +4 -0
  86. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ps.d.ts +4 -0
  87. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/pt.d.ts +4 -0
  88. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ro.d.ts +4 -0
  89. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ru.d.ts +4 -0
  90. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/sl.d.ts +4 -0
  91. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/sv.d.ts +4 -0
  92. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ta.d.ts +4 -0
  93. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/th.d.ts +4 -0
  94. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/tr.d.ts +4 -0
  95. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ua.d.ts +5 -0
  96. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/uk.d.ts +4 -0
  97. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/ur.d.ts +4 -0
  98. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/uz.d.ts +4 -0
  99. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/vi.d.ts +4 -0
  100. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/yo.d.ts +4 -0
  101. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/zh-CN.d.ts +4 -0
  102. package/dist/_types/@internal_voice/dist/_types/zod/v4/locales/zh-TW.d.ts +4 -0
  103. package/dist/_types/@internal_voice/dist/index.d.ts +16 -0
  104. package/dist/_types/@internal_voice/dist/voice/aisdk/index.d.ts +3 -0
  105. package/dist/_types/@internal_voice/dist/voice/aisdk/speech.d.ts +23 -0
  106. package/dist/_types/@internal_voice/dist/voice/aisdk/transcription.d.ts +22 -0
  107. package/dist/_types/@internal_voice/dist/voice/composite-voice.d.ts +72 -0
  108. package/dist/_types/@internal_voice/dist/voice/default-voice.d.ts +13 -0
  109. package/dist/_types/@internal_voice/dist/voice/index.d.ts +5 -0
  110. package/dist/_types/@internal_voice/dist/voice/voice.d.ts +172 -0
  111. package/dist/docs/SKILL.md +2 -2
  112. package/dist/docs/assets/SOURCE_MAP.json +1 -1
  113. package/dist/docs/references/docs-agents-adding-voice.md +186 -158
  114. package/dist/docs/references/docs-voice-overview.md +650 -379
  115. package/dist/docs/references/docs-voice-speech-to-text.md +27 -28
  116. package/dist/docs/references/docs-voice-text-to-speech.md +28 -29
  117. package/dist/docs/references/reference-voice-composite-voice.md +37 -37
  118. package/dist/docs/references/reference-voice-openai.md +36 -30
  119. package/dist/docs/references/reference-voice-voice.getSpeakers.md +45 -45
  120. package/dist/docs/references/reference-voice-voice.listen.md +64 -58
  121. package/dist/docs/references/reference-voice-voice.speak.md +65 -55
  122. package/dist/index.cjs +260 -2
  123. package/dist/index.cjs.map +1 -1
  124. package/dist/index.d.ts +1 -1
  125. package/dist/index.d.ts.map +1 -1
  126. package/dist/index.js +259 -1
  127. package/dist/index.js.map +1 -1
  128. package/package.json +10 -11
@@ -15,26 +15,26 @@ To use STT in Mastra, you need to provide a `listeningModel` when initializing t
15
15
  ```typescript
16
16
  const voice = new OpenAIVoice({
17
17
  listeningModel: {
18
- name: "whisper-1",
18
+ name: 'whisper-1',
19
19
  apiKey: process.env.OPENAI_API_KEY,
20
20
  },
21
- });
21
+ })
22
22
 
23
23
  // If using default settings the configuration can be simplified to:
24
- const voice = new OpenAIVoice();
24
+ const voice = new OpenAIVoice()
25
25
  ```
26
26
 
27
- ## Available Providers
27
+ ## Available providers
28
28
 
29
29
  Mastra supports several Speech-to-Text providers, each with their own capabilities and strengths:
30
30
 
31
- - [**OpenAI**](https://mastra.ai/reference/voice/openai) - High-accuracy transcription with Whisper models
32
- - [**Azure**](https://mastra.ai/reference/voice/azure) - Microsoft's speech recognition with enterprise-grade reliability
33
- - [**ElevenLabs**](https://mastra.ai/reference/voice/elevenlabs) - Advanced speech recognition with support for multiple languages
34
- - [**Google**](https://mastra.ai/reference/voice/google) - Google's speech recognition with extensive language support
35
- - [**Cloudflare**](https://mastra.ai/reference/voice/cloudflare) - Edge-optimized speech recognition for low-latency applications
36
- - [**Deepgram**](https://mastra.ai/reference/voice/deepgram) - AI-powered speech recognition with high accuracy for various accents
37
- - [**Sarvam**](https://mastra.ai/reference/voice/sarvam) - Specialized in Indic languages and accents
31
+ - [**OpenAI**](https://mastra.ai/reference/voice/openai): High-accuracy transcription with Whisper models
32
+ - [**Azure**](https://mastra.ai/reference/voice/azure): Microsoft's speech recognition with enterprise-grade reliability
33
+ - [**ElevenLabs**](https://mastra.ai/reference/voice/elevenlabs): Advanced speech recognition with support for multiple languages
34
+ - [**Google**](https://mastra.ai/reference/voice/google): Google's speech recognition with extensive language support
35
+ - [**Cloudflare**](https://mastra.ai/reference/voice/cloudflare): Edge-optimized speech recognition for low-latency applications
36
+ - [**Deepgram**](https://mastra.ai/reference/voice/deepgram): AI-powered speech recognition with high accuracy for various accents
37
+ - [**Sarvam**](https://mastra.ai/reference/voice/sarvam): Specialized in Indic languages and accents
38
38
 
39
39
  Each provider is implemented as a separate package that you can install as needed:
40
40
 
@@ -42,39 +42,38 @@ Each provider is implemented as a separate package that you can install as neede
42
42
  pnpm add @mastra/voice-openai@latest # Example for OpenAI
43
43
  ```
44
44
 
45
- ## Using the Listen Method
45
+ ## Using the listen method
46
46
 
47
47
  The primary method for STT is the `listen()` method, which converts spoken audio into text. Here's how to use it:
48
48
 
49
49
  ```typescript
50
- import { Agent } from "@mastra/core/agent";
51
- import { OpenAIVoice } from "@mastra/voice-openai";
52
- import { getMicrophoneStream } from "@mastra/node-audio";
50
+ import { Agent } from '@mastra/core/agent'
51
+ import { OpenAIVoice } from '@mastra/voice-openai'
52
+ import { getMicrophoneStream } from '@mastra/node-audio'
53
53
 
54
- const voice = new OpenAIVoice();
54
+ const voice = new OpenAIVoice()
55
55
 
56
56
  const agent = new Agent({
57
- id: "voice-agent",
58
- name: "Voice Agent",
59
- instructions:
60
- "You are a voice assistant that provides recommendations based on user input.",
61
- model: "openai/gpt-5.1",
57
+ id: 'voice-agent',
58
+ name: 'Voice Agent',
59
+ instructions: 'You are a voice assistant that provides recommendations based on user input.',
60
+ model: 'openai/gpt-5.5',
62
61
  voice,
63
- });
62
+ })
64
63
 
65
- const audioStream = getMicrophoneStream(); // Assume this function gets audio input
64
+ const audioStream = getMicrophoneStream() // Assume this function gets audio input
66
65
 
67
66
  const transcript = await agent.voice.listen(audioStream, {
68
- filetype: "m4a", // Optional: specify the audio file type
69
- });
67
+ filetype: 'm4a', // Optional: specify the audio file type
68
+ })
70
69
 
71
- console.log(`User said: ${transcript}`);
70
+ console.log(`User said: ${transcript}`)
72
71
 
73
72
  const { text } = await agent.generate(
74
73
  `Based on what the user said, provide them a recommendation: ${transcript}`,
75
- );
74
+ )
76
75
 
77
- console.log(`Recommendation: ${text}`);
76
+ console.log(`Recommendation: ${text}`)
78
77
  ```
79
78
 
80
79
  Check out the [Adding Voice to Agents](https://mastra.ai/docs/agents/adding-voice) documentation to learn how to use STT in an agent.
@@ -19,30 +19,30 @@ The **`speaker`** option allows you to select different voices for speech synthe
19
19
  ```typescript
20
20
  const voice = new OpenAIVoice({
21
21
  speechModel: {
22
- name: "tts-1-hd",
22
+ name: 'tts-1-hd',
23
23
  apiKey: process.env.OPENAI_API_KEY,
24
24
  },
25
- speaker: "alloy",
26
- });
25
+ speaker: 'alloy',
26
+ })
27
27
 
28
28
  // If using default settings the configuration can be simplified to:
29
- const voice = new OpenAIVoice();
29
+ const voice = new OpenAIVoice()
30
30
  ```
31
31
 
32
- ## Available Providers
32
+ ## Available providers
33
33
 
34
34
  Mastra supports a wide range of Text-to-Speech providers, each with their own unique capabilities and voice options. You can choose the provider that best suits your application's needs:
35
35
 
36
- - [**OpenAI**](https://mastra.ai/reference/voice/openai) - High-quality voices with natural intonation and expression
37
- - [**Azure**](https://mastra.ai/reference/voice/azure) - Microsoft's speech service with a wide range of voices and languages
38
- - [**ElevenLabs**](https://mastra.ai/reference/voice/elevenlabs) - Ultra-realistic voices with emotion and fine-grained control
39
- - [**PlayAI**](https://mastra.ai/reference/voice/playai) - Specialized in natural-sounding voices with various styles
40
- - [**Google**](https://mastra.ai/reference/voice/google) - Google's speech synthesis with multilingual support
41
- - [**Cloudflare**](https://mastra.ai/reference/voice/cloudflare) - Edge-optimized speech synthesis for low-latency applications
42
- - [**Deepgram**](https://mastra.ai/reference/voice/deepgram) - AI-powered speech technology with high accuracy
43
- - [**Speechify**](https://mastra.ai/reference/voice/speechify) - Text-to-speech optimized for readability and accessibility
44
- - [**Sarvam**](https://mastra.ai/reference/voice/sarvam) - Specialized in Indic languages and accents
45
- - [**Murf**](https://mastra.ai/reference/voice/murf) - Studio-quality voice overs with customizable parameters
36
+ - [**OpenAI**](https://mastra.ai/reference/voice/openai): High-quality voices with natural intonation and expression
37
+ - [**Azure**](https://mastra.ai/reference/voice/azure): Microsoft's speech service with a wide range of voices and languages
38
+ - [**ElevenLabs**](https://mastra.ai/reference/voice/elevenlabs): Ultra-realistic voices with emotion and fine-grained control
39
+ - [**PlayAI**](https://mastra.ai/reference/voice/playai): Specialized in natural-sounding voices with various styles
40
+ - [**Google**](https://mastra.ai/reference/voice/google): Google's speech synthesis with multilingual support
41
+ - [**Cloudflare**](https://mastra.ai/reference/voice/cloudflare): Edge-optimized speech synthesis for low-latency applications
42
+ - [**Deepgram**](https://mastra.ai/reference/voice/deepgram): AI-powered speech technology with high accuracy
43
+ - [**Speechify**](https://mastra.ai/reference/voice/speechify): Text-to-speech optimized for readability and accessibility
44
+ - [**Sarvam**](https://mastra.ai/reference/voice/sarvam): Specialized in Indic languages and accents
45
+ - [**Murf**](https://mastra.ai/reference/voice/murf): Studio-quality voice overs with customizable parameters
46
46
 
47
47
  Each provider is implemented as a separate package that you can install as needed:
48
48
 
@@ -50,35 +50,34 @@ Each provider is implemented as a separate package that you can install as neede
50
50
  pnpm add @mastra/voice-openai@latest # Example for OpenAI
51
51
  ```
52
52
 
53
- ## Using the Speak Method
53
+ ## Using the speak method
54
54
 
55
55
  The primary method for TTS is the `speak()` method, which converts text to speech. This method accepts options that allow you to specify the speaker and other provider-specific settings. Here's how to use it:
56
56
 
57
57
  ```typescript
58
- import { Agent } from "@mastra/core/agent";
59
- import { OpenAIVoice } from "@mastra/voice-openai";
58
+ import { Agent } from '@mastra/core/agent'
59
+ import { OpenAIVoice } from '@mastra/voice-openai'
60
60
 
61
- const voice = new OpenAIVoice();
61
+ const voice = new OpenAIVoice()
62
62
 
63
63
  const agent = new Agent({
64
- id: "voice-agent",
65
- name: "Voice Agent",
66
- instructions:
67
- "You are a voice assistant that can help users with their tasks.",
68
- model: "openai/gpt-5.1",
64
+ id: 'voice-agent',
65
+ name: 'Voice Agent',
66
+ instructions: 'You are a voice assistant that can help users with their tasks.',
67
+ model: 'openai/gpt-5.5',
69
68
  voice,
70
- });
69
+ })
71
70
 
72
- const { text } = await agent.generate("What color is the sky?");
71
+ const { text } = await agent.generate('What color is the sky?')
73
72
 
74
73
  // Convert text to speech to an Audio Stream
75
74
  const readableStream = await voice.speak(text, {
76
- speaker: "default", // Optional: specify a speaker
75
+ speaker: 'default', // Optional: specify a speaker
77
76
  properties: {
78
77
  speed: 1.0, // Optional: adjust speech speed
79
- pitch: "default", // Optional: specify pitch if supported
78
+ pitch: 'default', // Optional: specify pitch if supported
80
79
  },
81
- });
80
+ })
82
81
  ```
83
82
 
84
83
  Check out the [Adding Voice to Agents](https://mastra.ai/docs/agents/adding-voice) documentation to learn how to use TTS in an agent.
@@ -4,25 +4,25 @@ The CompositeVoice class allows you to combine different voice providers for tex
4
4
 
5
5
  CompositeVoice supports both Mastra voice providers and AI SDK model providers.
6
6
 
7
- ## Constructor Parameters
7
+ ## Constructor parameters
8
8
 
9
- **config:** (`object`): Configuration object for the composite voice service
9
+ **config** (`object`): Configuration object for the composite voice service
10
10
 
11
- **config.input?:** (`MastraVoice | TranscriptionModel`): Voice provider or AI SDK transcription model to use for speech-to-text operations. AI SDK models are automatically wrapped.
11
+ **config.input** (`MastraVoice | TranscriptionModel`): Voice provider or AI SDK transcription model to use for speech-to-text operations. AI SDK models are automatically wrapped.
12
12
 
13
- **config.output?:** (`MastraVoice | SpeechModel`): Voice provider or AI SDK speech model to use for text-to-speech operations. AI SDK models are automatically wrapped.
13
+ **config.output** (`MastraVoice | SpeechModel`): Voice provider or AI SDK speech model to use for text-to-speech operations. AI SDK models are automatically wrapped.
14
14
 
15
- **config.realtime?:** (`MastraVoice`): Voice provider to use for real-time speech-to-speech operations
15
+ **config.realtime** (`MastraVoice`): Voice provider to use for real-time speech-to-speech operations
16
16
 
17
17
  ## Methods
18
18
 
19
- ### speak()
19
+ ### `speak()`
20
20
 
21
21
  Converts text to speech using the configured speaking provider.
22
22
 
23
- **input:** (`string | NodeJS.ReadableStream`): Text to convert to speech
23
+ **input** (`string | NodeJS.ReadableStream`): Text to convert to speech
24
24
 
25
- **options?:** (`object`): Provider-specific options passed to the speaking provider
25
+ **options** (`object`): Provider-specific options passed to the speaking provider
26
26
 
27
27
  Notes:
28
28
 
@@ -30,13 +30,13 @@ Notes:
30
30
  - Options are passed through to the configured speaking provider
31
31
  - Returns a stream of audio data
32
32
 
33
- ### listen()
33
+ ### `listen()`
34
34
 
35
35
  Converts speech to text using the configured listening provider.
36
36
 
37
- **audioStream:** (`NodeJS.ReadableStream`): Audio stream to convert to text
37
+ **audioStream** (`NodeJS.ReadableStream`): Audio stream to convert to text
38
38
 
39
- **options?:** (`object`): Provider-specific options passed to the listening provider
39
+ **options** (`object`): Provider-specific options passed to the listening provider
40
40
 
41
41
  Notes:
42
42
 
@@ -44,13 +44,13 @@ Notes:
44
44
  - Options are passed through to the configured listening provider
45
45
  - Returns either a string or a stream of transcribed text, depending on the provider
46
46
 
47
- ### getSpeakers()
47
+ ### `getSpeakers()`
48
48
 
49
49
  Returns a list of available voices from the speaking provider, where each node contains:
50
50
 
51
- **voiceId:** (`string`): Unique identifier for the voice
51
+ **voiceId** (`string`): Unique identifier for the voice
52
52
 
53
- **key?:** (`value`): Additional voice properties that vary by provider (e.g., name, language)
53
+ **key** (`value`): Additional voice properties that vary by provider (e.g., name, language)
54
54
 
55
55
  Notes:
56
56
 
@@ -59,30 +59,30 @@ Notes:
59
59
  - Each voice object will have at least a voiceId property
60
60
  - Additional voice properties depend on the speaking provider
61
61
 
62
- ## Usage Examples
62
+ ## Usage examples
63
63
 
64
64
  ### Using Mastra Voice Providers
65
65
 
66
66
  ```typescript
67
- import { CompositeVoice } from "@mastra/core/voice";
68
- import { OpenAIVoice } from "@mastra/voice-openai";
69
- import { PlayAIVoice } from "@mastra/voice-playai";
67
+ import { CompositeVoice } from '@mastra/core/voice'
68
+ import { OpenAIVoice } from '@mastra/voice-openai'
69
+ import { PlayAIVoice } from '@mastra/voice-playai'
70
70
 
71
71
  // Create voice providers
72
- const openai = new OpenAIVoice();
73
- const playai = new PlayAIVoice();
72
+ const openai = new OpenAIVoice()
73
+ const playai = new PlayAIVoice()
74
74
 
75
75
  // Use OpenAI for listening (speech-to-text) and PlayAI for speaking (text-to-speech)
76
76
  const voice = new CompositeVoice({
77
77
  input: openai,
78
78
  output: playai,
79
- });
79
+ })
80
80
 
81
81
  // Convert speech to text using OpenAI
82
- const text = await voice.listen(audioStream);
82
+ const text = await voice.listen(audioStream)
83
83
 
84
84
  // Convert text to speech using PlayAI
85
- const audio = await voice.speak("Hello, world!");
85
+ const audio = await voice.speak('Hello, world!')
86
86
  ```
87
87
 
88
88
  ### Using AI SDK Model Providers
@@ -90,19 +90,19 @@ const audio = await voice.speak("Hello, world!");
90
90
  You can pass AI SDK transcription and speech models directly to CompositeVoice:
91
91
 
92
92
  ```typescript
93
- import { CompositeVoice } from "@mastra/core/voice";
94
- import { openai } from "@ai-sdk/openai";
95
- import { elevenlabs } from "@ai-sdk/elevenlabs";
93
+ import { CompositeVoice } from '@mastra/core/voice'
94
+ import { openai } from '@ai-sdk/openai'
95
+ import { elevenlabs } from '@ai-sdk/elevenlabs'
96
96
 
97
97
  // Use AI SDK models directly - they will be auto-wrapped
98
98
  const voice = new CompositeVoice({
99
- input: openai.transcription('whisper-1'), // AI SDK transcription
100
- output: elevenlabs.speech('eleven_turbo_v2'), // AI SDK speech
101
- });
99
+ input: openai.transcription('whisper-1'), // AI SDK transcription
100
+ output: elevenlabs.speech('eleven_turbo_v2'), // AI SDK speech
101
+ })
102
102
 
103
103
  // Works the same way as with Mastra providers
104
- const text = await voice.listen(audioStream);
105
- const audio = await voice.speak("Hello from AI SDK!");
104
+ const text = await voice.listen(audioStream)
105
+ const audio = await voice.speak('Hello from AI SDK!')
106
106
  ```
107
107
 
108
108
  ### Mix and Match
@@ -110,12 +110,12 @@ const audio = await voice.speak("Hello from AI SDK!");
110
110
  You can combine Mastra providers with AI SDK models:
111
111
 
112
112
  ```typescript
113
- import { CompositeVoice } from "@mastra/core/voice";
114
- import { PlayAIVoice } from "@mastra/voice-playai";
115
- import { groq } from "@ai-sdk/groq";
113
+ import { CompositeVoice } from '@mastra/core/voice'
114
+ import { PlayAIVoice } from '@mastra/voice-playai'
115
+ import { groq } from '@ai-sdk/groq'
116
116
 
117
117
  const voice = new CompositeVoice({
118
- input: groq.transcription('whisper-large-v3'), // AI SDK for STT
119
- output: new PlayAIVoice(), // Mastra for TTS
120
- });
118
+ input: groq.transcription('whisper-large-v3'), // AI SDK for STT
119
+ output: new PlayAIVoice(), // Mastra for TTS
120
+ })
121
121
  ```
@@ -2,84 +2,90 @@
2
2
 
3
3
  The OpenAIVoice class in Mastra provides text-to-speech and speech-to-text capabilities using OpenAI's models.
4
4
 
5
- ## Usage Example
5
+ ## Usage example
6
6
 
7
7
  ```typescript
8
- import { OpenAIVoice } from "@mastra/voice-openai";
8
+ import { OpenAIVoice } from '@mastra/voice-openai'
9
9
 
10
10
  // Initialize with default configuration using environment variables
11
- const voice = new OpenAIVoice();
11
+ const voice = new OpenAIVoice()
12
12
 
13
13
  // Or initialize with specific configuration
14
14
  const voiceWithConfig = new OpenAIVoice({
15
15
  speechModel: {
16
- name: "tts-1-hd",
17
- apiKey: "your-openai-api-key",
16
+ name: 'tts-1-hd',
17
+ apiKey: 'your-openai-api-key',
18
18
  },
19
19
  listeningModel: {
20
- name: "whisper-1",
21
- apiKey: "your-openai-api-key",
20
+ name: 'whisper-1',
21
+ apiKey: 'your-openai-api-key',
22
22
  },
23
- speaker: "alloy", // Default voice
24
- });
23
+ speaker: 'alloy', // Default voice
24
+ })
25
25
 
26
26
  // Convert text to speech
27
- const audioStream = await voice.speak("Hello, how can I help you?", {
28
- speaker: "nova", // Override default voice
27
+ const audioStream = await voice.speak('Hello, how can I help you?', {
28
+ speaker: 'nova', // Override default voice
29
29
  speed: 1.2, // Adjust speech speed
30
- });
30
+ })
31
31
 
32
32
  // Convert speech to text
33
33
  const text = await voice.listen(audioStream, {
34
- filetype: "mp3",
35
- });
34
+ filetype: 'mp3',
35
+ })
36
36
  ```
37
37
 
38
38
  ## Configuration
39
39
 
40
- ### Constructor Options
40
+ ### Constructor options
41
41
 
42
- **speechModel?:** (`OpenAIConfig`): Configuration for text-to-speech synthesis. (Default: `{ name: 'tts-1' }`)
42
+ **speechModel** (`OpenAIConfig`): Configuration for text-to-speech synthesis. (Default: `{ name: 'tts-1' }`)
43
43
 
44
- **listeningModel?:** (`OpenAIConfig`): Configuration for speech-to-text recognition. (Default: `{ name: 'whisper-1' }`)
44
+ **speechModel.name** (`'tts-1' | 'tts-1-hd' | 'whisper-1'`): Model name. Use 'tts-1-hd' for higher quality audio.
45
45
 
46
- **speaker?:** (`OpenAIVoiceId`): Default voice ID for speech synthesis. (Default: `'alloy'`)
46
+ **speechModel.apiKey** (`string`): OpenAI API key. Falls back to OPENAI\_API\_KEY environment variable.
47
47
 
48
- ### OpenAIConfig
48
+ **listeningModel** (`OpenAIConfig`): Configuration for speech-to-text recognition. (Default: `{ name: 'whisper-1' }`)
49
49
 
50
- **name?:** (`'tts-1' | 'tts-1-hd' | 'whisper-1'`): Model name. Use 'tts-1-hd' for higher quality audio.
50
+ **listeningModel.name** (`'tts-1' | 'tts-1-hd' | 'whisper-1'`): Model name. For speech-to-text, use 'whisper-1' (the default); 'tts-1-hd' is a text-to-speech model for higher quality audio.
51
51
 
52
- **apiKey?:** (`string`): OpenAI API key. Falls back to OPENAI\_API\_KEY environment variable.
52
+ **listeningModel.apiKey** (`string`): OpenAI API key. Falls back to OPENAI\_API\_KEY environment variable.
53
+
54
+ **speaker** (`OpenAIVoiceId`): Default voice ID for speech synthesis. (Default: `'alloy'`)
53
55
 
54
56
  ## Methods
55
57
 
56
- ### speak()
58
+ ### `speak()`
57
59
 
58
60
  Converts text to speech using OpenAI's text-to-speech models.
59
61
 
60
- **input:** (`string | NodeJS.ReadableStream`): Text or text stream to convert to speech.
62
+ **input** (`string | NodeJS.ReadableStream`): Text or text stream to convert to speech.
63
+
64
+ **options** (`Options`): Configuration options.
61
65
 
62
- **options.speaker?:** (`OpenAIVoiceId`): Voice ID to use for speech synthesis. (Default: `Constructor's speaker value`)
66
+ **options.speaker** (`OpenAIVoiceId`): Voice ID to use for speech synthesis.
63
67
 
64
- **options.speed?:** (`number`): Speech speed multiplier. (Default: `1.0`)
68
+ **options.speed** (`number`): Speech speed multiplier.
65
69
 
66
70
  Returns: `Promise<NodeJS.ReadableStream>`
67
71
 
68
- ### listen()
72
+ ### `listen()`
69
73
 
70
74
  Transcribes audio using OpenAI's Whisper model.
71
75
 
72
- **audioStream:** (`NodeJS.ReadableStream`): Audio stream to transcribe.
76
+ **audioStream** (`NodeJS.ReadableStream`): Audio stream to transcribe.
77
+
78
+ **options** (`Options`): Configuration options.
73
79
 
74
- **options.filetype?:** (`string`): Audio format of the input stream. (Default: `'mp3'`)
80
+ **options.filetype** (`string`): Audio format of the input stream.
75
81
 
76
82
  Returns: `Promise<string>`
77
83
 
78
- ### getSpeakers()
84
+ ### `getSpeakers()`
79
85
 
80
86
  Returns an array of available voice options, where each node contains:
81
87
 
82
- **voiceId:** (`string`): Unique identifier for the voice
88
+ **voiceId** (`string`): Unique identifier for the voice
83
89
 
84
90
  ## Notes
85
91
 
@@ -2,123 +2,123 @@
2
2
 
3
3
  The `getSpeakers()` method retrieves a list of available voice options (speakers) from the voice provider. This allows applications to present users with voice choices or programmatically select the most appropriate voice for different contexts.
4
4
 
5
- ## Usage Example
5
+ ## Usage example
6
6
 
7
7
  ```typescript
8
- import { OpenAIVoice } from "@mastra/voice-openai";
9
- import { ElevenLabsVoice } from "@mastra/voice-elevenlabs";
8
+ import { OpenAIVoice } from '@mastra/voice-openai'
9
+ import { ElevenLabsVoice } from '@mastra/voice-elevenlabs'
10
10
 
11
11
  // Initialize voice providers
12
- const openaiVoice = new OpenAIVoice();
12
+ const openaiVoice = new OpenAIVoice()
13
13
  const elevenLabsVoice = new ElevenLabsVoice({
14
14
  apiKey: process.env.ELEVENLABS_API_KEY,
15
- });
15
+ })
16
16
 
17
17
  // Get available speakers from OpenAI
18
- const openaiSpeakers = await openaiVoice.getSpeakers();
19
- console.log("OpenAI voices:", openaiSpeakers);
18
+ const openaiSpeakers = await openaiVoice.getSpeakers()
19
+ console.log('OpenAI voices:', openaiSpeakers)
20
20
  // Example output: [{ voiceId: "alloy" }, { voiceId: "echo" }, { voiceId: "fable" }, ...]
21
21
 
22
22
  // Get available speakers from ElevenLabs
23
- const elevenLabsSpeakers = await elevenLabsVoice.getSpeakers();
24
- console.log("ElevenLabs voices:", elevenLabsSpeakers);
23
+ const elevenLabsSpeakers = await elevenLabsVoice.getSpeakers()
24
+ console.log('ElevenLabs voices:', elevenLabsSpeakers)
25
25
  // Example output: [{ voiceId: "21m00Tcm4TlvDq8ikWAM", name: "Rachel" }, ...]
26
26
 
27
27
  // Use a specific voice for speech
28
- const text = "Hello, this is a test of different voices.";
29
- await openaiVoice.speak(text, { speaker: openaiSpeakers[2].voiceId });
30
- await elevenLabsVoice.speak(text, { speaker: elevenLabsSpeakers[0].voiceId });
28
+ const text = 'Hello, this is a test of different voices.'
29
+ await openaiVoice.speak(text, { speaker: openaiSpeakers[2].voiceId })
30
+ await elevenLabsVoice.speak(text, { speaker: elevenLabsSpeakers[0].voiceId })
31
31
  ```
32
32
 
33
33
  ## Parameters
34
34
 
35
- This method does not accept any parameters.
35
+ This method doesn't accept any parameters.
36
36
 
37
- ## Return Value
37
+ ## Return value
38
38
 
39
- **Promise\<Array<{ voiceId: string } & TSpeakerMetadata>>:** (`Promise`): A promise that resolves to an array of voice options, where each option contains at least a voiceId property and may include additional provider-specific metadata.
39
+ **Promise\<Array\<{ voiceId: string } & TSpeakerMetadata>>** (`Promise`): A promise that resolves to an array of voice options, where each option contains at least a `voiceId` property and may include additional provider-specific metadata.
40
40
 
41
- ## Provider-Specific Metadata
41
+ ## Provider-specific metadata
42
42
 
43
43
  Different voice providers return different metadata for their voices:
44
44
 
45
45
  **OpenAI**:
46
46
 
47
- **voiceId:** (`string`): Unique identifier for the voice (e.g., 'alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer')
47
+ **voiceId** (`string`): Unique identifier for the voice (e.g., 'alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer')
48
48
 
49
49
  **OpenAI Realtime**:
50
50
 
51
- **voiceId:** (`string`): Unique identifier for the voice (e.g., 'alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer')
51
+ **voiceId** (`string`): Unique identifier for the voice (e.g., 'alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer')
52
52
 
53
53
  **Deepgram**:
54
54
 
55
- **voiceId:** (`string`): Unique identifier for the voice
55
+ **voiceId** (`string`): Unique identifier for the voice
56
56
 
57
- **language:** (`string`): Language code embedded in the voice ID (e.g., 'en')
57
+ **language** (`string`): Language code embedded in the voice ID (e.g., 'en')
58
58
 
59
59
  **ElevenLabs**:
60
60
 
61
- **voiceId:** (`string`): Unique identifier for the voice
61
+ **voiceId** (`string`): Unique identifier for the voice
62
62
 
63
- **name:** (`string`): Human-readable name of the voice
63
+ **name** (`string`): Human-readable name of the voice
64
64
 
65
- **category:** (`string`): Category of the voice (e.g., 'premade', 'cloned')
65
+ **category** (`string`): Category of the voice (e.g., 'premade', 'cloned')
66
66
 
67
67
  **Google**:
68
68
 
69
- **voiceId:** (`string`): Unique identifier for the voice
69
+ **voiceId** (`string`): Unique identifier for the voice
70
70
 
71
- **languageCodes:** (`string[]`): Array of language codes supported by the voice (e.g., \['en-US'])
71
+ **languageCodes** (`string[]`): Array of language codes supported by the voice (e.g., \['en-US'])
72
72
 
73
73
  **Azure**:
74
74
 
75
- **voiceId:** (`string`): Unique identifier for the voice
75
+ **voiceId** (`string`): Unique identifier for the voice
76
76
 
77
- **language:** (`string`): Language code extracted from the voice ID (e.g., 'en')
77
+ **language** (`string`): Language code extracted from the voice ID (e.g., 'en')
78
78
 
79
- **region:** (`string`): Region code extracted from the voice ID (e.g., 'US')
79
+ **region** (`string`): Region code extracted from the voice ID (e.g., 'US')
80
80
 
81
81
  **Murf**:
82
82
 
83
- **voiceId:** (`string`): Unique identifier for the voice
83
+ **voiceId** (`string`): Unique identifier for the voice
84
84
 
85
- **name:** (`string`): Name of the voice (same as voiceId)
85
+ **name** (`string`): Name of the voice (same as voiceId)
86
86
 
87
- **language:** (`string`): Language code extracted from the voice ID (e.g., 'en')
87
+ **language** (`string`): Language code extracted from the voice ID (e.g., 'en')
88
88
 
89
- **gender:** (`string`): Gender of the voice (always 'neutral' in current implementation)
89
+ **gender** (`string`): Gender of the voice (always 'neutral' in current implementation)
90
90
 
91
91
  **PlayAI**:
92
92
 
93
- **voiceId:** (`string`): Unique identifier for the voice (S3 URL to manifest.json)
93
+ **voiceId** (`string`): Unique identifier for the voice (S3 URL to manifest.json)
94
94
 
95
- **name:** (`string`): Human-readable name of the voice (e.g., 'Angelo', 'Arsenio')
95
+ **name** (`string`): Human-readable name of the voice (e.g., 'Angelo', 'Arsenio')
96
96
 
97
- **accent:** (`string`): Accent of the voice (e.g., 'US', 'Irish', 'US African American')
97
+ **accent** (`string`): Accent of the voice (e.g., 'US', 'Irish', 'US African American')
98
98
 
99
- **gender:** (`string`): Gender of the voice ('M' or 'F')
99
+ **gender** (`string`): Gender of the voice ('M' or 'F')
100
100
 
101
- **age:** (`string`): Age category of the voice (e.g., 'Young', 'Middle')
101
+ **age** (`string`): Age category of the voice (e.g., 'Young', 'Middle')
102
102
 
103
- **style:** (`string`): Speaking style of the voice (e.g., 'Conversational')
103
+ **style** (`string`): Speaking style of the voice (e.g., 'Conversational')
104
104
 
105
105
  **Speechify**:
106
106
 
107
- **voiceId:** (`string`): Unique identifier for the voice
107
+ **voiceId** (`string`): Unique identifier for the voice
108
108
 
109
- **name:** (`string`): Human-readable name of the voice
109
+ **name** (`string`): Human-readable name of the voice
110
110
 
111
- **language:** (`string`): Language code of the voice (e.g., 'en-US')
111
+ **language** (`string`): Language code of the voice (e.g., 'en-US')
112
112
 
113
113
  **Sarvam**:
114
114
 
115
- **voiceId:** (`string`): Unique identifier for the voice
115
+ **voiceId** (`string`): Unique identifier for the voice
116
116
 
117
- **name:** (`string`): Human-readable name of the voice
117
+ **name** (`string`): Human-readable name of the voice
118
118
 
119
- **language:** (`string`): Language of the voice (e.g., 'english', 'hindi')
119
+ **language** (`string`): Language of the voice (e.g., 'english', 'hindi')
120
120
 
121
- **gender:** (`string`): Gender of the voice ('male' or 'female')
121
+ **gender** (`string`): Gender of the voice ('male' or 'female')
122
122
 
123
123
  ## Notes
124
124