@mastra/voice-openai-realtime 0.11.12 → 0.12.0-beta.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +67 -13
- package/README.md +1 -1
- package/dist/docs/README.md +32 -0
- package/dist/docs/SKILL.md +33 -0
- package/dist/docs/SOURCE_MAP.json +6 -0
- package/dist/docs/agents/01-adding-voice.md +352 -0
- package/dist/docs/voice/01-overview.md +1019 -0
- package/dist/docs/voice/02-speech-to-speech.md +106 -0
- package/dist/docs/voice/03-reference.md +1096 -0
- package/dist/index.cjs +4 -4
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +4 -4
- package/dist/index.js +4 -4
- package/dist/index.js.map +1 -1
- package/package.json +13 -15
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
> Overview of speech-to-speech capabilities in Mastra, including real-time interactions and event-driven architecture.
|
|
2
|
+
|
|
3
|
+
# Speech-to-Speech Capabilities in Mastra
|
|
4
|
+
|
|
5
|
+
## Introduction
|
|
6
|
+
|
|
7
|
+
Speech-to-Speech (STS) in Mastra provides a standardized interface for real-time interactions across multiple providers.
|
|
8
|
+
STS enables continuous bidirectional audio communication by listening to events from Realtime models. Unlike separate TTS and STT operations, STS maintains an open connection that processes speech continuously in both directions.
|
|
9
|
+
|
|
10
|
+
## Configuration
|
|
11
|
+
|
|
12
|
+
- **`apiKey`**: Your OpenAI API key. Falls back to the `OPENAI_API_KEY` environment variable.
|
|
13
|
+
- **`model`**: The model ID to use for real-time voice interactions (e.g., `gpt-5.1-realtime`).
|
|
14
|
+
- **`speaker`**: The default voice ID for speech synthesis. This allows you to specify which voice to use for the speech output.
|
|
15
|
+
|
|
16
|
+
```typescript
|
|
17
|
+
const voice = new OpenAIRealtimeVoice({
|
|
18
|
+
apiKey: "your-openai-api-key",
|
|
19
|
+
model: "gpt-5.1-realtime",
|
|
20
|
+
speaker: "alloy", // Default voice
|
|
21
|
+
});
|
|
22
|
+
|
|
23
|
+
// If using default settings, the configuration can be simplified to:
|
|
24
|
+
const voice = new OpenAIRealtimeVoice();
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
## Using STS
|
|
28
|
+
|
|
29
|
+
```typescript
|
|
30
|
+
import { Agent } from "@mastra/core/agent";
|
|
31
|
+
import { OpenAIRealtimeVoice } from "@mastra/voice-openai-realtime";
|
|
32
|
+
import { playAudio, getMicrophoneStream } from "@mastra/node-audio";
|
|
33
|
+
|
|
34
|
+
const agent = new Agent({
|
|
35
|
+
id: "agent",
|
|
36
|
+
name: "OpenAI Realtime Agent",
|
|
37
|
+
instructions: `You are a helpful assistant with real-time voice capabilities.`,
|
|
38
|
+
model: "openai/gpt-5.1",
|
|
39
|
+
voice: new OpenAIRealtimeVoice(),
|
|
40
|
+
});
|
|
41
|
+
|
|
42
|
+
// Connect to the voice service
|
|
43
|
+
await agent.voice.connect();
|
|
44
|
+
|
|
45
|
+
// Listen for agent audio responses
|
|
46
|
+
agent.voice.on("speaker", ({ audio }) => {
|
|
47
|
+
playAudio(audio);
|
|
48
|
+
});
|
|
49
|
+
|
|
50
|
+
// Initiate the conversation
|
|
51
|
+
await agent.voice.speak("How can I help you today?");
|
|
52
|
+
|
|
53
|
+
// Send continuous audio from the microphone
|
|
54
|
+
const micStream = getMicrophoneStream();
|
|
55
|
+
await agent.voice.send(micStream);
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
For integrating Speech-to-Speech capabilities with agents, refer to the [Adding Voice to Agents](https://mastra.ai/docs/v1/agents/adding-voice) documentation.
|
|
59
|
+
|
|
60
|
+
## Google Gemini Live (Realtime)
|
|
61
|
+
|
|
62
|
+
```typescript
|
|
63
|
+
import { Agent } from "@mastra/core/agent";
|
|
64
|
+
import { GeminiLiveVoice } from "@mastra/voice-google-gemini-live";
|
|
65
|
+
import { playAudio, getMicrophoneStream } from "@mastra/node-audio";
|
|
66
|
+
|
|
67
|
+
const agent = new Agent({
|
|
68
|
+
id: "agent",
|
|
69
|
+
name: "Gemini Live Agent",
|
|
70
|
+
instructions:
|
|
71
|
+
"You are a helpful assistant with real-time voice capabilities.",
|
|
72
|
+
// Model used for text generation; voice provider handles realtime audio
|
|
73
|
+
model: "openai/gpt-5.1",
|
|
74
|
+
voice: new GeminiLiveVoice({
|
|
75
|
+
apiKey: process.env.GOOGLE_API_KEY,
|
|
76
|
+
model: "gemini-2.0-flash-exp",
|
|
77
|
+
speaker: "Puck",
|
|
78
|
+
debug: true,
|
|
79
|
+
// Vertex AI option:
|
|
80
|
+
// vertexAI: true,
|
|
81
|
+
// project: 'your-gcp-project',
|
|
82
|
+
// location: 'us-central1',
|
|
83
|
+
// serviceAccountKeyFile: '/path/to/service-account.json',
|
|
84
|
+
}),
|
|
85
|
+
});
|
|
86
|
+
|
|
87
|
+
await agent.voice.connect();
|
|
88
|
+
|
|
89
|
+
agent.voice.on("speaker", ({ audio }) => {
|
|
90
|
+
playAudio(audio);
|
|
91
|
+
});
|
|
92
|
+
|
|
93
|
+
agent.voice.on("writing", ({ role, text }) => {
|
|
94
|
+
console.log(`${role}: ${text}`);
|
|
95
|
+
});
|
|
96
|
+
|
|
97
|
+
await agent.voice.speak("How can I help you today?");
|
|
98
|
+
|
|
99
|
+
const micStream = getMicrophoneStream();
|
|
100
|
+
await agent.voice.send(micStream);
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
Note:
|
|
104
|
+
|
|
105
|
+
- Live API requires `GOOGLE_API_KEY`. Vertex AI requires project/location and service account credentials.
|
|
106
|
+
- Events: `speaker` (audio stream), `writing` (text), `turnComplete`, `usage`, and `error`.
|