@mastra/voice-google-gemini-live 0.0.0-feat-add-query-option-to-playground-20251209160219 → 0.0.0-feat-mcp-embedded-docs-tools-clean-20260102135536

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,106 @@
1
+ > Overview of speech-to-speech capabilities in Mastra, including real-time interactions and event-driven architecture.
2
+
3
+ # Speech-to-Speech Capabilities in Mastra
4
+
5
+ ## Introduction
6
+
7
+ Speech-to-Speech (STS) in Mastra provides a standardized interface for real-time interactions across multiple providers.
8
+ STS enables continuous bidirectional audio communication through listening to events from Realtime models. Unlike separate TTS and STT operations, STS maintains an open connection that processes speech continuously in both directions.
9
+
10
+ ## Configuration
11
+
12
+ - **`apiKey`**: Your OpenAI API key. Falls back to the `OPENAI_API_KEY` environment variable.
13
+ - **`model`**: The model ID to use for real-time voice interactions (e.g., `gpt-5.1-realtime`).
14
+ - **`speaker`**: The default voice ID for speech synthesis. This allows you to specify which voice to use for the speech output.
15
+
16
+ ```typescript
17
+ const voice = new OpenAIRealtimeVoice({
18
+ apiKey: "your-openai-api-key",
19
+ model: "gpt-5.1-realtime",
20
+ speaker: "alloy", // Default voice
21
+ });
22
+
23
+ // If using default settings the configuration can be simplified to:
24
+ const voice = new OpenAIRealtimeVoice();
25
+ ```
26
+
27
+ ## Using STS
28
+
29
+ ```typescript
30
+ import { Agent } from "@mastra/core/agent";
31
+ import { OpenAIRealtimeVoice } from "@mastra/voice-openai-realtime";
32
+ import { playAudio, getMicrophoneStream } from "@mastra/node-audio";
33
+
34
+ const agent = new Agent({
35
+ id: "agent",
36
+ name: "OpenAI Realtime Agent",
37
+ instructions: `You are a helpful assistant with real-time voice capabilities.`,
38
+ model: "openai/gpt-5.1",
39
+ voice: new OpenAIRealtimeVoice(),
40
+ });
41
+
42
+ // Connect to the voice service
43
+ await agent.voice.connect();
44
+
45
+ // Listen for agent audio responses
46
+ agent.voice.on("speaker", ({ audio }) => {
47
+ playAudio(audio);
48
+ });
49
+
50
+ // Initiate the conversation
51
+ await agent.voice.speak("How can I help you today?");
52
+
53
+ // Send continuous audio from the microphone
54
+ const micStream = getMicrophoneStream();
55
+ await agent.voice.send(micStream);
56
+ ```
57
+
58
+ For integrating Speech-to-Speech capabilities with agents, refer to the [Adding Voice to Agents](https://mastra.ai/docs/v1/agents/adding-voice) documentation.
59
+
60
+ ## Google Gemini Live (Realtime)
61
+
62
+ ```typescript
63
+ import { Agent } from "@mastra/core/agent";
64
+ import { GeminiLiveVoice } from "@mastra/voice-google-gemini-live";
65
+ import { playAudio, getMicrophoneStream } from "@mastra/node-audio";
66
+
67
+ const agent = new Agent({
68
+ id: "agent",
69
+ name: "Gemini Live Agent",
70
+ instructions:
71
+ "You are a helpful assistant with real-time voice capabilities.",
72
+ // Model used for text generation; voice provider handles realtime audio
73
+ model: "openai/gpt-5.1",
74
+ voice: new GeminiLiveVoice({
75
+ apiKey: process.env.GOOGLE_API_KEY,
76
+ model: "gemini-2.0-flash-exp",
77
+ speaker: "Puck",
78
+ debug: true,
79
+ // Vertex AI option:
80
+ // vertexAI: true,
81
+ // project: 'your-gcp-project',
82
+ // location: 'us-central1',
83
+ // serviceAccountKeyFile: '/path/to/service-account.json',
84
+ }),
85
+ });
86
+
87
+ await agent.voice.connect();
88
+
89
+ agent.voice.on("speaker", ({ audio }) => {
90
+ playAudio(audio);
91
+ });
92
+
93
+ agent.voice.on("writing", ({ role, text }) => {
94
+ console.log(`${role}: ${text}`);
95
+ });
96
+
97
+ await agent.voice.speak("How can I help you today?");
98
+
99
+ const micStream = getMicrophoneStream();
100
+ await agent.voice.send(micStream);
101
+ ```
102
+
103
+ Note:
104
+
105
+ - Live API requires `GOOGLE_API_KEY`. Vertex AI requires project/location and service account credentials.
106
+ - Events: `speaker` (audio stream), `writing` (text), `turnComplete`, `usage`, and `error`.
@@ -0,0 +1,303 @@
1
+ # Voice API Reference
2
+
3
+ > API reference for voice - 1 entry
4
+
5
+
6
+ ---
7
+
8
+ ## Reference: Google Gemini Live Voice
9
+
10
+ > Documentation for the GeminiLiveVoice class, providing real-time multimodal voice interactions using Google's Gemini Live API.
11
+
12
+ The GeminiLiveVoice class provides real-time voice interaction capabilities using Google's Gemini Live API. It supports bidirectional audio streaming, tool calling, session management, and both standard Google API and Vertex AI authentication methods.
13
+
14
+ ## Usage Example
15
+
16
+ ```typescript
17
+ import { GeminiLiveVoice } from "@mastra/voice-google-gemini-live";
18
+ import { playAudio, getMicrophoneStream } from "@mastra/node-audio";
19
+
20
+ // Initialize with Gemini API (using API key)
21
+ const voice = new GeminiLiveVoice({
22
+ apiKey: process.env.GOOGLE_API_KEY, // Required for Gemini API
23
+ model: "gemini-2.0-flash-exp",
24
+ speaker: "Puck", // Default voice
25
+ debug: true,
26
+ });
27
+
28
+ // Or initialize with Vertex AI (using OAuth)
29
+ const voiceWithVertexAI = new GeminiLiveVoice({
30
+ vertexAI: true,
31
+ project: "your-gcp-project",
32
+ location: "us-central1",
33
+ serviceAccountKeyFile: "/path/to/service-account.json",
34
+ model: "gemini-2.0-flash-exp",
35
+ speaker: "Puck",
36
+ });
37
+
38
+ // Or use the VoiceConfig pattern (recommended for consistency with other providers)
39
+ const voiceWithConfig = new GeminiLiveVoice({
40
+ speechModel: {
41
+ name: "gemini-2.0-flash-exp",
42
+ apiKey: process.env.GOOGLE_API_KEY,
43
+ },
44
+ speaker: "Puck",
45
+ realtimeConfig: {
46
+ model: "gemini-2.0-flash-exp",
47
+ apiKey: process.env.GOOGLE_API_KEY,
48
+ options: {
49
+ debug: true,
50
+ sessionConfig: {
51
+ interrupts: { enabled: true },
52
+ },
53
+ },
54
+ },
55
+ });
56
+
57
+ // Establish connection (required before using other methods)
58
+ await voice.connect();
59
+
60
+ // Set up event listeners
61
+ voice.on("speaker", (audioStream) => {
62
+ // Handle audio stream (NodeJS.ReadableStream)
63
+ playAudio(audioStream);
64
+ });
65
+
66
+ voice.on("writing", ({ text, role }) => {
67
+ // Handle transcribed text
68
+ console.log(`${role}: ${text}`);
69
+ });
70
+
71
+ voice.on("turnComplete", ({ timestamp }) => {
72
+ // Handle turn completion
73
+ console.log("Turn completed at:", timestamp);
74
+ });
75
+
76
+ // Convert text to speech
77
+ await voice.speak("Hello, how can I help you today?", {
78
+ speaker: "Charon", // Override default voice
79
+ responseModalities: ["AUDIO", "TEXT"],
80
+ });
81
+
82
+ // Process audio input
83
+ const microphoneStream = getMicrophoneStream();
84
+ await voice.send(microphoneStream);
85
+
86
+ // Update session configuration
87
+ await voice.updateSessionConfig({
88
+ speaker: "Kore",
89
+ instructions: "Be more concise in your responses",
90
+ });
91
+
92
+ // When done, disconnect
93
+ await voice.disconnect();
94
+ // Or use the synchronous wrapper
95
+ voice.close();
96
+ ```
97
+
98
+ ## Configuration
99
+
100
+ ### Constructor Options
101
+
102
+ ### Session Configuration
103
+
104
+ ## Methods
105
+
106
+ ### connect()
107
+
108
+ Establishes a connection to the Gemini Live API. Must be called before using speak, listen, or send methods.
109
+
110
+ ### speak()
111
+
112
+ Converts text to speech and sends it to the model. Can accept either a string or a readable stream as input.
113
+
114
+ Returns: `Promise<void>` (responses are emitted via `speaker` and `writing` events)
115
+
116
+ ### listen()
117
+
118
+ Processes audio input for speech recognition. Takes a readable stream of audio data and returns the transcribed text.
119
+
120
+ Returns: `Promise<string>` - The transcribed text
121
+
122
+ ### send()
123
+
124
+ Streams audio data in real-time to the Gemini service for continuous audio streaming scenarios like live microphone input.
125
+
126
+ Returns: `Promise<void>`
127
+
128
+ ### updateSessionConfig()
129
+
130
+ Updates the session configuration dynamically. This can be used to modify voice settings, speaker selection, and other runtime configurations.
131
+
132
+ Returns: `Promise<void>`
133
+
134
+ ### addTools()
135
+
136
+ Adds a set of tools to the voice instance. Tools allow the model to perform additional actions during conversations. When GeminiLiveVoice is added to an Agent, any tools configured for the Agent will automatically be available to the voice interface.
137
+
138
+ Returns: `void`
139
+
140
+ ### addInstructions()
141
+
142
+ Adds or updates system instructions for the model.
143
+
144
+ Returns: `void`
145
+
146
+ ### answer()
147
+
148
+ Triggers a response from the model. This method is primarily used internally when integrated with an Agent.
149
+
150
+ Returns: `Promise<void>`
151
+
152
+ ### getSpeakers()
153
+
154
+ Returns a list of available voice speakers for the Gemini Live API.
155
+
156
+ Returns: `Promise<Array<{ voiceId: string; description?: string }>>`
157
+
158
+ ### disconnect()
159
+
160
+ Disconnects from the Gemini Live session and cleans up resources. This is the async method that properly handles cleanup.
161
+
162
+ Returns: `Promise<void>`
163
+
164
+ ### close()
165
+
166
+ Synchronous wrapper for disconnect(). Calls disconnect() internally without awaiting.
167
+
168
+ Returns: `void`
169
+
170
+ ### on()
171
+
172
+ Registers an event listener for voice events.
173
+
174
+ Returns: `void`
175
+
176
+ ### off()
177
+
178
+ Removes a previously registered event listener.
179
+
180
+ Returns: `void`
181
+
182
+ ## Events
183
+
184
+ The GeminiLiveVoice class emits the following events:
185
+
186
+ ## Available Models
187
+
188
+ The following Gemini Live models are available:
189
+
190
+ - `gemini-2.0-flash-exp` (default)
191
+ - `gemini-2.0-flash-exp-image-generation`
192
+ - `gemini-2.0-flash-live-001`
193
+ - `gemini-live-2.5-flash-preview-native-audio`
194
+ - `gemini-2.5-flash-exp-native-audio-thinking-dialog`
195
+ - `gemini-live-2.5-flash-preview`
196
+ - `gemini-2.5-flash-preview-tts`
197
+
198
+ ## Available Voices
199
+
200
+ The following voice options are available:
201
+
202
+ - `Puck` (default): Conversational, friendly
203
+ - `Charon`: Deep, authoritative
204
+ - `Kore`: Neutral, professional
205
+ - `Fenrir`: Warm, approachable
206
+
207
+ ## Authentication Methods
208
+
209
+ ### Gemini API (Development)
210
+
211
+ The simplest method using an API key from [Google AI Studio](https://makersuite.google.com/app/apikey):
212
+
213
+ ```typescript
214
+ const voice = new GeminiLiveVoice({
215
+ apiKey: "your-api-key", // Required for Gemini API
216
+ model: "gemini-2.0-flash-exp",
217
+ });
218
+ ```
219
+
220
+ ### Vertex AI (Production)
221
+
222
+ For production use with OAuth authentication and Google Cloud Platform:
223
+
224
+ ```typescript
225
+ // Using service account key file
226
+ const voice = new GeminiLiveVoice({
227
+ vertexAI: true,
228
+ project: "your-gcp-project",
229
+ location: "us-central1",
230
+ serviceAccountKeyFile: "/path/to/service-account.json",
231
+ });
232
+
233
+ // Using Application Default Credentials
234
+ const voice = new GeminiLiveVoice({
235
+ vertexAI: true,
236
+ project: "your-gcp-project",
237
+ location: "us-central1",
238
+ });
239
+
240
+ // Using service account impersonation
241
+ const voice = new GeminiLiveVoice({
242
+ vertexAI: true,
243
+ project: "your-gcp-project",
244
+ location: "us-central1",
245
+ serviceAccountEmail: "service-account@project.iam.gserviceaccount.com",
246
+ });
247
+ ```
248
+
249
+ ## Advanced Features
250
+
251
+ ### Session Management
252
+
253
+ The Gemini Live API supports session resumption for handling network interruptions:
254
+
255
+ ```typescript
256
+ voice.on("sessionHandle", ({ handle, expiresAt }) => {
257
+ // Store session handle for resumption
258
+ saveSessionHandle(handle, expiresAt);
259
+ });
260
+
261
+ // Resume a previous session
262
+ const voice = new GeminiLiveVoice({
263
+ sessionConfig: {
264
+ enableResumption: true,
265
+ maxDuration: "2h",
266
+ },
267
+ });
268
+ ```
269
+
270
+ ### Tool Calling
271
+
272
+ Enable the model to call functions during conversations:
273
+
274
+ ```typescript
275
+ import { z } from "zod";
276
+
277
+ voice.addTools({
278
+ weather: {
279
+ description: "Get weather information",
280
+ parameters: z.object({
281
+ location: z.string(),
282
+ }),
283
+ execute: async ({ location }) => {
284
+ const weather = await getWeather(location);
285
+ return weather;
286
+ },
287
+ },
288
+ });
289
+
290
+ voice.on("toolCall", ({ name, args, id }) => {
291
+ console.log(`Tool called: ${name} with args:`, args);
292
+ });
293
+ ```
294
+
295
+ ## Notes
296
+
297
+ - The Gemini Live API uses WebSockets for real-time communication
298
+ - Audio is processed as 16kHz PCM16 for input and 24kHz PCM16 for output
299
+ - The voice instance must be connected with `connect()` before using other methods
300
+ - Always call `close()` when done to properly clean up resources
301
+ - Vertex AI authentication requires appropriate IAM permissions (`aiplatform.user` role)
302
+ - Session resumption allows recovery from network interruptions
303
+ - The API supports real-time interactions with text and audio
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@mastra/voice-google-gemini-live",
3
- "version": "0.0.0-feat-add-query-option-to-playground-20251209160219",
3
+ "version": "0.0.0-feat-mcp-embedded-docs-tools-clean-20260102135536",
4
4
  "description": "Mastra Google Gemini Live API integration",
5
5
  "type": "module",
6
6
  "files": [
@@ -40,13 +40,13 @@
40
40
  "tsx": "latest",
41
41
  "typescript": "^5.8.3",
42
42
  "vitest": "4.0.12",
43
- "@internal/types-builder": "0.0.0-feat-add-query-option-to-playground-20251209160219",
44
- "@mastra/core": "0.0.0-feat-add-query-option-to-playground-20251209160219",
45
- "@internal/lint": "0.0.0-feat-add-query-option-to-playground-20251209160219"
43
+ "@internal/types-builder": "0.0.0-feat-mcp-embedded-docs-tools-clean-20260102135536",
44
+ "@internal/lint": "0.0.0-feat-mcp-embedded-docs-tools-clean-20260102135536",
45
+ "@mastra/core": "0.0.0-feat-mcp-embedded-docs-tools-clean-20260102135536"
46
46
  },
47
47
  "peerDependencies": {
48
48
  "zod": "^3.0.0",
49
- "@mastra/core": "0.0.0-feat-add-query-option-to-playground-20251209160219"
49
+ "@mastra/core": "0.0.0-feat-mcp-embedded-docs-tools-clean-20260102135536"
50
50
  },
51
51
  "homepage": "https://mastra.ai",
52
52
  "repository": {
@@ -62,6 +62,7 @@
62
62
  },
63
63
  "scripts": {
64
64
  "build": "tsup --silent --config tsup.config.ts",
65
+ "postbuild": "pnpx tsx ../../scripts/generate-package-docs.ts voice/google-gemini-live-api",
65
66
  "build:watch": "tsup --watch --silent --config tsup.config.ts",
66
67
  "test": "vitest run",
67
68
  "test:integration": "vitest run tool-args-integration.test.ts",