@mastra/voice-google-gemini-live 0.0.0-remove-unused-model-providers-api-20251030210744 → 0.0.0-remove-ai-peer-dep-from-evals-20260105220639
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +150 -4
- package/README.md +3 -3
- package/dist/docs/README.md +31 -0
- package/dist/docs/SKILL.md +32 -0
- package/dist/docs/SOURCE_MAP.json +6 -0
- package/dist/docs/voice/01-overview.md +1019 -0
- package/dist/docs/voice/02-speech-to-speech.md +106 -0
- package/dist/docs/voice/03-reference.md +303 -0
- package/dist/index.cjs +80 -23
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +18 -3
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +80 -23
- package/dist/index.js.map +1 -1
- package/dist/managers/AudioStreamManager.d.ts +1 -1
- package/dist/managers/AudioStreamManager.d.ts.map +1 -1
- package/dist/managers/EventManager.d.ts +1 -1
- package/dist/managers/EventManager.d.ts.map +1 -1
- package/dist/managers/SessionManager.d.ts +1 -1
- package/dist/managers/SessionManager.d.ts.map +1 -1
- package/dist/types.d.ts +10 -0
- package/dist/types.d.ts.map +1 -1
- package/package.json +15 -11
package/dist/docs/voice/02-speech-to-speech.md
ADDED
@@ -0,0 +1,106 @@

> Overview of speech-to-speech capabilities in Mastra, including real-time interactions and event-driven architecture.

# Speech-to-Speech Capabilities in Mastra

## Introduction

Speech-to-Speech (STS) in Mastra provides a standardized interface for real-time interactions across multiple providers.
STS enables continuous bidirectional audio communication by listening to events from realtime models. Unlike separate TTS and STT operations, STS maintains an open connection that processes speech continuously in both directions.

## Configuration

- **`apiKey`**: Your OpenAI API key. Falls back to the `OPENAI_API_KEY` environment variable.
- **`model`**: The model ID to use for real-time voice interactions (e.g., `gpt-5.1-realtime`).
- **`speaker`**: The default voice ID for speech synthesis. This allows you to specify which voice to use for the speech output.

```typescript
const voice = new OpenAIRealtimeVoice({
  apiKey: "your-openai-api-key",
  model: "gpt-5.1-realtime",
  speaker: "alloy", // Default voice
});

// If using default settings the configuration can be simplified to:
const voice = new OpenAIRealtimeVoice();
```

## Using STS

```typescript
import { Agent } from "@mastra/core/agent";
import { OpenAIRealtimeVoice } from "@mastra/voice-openai-realtime";
import { playAudio, getMicrophoneStream } from "@mastra/node-audio";

const agent = new Agent({
  id: "agent",
  name: "OpenAI Realtime Agent",
  instructions: `You are a helpful assistant with real-time voice capabilities.`,
  model: "openai/gpt-5.1",
  voice: new OpenAIRealtimeVoice(),
});

// Connect to the voice service
await agent.voice.connect();

// Listen for agent audio responses
agent.voice.on("speaker", ({ audio }) => {
  playAudio(audio);
});

// Initiate the conversation
await agent.voice.speak("How can I help you today?");

// Send continuous audio from the microphone
const micStream = getMicrophoneStream();
await agent.voice.send(micStream);
```

For integrating Speech-to-Speech capabilities with agents, refer to the [Adding Voice to Agents](https://mastra.ai/docs/v1/agents/adding-voice) documentation.

## Google Gemini Live (Realtime)

```typescript
import { Agent } from "@mastra/core/agent";
import { GeminiLiveVoice } from "@mastra/voice-google-gemini-live";
import { playAudio, getMicrophoneStream } from "@mastra/node-audio";

const agent = new Agent({
  id: "agent",
  name: "Gemini Live Agent",
  instructions:
    "You are a helpful assistant with real-time voice capabilities.",
  // Model used for text generation; voice provider handles realtime audio
  model: "openai/gpt-5.1",
  voice: new GeminiLiveVoice({
    apiKey: process.env.GOOGLE_API_KEY,
    model: "gemini-2.0-flash-exp",
    speaker: "Puck",
    debug: true,
    // Vertex AI option:
    // vertexAI: true,
    // project: 'your-gcp-project',
    // location: 'us-central1',
    // serviceAccountKeyFile: '/path/to/service-account.json',
  }),
});

await agent.voice.connect();

agent.voice.on("speaker", ({ audio }) => {
  playAudio(audio);
});

agent.voice.on("writing", ({ role, text }) => {
  console.log(`${role}: ${text}`);
});

await agent.voice.speak("How can I help you today?");

const micStream = getMicrophoneStream();
await agent.voice.send(micStream);
```

Note:

- Live API requires `GOOGLE_API_KEY`. Vertex AI requires project/location and service account credentials.
- Events: `speaker` (audio stream), `writing` (text), `turnComplete`, `usage`, and `error`.
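
To make the event list concrete, here is a minimal sketch of wiring up the remaining events. The `turnComplete` payload shape (`{ timestamp }`) is documented in this package's reference; the `usage` and `error` payload shapes are not shown above and are treated as opaque values here.

```typescript
// Sketch: handling the remaining Gemini Live events.
agent.voice.on("turnComplete", ({ timestamp }) => {
  console.log("Turn completed at:", timestamp);
});

// Payload shapes below are assumptions; they are simply logged.
agent.voice.on("usage", (usage) => {
  console.log("Usage update:", usage);
});

agent.voice.on("error", (error) => {
  console.error("Voice session error:", error);
});
```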

package/dist/docs/voice/03-reference.md
ADDED
@@ -0,0 +1,303 @@

# Voice API Reference

> API reference for voice - 1 entry

---

## Reference: Google Gemini Live Voice

> Documentation for the GeminiLiveVoice class, providing real-time multimodal voice interactions using Google's Gemini Live API.

The GeminiLiveVoice class provides real-time voice interaction capabilities using Google's Gemini Live API. It supports bidirectional audio streaming, tool calling, session management, and both standard Google API and Vertex AI authentication methods.

## Usage Example

```typescript
import { GeminiLiveVoice } from "@mastra/voice-google-gemini-live";
import { playAudio, getMicrophoneStream } from "@mastra/node-audio";

// Initialize with Gemini API (using API key)
const voice = new GeminiLiveVoice({
  apiKey: process.env.GOOGLE_API_KEY, // Required for Gemini API
  model: "gemini-2.0-flash-exp",
  speaker: "Puck", // Default voice
  debug: true,
});

// Or initialize with Vertex AI (using OAuth)
const voiceWithVertexAI = new GeminiLiveVoice({
  vertexAI: true,
  project: "your-gcp-project",
  location: "us-central1",
  serviceAccountKeyFile: "/path/to/service-account.json",
  model: "gemini-2.0-flash-exp",
  speaker: "Puck",
});

// Or use the VoiceConfig pattern (recommended for consistency with other providers)
const voiceWithConfig = new GeminiLiveVoice({
  speechModel: {
    name: "gemini-2.0-flash-exp",
    apiKey: process.env.GOOGLE_API_KEY,
  },
  speaker: "Puck",
  realtimeConfig: {
    model: "gemini-2.0-flash-exp",
    apiKey: process.env.GOOGLE_API_KEY,
    options: {
      debug: true,
      sessionConfig: {
        interrupts: { enabled: true },
      },
    },
  },
});

// Establish connection (required before using other methods)
await voice.connect();

// Set up event listeners
voice.on("speaker", (audioStream) => {
  // Handle audio stream (NodeJS.ReadableStream)
  playAudio(audioStream);
});

voice.on("writing", ({ text, role }) => {
  // Handle transcribed text
  console.log(`${role}: ${text}`);
});

voice.on("turnComplete", ({ timestamp }) => {
  // Handle turn completion
  console.log("Turn completed at:", timestamp);
});

// Convert text to speech
await voice.speak("Hello, how can I help you today?", {
  speaker: "Charon", // Override default voice
  responseModalities: ["AUDIO", "TEXT"],
});

// Process audio input
const microphoneStream = getMicrophoneStream();
await voice.send(microphoneStream);

// Update session configuration
await voice.updateSessionConfig({
  speaker: "Kore",
  instructions: "Be more concise in your responses",
});

// When done, disconnect
await voice.disconnect();
// Or use the synchronous wrapper
voice.close();
```

## Configuration

### Constructor Options

Options shown throughout this reference include `apiKey`, `model`, `speaker`, `instructions`, `debug`, `sessionConfig`, and the Vertex AI options `vertexAI`, `project`, `location`, `serviceAccountKeyFile`, and `serviceAccountEmail`. The nested `speechModel` and `realtimeConfig` options follow the VoiceConfig pattern shown in the usage example above.

### Session Configuration

Session options shown in this reference include `interrupts` (see the usage example) and `enableResumption` / `maxDuration` (see Session Management below).

## Methods

### connect()

Establishes a connection to the Gemini Live API. Must be called before using speak, listen, or send methods.

### speak()

Converts text to speech and sends it to the model. Can accept either a string or a readable stream as input.

Returns: `Promise<void>` (responses are emitted via `speaker` and `writing` events)
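
As a short sketch of both input forms (the per-call options match the usage example above; the stream contents are illustrative):

```typescript
import { Readable } from "node:stream";

// String input with per-call overrides
await voice.speak("Welcome to Mastra.", {
  speaker: "Charon",
  responseModalities: ["AUDIO"],
});

// speak() also accepts a readable stream of text
await voice.speak(Readable.from(["Streaming ", "text ", "input."]));
```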

### listen()

Processes audio input for speech recognition. Takes a readable stream of audio data and returns the transcribed text.

Returns: `Promise<string>` - The transcribed text
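
For example, with a connected instance and a microphone stream as in the usage example:

```typescript
// Transcribe a stream of audio and log the result
const transcript = await voice.listen(getMicrophoneStream());
console.log("You said:", transcript);
```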

### send()

Streams audio data in real-time to the Gemini service for continuous audio streaming scenarios like live microphone input.

Returns: `Promise<void>`

### updateSessionConfig()

Updates the session configuration dynamically. This can be used to modify voice settings, speaker selection, and other runtime configurations.

Returns: `Promise<void>`

### addTools()

Adds a set of tools to the voice instance. Tools allow the model to perform additional actions during conversations. When GeminiLiveVoice is added to an Agent, any tools configured for the Agent will automatically be available to the voice interface.

Returns: `void`

### addInstructions()

Adds or updates system instructions for the model.

Returns: `void`
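
For instance (the instruction text is illustrative):

```typescript
// Add or update the system instructions for the session
voice.addInstructions("You are a support agent. Keep answers under two sentences.");
```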

### answer()

Triggers a response from the model. This method is primarily used internally when integrated with an Agent.

Returns: `Promise<void>`

### getSpeakers()

Returns a list of available voice speakers for the Gemini Live API.

Returns: `Promise<Array<{ voiceId: string; description?: string }>>`
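
For example:

```typescript
// List the available voices; description is optional per the return type above
const speakers = await voice.getSpeakers();
for (const { voiceId, description } of speakers) {
  console.log(voiceId, description ?? "");
}
```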

### disconnect()

Disconnects from the Gemini Live session and cleans up resources. This is the async method that properly handles cleanup.

Returns: `Promise<void>`

### close()

Synchronous wrapper for disconnect(). Calls disconnect() internally without awaiting.

Returns: `void`

### on()

Registers an event listener for voice events.

Returns: `void`

### off()

Removes a previously registered event listener.

Returns: `void`
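
Because `off()` removes a specific listener, keep a reference to the handler you registered; a minimal sketch:

```typescript
// Named handler so it can be removed later
const onWriting = ({ text, role }: { text: string; role: string }) => {
  console.log(`${role}: ${text}`);
};

voice.on("writing", onWriting);
// ...later, remove exactly that listener
voice.off("writing", onWriting);
```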

## Events

The GeminiLiveVoice class emits the following events (all of which appear in the examples in this reference):

- `speaker`: emitted with a readable audio stream of the model's speech
- `writing`: emitted with transcribed `text` and the speaking `role`
- `turnComplete`: emitted with a `timestamp` when a conversation turn finishes
- `usage`: emitted with usage information for the session
- `error`: emitted when an error occurs
- `sessionHandle`: emitted with `handle` and `expiresAt` for session resumption
- `toolCall`: emitted with `name`, `args`, and `id` when the model calls a tool

## Available Models

The following Gemini Live models are available:

- `gemini-2.0-flash-exp` (default)
- `gemini-2.0-flash-exp-image-generation`
- `gemini-2.0-flash-live-001`
- `gemini-live-2.5-flash-preview-native-audio`
- `gemini-2.5-flash-exp-native-audio-thinking-dialog`
- `gemini-live-2.5-flash-preview`
- `gemini-2.5-flash-preview-tts`

## Available Voices

The following voice options are available:

- `Puck` (default): Conversational, friendly
- `Charon`: Deep, authoritative
- `Kore`: Neutral, professional
- `Fenrir`: Warm, approachable

## Authentication Methods

### Gemini API (Development)

The simplest method using an API key from [Google AI Studio](https://makersuite.google.com/app/apikey):

```typescript
const voice = new GeminiLiveVoice({
  apiKey: "your-api-key", // Required for Gemini API
  model: "gemini-2.0-flash-exp",
});
```

### Vertex AI (Production)

For production use with OAuth authentication and Google Cloud Platform:

```typescript
// Using service account key file
const voice = new GeminiLiveVoice({
  vertexAI: true,
  project: "your-gcp-project",
  location: "us-central1",
  serviceAccountKeyFile: "/path/to/service-account.json",
});

// Using Application Default Credentials
const voice = new GeminiLiveVoice({
  vertexAI: true,
  project: "your-gcp-project",
  location: "us-central1",
});

// Using service account impersonation
const voice = new GeminiLiveVoice({
  vertexAI: true,
  project: "your-gcp-project",
  location: "us-central1",
  serviceAccountEmail: "service-account@project.iam.gserviceaccount.com",
});
```

## Advanced Features

### Session Management

The Gemini Live API supports session resumption for handling network interruptions:

```typescript
voice.on("sessionHandle", ({ handle, expiresAt }) => {
  // Store session handle for resumption
  saveSessionHandle(handle, expiresAt);
});

// Resume a previous session
const voice = new GeminiLiveVoice({
  sessionConfig: {
    enableResumption: true,
    maxDuration: "2h",
  },
});
```

### Tool Calling

Enable the model to call functions during conversations:

```typescript
import { z } from "zod";

voice.addTools({
  weather: {
    description: "Get weather information",
    parameters: z.object({
      location: z.string(),
    }),
    execute: async ({ location }) => {
      const weather = await getWeather(location);
      return weather;
    },
  },
});

voice.on("toolCall", ({ name, args, id }) => {
  console.log(`Tool called: ${name} with args:`, args);
});
```

## Notes

- The Gemini Live API uses WebSockets for real-time communication
- Audio is processed as 16kHz PCM16 for input and 24kHz PCM16 for output
- The voice instance must be connected with `connect()` before using other methods
- Always call `close()` when done to properly clean up resources
- Vertex AI authentication requires appropriate IAM permissions (`aiplatform.user` role)
- Session resumption allows recovery from network interruptions
- The API supports real-time interactions with text and audio
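
As a sketch of the input format noted above, pre-recorded audio can be streamed with `send()`. The file name and its contents are assumptions; `send()` and the 16kHz PCM16 input format are documented in this reference.

```typescript
import { createReadStream } from "node:fs";

// Assumes input.pcm contains raw 16-bit PCM at 16kHz (the documented
// input format) and that voice.connect() has already been called.
await voice.send(createReadStream("input.pcm"));
```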

package/dist/index.cjs
CHANGED

@@ -1510,7 +1510,8 @@ var GeminiLiveVoice = class _GeminiLiveVoice extends voice.MastraVoice {
    let wsUrl;
    let headers = {};
    if (this.options.vertexAI) {
-
+      const location = this.getVertexLocation();
+      wsUrl = `wss://${location}-aiplatform.googleapis.com/ws/google.cloud.aiplatform.v1beta1.LlmBidiService/BidiGenerateContent`;
      await this.authManager.initialize();
      const accessToken = await this.authManager.getAccessToken();
      headers = { headers: { Authorization: `Bearer ${accessToken}` } };
@@ -2259,6 +2260,18 @@ var GeminiLiveVoice = class _GeminiLiveVoice extends voice.MastraVoice {
          role: "assistant"
        });
      }
+      if (part.functionCall) {
+        this.log("Found function call in serverContent.modelTurn.parts", part.functionCall);
+        const toolCallData = {
+          toolCall: {
+            name: part.functionCall.name,
+            args: part.functionCall.args || {},
+            id: part.functionCall.id || crypto.randomUUID()
+          }
+        };
+        void this.handleToolCall(toolCallData);
+        continue;
+      }
      if (part.inlineData?.mimeType?.includes("audio") && typeof part.inlineData.data === "string") {
        try {
          const audioData = part.inlineData.data;
@@ -2333,9 +2346,24 @@ var GeminiLiveVoice = class _GeminiLiveVoice extends voice.MastraVoice {
    if (!data.toolCall) {
      return;
    }
-
-
-
+    let toolCalls = [];
+    if (data.toolCall.functionCalls && Array.isArray(data.toolCall.functionCalls)) {
+      toolCalls = data.toolCall.functionCalls;
+    } else if (data.toolCall.name) {
+      toolCalls = [{ name: data.toolCall.name, args: data.toolCall.args, id: data.toolCall.id }];
+    }
+    for (const toolCall of toolCalls) {
+      const toolName = toolCall.name || "";
+      const toolArgs = toolCall.args || {};
+      const toolId = toolCall.id || crypto.randomUUID();
+      await this.processSingleToolCall(toolName, toolArgs, toolId);
+    }
+  }
+  /**
+   * Process a single tool call
+   * @private
+   */
+  async processSingleToolCall(toolName, toolArgs, toolId) {
    this.log("Processing tool call", { toolName, toolArgs, toolId });
    this.emit("toolCall", {
      name: toolName,
@@ -2355,36 +2383,38 @@ var GeminiLiveVoice = class _GeminiLiveVoice extends voice.MastraVoice {
      let result;
      if (tool.execute) {
        this.log("Executing tool", { toolName, toolArgs });
-        result = await tool.execute(
-          { context: toolArgs, requestContext: this.requestContext },
-          {
-            toolCallId: toolId,
-            messages: []
-          }
-        );
+        result = await tool.execute(toolArgs, { requestContext: this.requestContext });
        this.log("Tool executed successfully", { toolName, result });
      } else {
        this.log("Tool has no execute function", { toolName });
        result = { error: "Tool has no execute function" };
      }
      const toolResultMessage = {
-
-
-
+        toolResponse: {
+          functionResponses: [
+            {
+              id: toolId,
+              response: result
+            }
+          ]
        }
      };
-      this.sendEvent("
+      this.sendEvent("toolResponse", toolResultMessage);
      this.log("Tool result sent", { toolName, toolId, result });
    } catch (error) {
      const errorMessage = error instanceof Error ? error.message : "Unknown error";
      this.log("Tool execution failed", { toolName, error: errorMessage });
      const errorResultMessage = {
-
-
-
+        toolResponse: {
+          functionResponses: [
+            {
+              id: toolId,
+              response: { error: errorMessage }
+            }
+          ]
        }
      };
-      this.sendEvent("
+      this.sendEvent("toolResponse", errorResultMessage);
      this.createAndEmitError("tool_execution_error" /* TOOL_EXECUTION_ERROR */, `Tool execution failed: ${errorMessage}`, {
        toolName,
        toolArgs,
@@ -2444,6 +2474,31 @@ var GeminiLiveVoice = class _GeminiLiveVoice extends voice.MastraVoice {
    }
    return "text";
  }
+  /**
+   * Resolve Vertex AI location with sensible default
+   * @private
+   */
+  getVertexLocation() {
+    return this.options.location?.trim() || "us-central1";
+  }
+  /**
+   * Resolve the correct model identifier for Gemini API or Vertex AI
+   * @private
+   */
+  resolveModelIdentifier() {
+    const model = this.options.model ?? DEFAULT_MODEL;
+    if (!this.options.vertexAI) {
+      return `models/${model}`;
+    }
+    if (!this.options.project) {
+      throw this.createAndEmitError(
+        "project_id_missing" /* PROJECT_ID_MISSING */,
+        "Google Cloud project ID is required when using Vertex AI."
+      );
+    }
+    const location = this.getVertexLocation();
+    return `projects/${this.options.project}/locations/${location}/publishers/google/models/${model}`;
+  }
  /**
   * Send initial configuration to Gemini Live API
   * @private
@@ -2454,7 +2509,7 @@ var GeminiLiveVoice = class _GeminiLiveVoice extends voice.MastraVoice {
    }
    const setupMessage = {
      setup: {
-        model:
+        model: this.resolveModelIdentifier()
      }
    };
    if (this.options.instructions) {
@@ -2603,6 +2658,8 @@ var GeminiLiveVoice = class _GeminiLiveVoice extends voice.MastraVoice {
      message = data;
    } else if (type === "realtime_input" && data.realtime_input) {
      message = data;
+    } else if (type === "toolResponse" && data.toolResponse) {
+      message = data;
    } else if (type === "session.update" && data.session) {
      message = data;
    } else {
@@ -2628,14 +2685,14 @@ var GeminiLiveVoice = class _GeminiLiveVoice extends voice.MastraVoice {
   * inputSchema: z.object({
   *   location: z.string().describe("The city and state, e.g. San Francisco, CA"),
   * }),
-  * execute: async (
+  * execute: async (inputData) => {
   *   // Fetch weather data from an API
   *   const response = await fetch(
-  *     `https://api.weather.com?location=${encodeURIComponent(
+  *     `https://api.weather.com?location=${encodeURIComponent(inputData.location)}`,
   *   );
   *   const data = await response.json();
   *   return {
-  *     message: `The current temperature in ${
+  *     message: `The current temperature in ${inputData.location} is ${data.temperature}°F with ${data.conditions}.`,
   *   };
   * },
   * });