@mobileai/react-native 0.4.5 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +80 -15
- package/lib/module/components/AIAgent.js +181 -38
- package/lib/module/components/AIAgent.js.map +1 -1
- package/lib/module/components/AgentChatBar.js +53 -29
- package/lib/module/components/AgentChatBar.js.map +1 -1
- package/lib/module/components/Icons.js +337 -0
- package/lib/module/components/Icons.js.map +1 -0
- package/lib/module/core/AgentRuntime.js +74 -3
- package/lib/module/core/AgentRuntime.js.map +1 -1
- package/lib/module/core/systemPrompt.js +87 -34
- package/lib/module/core/systemPrompt.js.map +1 -1
- package/lib/module/services/AudioInputService.js +73 -2
- package/lib/module/services/AudioInputService.js.map +1 -1
- package/lib/module/services/AudioOutputService.js +58 -5
- package/lib/module/services/AudioOutputService.js.map +1 -1
- package/lib/module/services/VoiceService.js +284 -239
- package/lib/module/services/VoiceService.js.map +1 -1
- package/lib/typescript/src/components/AIAgent.d.ts.map +1 -1
- package/lib/typescript/src/components/AgentChatBar.d.ts.map +1 -1
- package/lib/typescript/src/components/Icons.d.ts +43 -0
- package/lib/typescript/src/components/Icons.d.ts.map +1 -0
- package/lib/typescript/src/core/AgentRuntime.d.ts +12 -0
- package/lib/typescript/src/core/AgentRuntime.d.ts.map +1 -1
- package/lib/typescript/src/core/systemPrompt.d.ts +7 -4
- package/lib/typescript/src/core/systemPrompt.d.ts.map +1 -1
- package/lib/typescript/src/services/AudioInputService.d.ts +13 -0
- package/lib/typescript/src/services/AudioInputService.d.ts.map +1 -1
- package/lib/typescript/src/services/AudioOutputService.d.ts.map +1 -1
- package/lib/typescript/src/services/VoiceService.d.ts +41 -24
- package/lib/typescript/src/services/VoiceService.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/components/AIAgent.tsx +194 -38
- package/src/components/AgentChatBar.tsx +44 -25
- package/src/components/Icons.tsx +253 -0
- package/src/core/AgentRuntime.ts +70 -3
- package/src/core/systemPrompt.ts +87 -34
- package/src/services/AudioInputService.ts +77 -2
- package/src/services/AudioOutputService.ts +59 -5
- package/src/services/VoiceService.ts +280 -252
|
@@ -1,15 +1,26 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* VoiceService —
|
|
2
|
+
* VoiceService — @google/genai SDK Live API connection.
|
|
3
|
+
*
|
|
4
|
+
* Uses the official `ai.live.connect()` method instead of raw WebSocket.
|
|
5
|
+
* This fixes function calling reliability: the SDK handles protocol details
|
|
6
|
+
* (binary framing, message transforms, model name prefixes) that our
|
|
7
|
+
* previous raw WebSocket implementation missed.
|
|
3
8
|
*
|
|
4
9
|
* Handles bidirectional audio streaming between the app and Gemini:
|
|
5
10
|
* - Sends PCM 16kHz 16-bit audio chunks (mic input)
|
|
6
11
|
* - Receives PCM 24kHz 16-bit audio chunks (AI responses)
|
|
7
12
|
* - Receives function calls (tap, navigate, etc.) for agentic actions
|
|
8
|
-
* - Sends screen context (DOM text
|
|
9
|
-
*
|
|
10
|
-
* Protocol: wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent
|
|
13
|
+
* - Sends screen context (DOM text) for live mode
|
|
11
14
|
*/
|
|
12
15
|
|
|
16
|
+
// Platform-specific import: Metro can't resolve '@google/genai/web' sub-path
|
|
17
|
+
// export, so we use the full path to the web bundle. This is what the SDK
|
|
18
|
+
// recommends ('use a platform specific import') — RN's WebSocket API is
|
|
19
|
+
// browser-compatible so the web bundle works correctly.
|
|
20
|
+
// @ts-ignore — TS can't find declarations for the deep path
|
|
21
|
+
import { GoogleGenAI, Modality } from '@google/genai/dist/web/index.mjs';
|
|
22
|
+
// @ts-ignore
|
|
23
|
+
import type { Session } from '@google/genai/dist/web/index.mjs';
|
|
13
24
|
import { logger } from '../utils/logger';
|
|
14
25
|
import type { ToolDefinition } from '../core/types';
|
|
15
26
|
|
|
@@ -34,25 +45,26 @@ export interface VoiceServiceCallbacks {
|
|
|
34
45
|
onError?: (error: string) => void;
|
|
35
46
|
/** Called when AI turn is complete (all audio sent) */
|
|
36
47
|
onTurnComplete?: () => void;
|
|
48
|
+
/** Called when SDK setup is complete — safe to send screen context */
|
|
49
|
+
onSetupComplete?: () => void;
|
|
37
50
|
}
|
|
38
51
|
|
|
39
52
|
export type VoiceStatus = 'disconnected' | 'connecting' | 'connected' | 'error';
|
|
40
53
|
|
|
41
54
|
// ─── Constants ─────────────────────────────────────────────────
|
|
42
55
|
|
|
43
|
-
const WS_HOST = 'generativelanguage.googleapis.com';
|
|
44
|
-
const WS_PATH = '/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent';
|
|
45
56
|
const DEFAULT_MODEL = 'gemini-2.5-flash-native-audio-preview-12-2025';
|
|
46
57
|
const DEFAULT_INPUT_SAMPLE_RATE = 16000;
|
|
47
58
|
|
|
48
59
|
// ─── Service ───────────────────────────────────────────────────
|
|
49
60
|
|
|
50
61
|
export class VoiceService {
|
|
51
|
-
private
|
|
62
|
+
private session: Session | null = null;
|
|
52
63
|
private config: VoiceServiceConfig;
|
|
53
64
|
private callbacks: VoiceServiceCallbacks = {};
|
|
54
|
-
|
|
65
|
+
public lastCallbacks: VoiceServiceCallbacks | null = null;
|
|
55
66
|
private _status: VoiceStatus = 'disconnected';
|
|
67
|
+
public intentionalDisconnect = false;
|
|
56
68
|
|
|
57
69
|
constructor(config: VoiceServiceConfig) {
|
|
58
70
|
this.config = config;
|
|
@@ -60,55 +72,115 @@ export class VoiceService {
|
|
|
60
72
|
|
|
61
73
|
// ─── Connection ────────────────────────────────────────────
|
|
62
74
|
|
|
63
|
-
|
|
64
|
-
|
|
75
|
+
/**
|
|
76
|
+
* Connect to Gemini Live API via the official SDK.
|
|
77
|
+
* Now async because `ai.live.connect()` returns a Promise.
|
|
78
|
+
*/
|
|
79
|
+
async connect(callbacks: VoiceServiceCallbacks): Promise<void> {
|
|
80
|
+
if (this.session) {
|
|
65
81
|
logger.info('VoiceService', 'Already connected');
|
|
66
82
|
return;
|
|
67
83
|
}
|
|
68
84
|
|
|
69
85
|
this.callbacks = callbacks;
|
|
86
|
+
this.lastCallbacks = callbacks;
|
|
70
87
|
this.setStatus('connecting');
|
|
88
|
+
this.intentionalDisconnect = false;
|
|
71
89
|
|
|
72
90
|
const model = this.config.model || DEFAULT_MODEL;
|
|
73
|
-
|
|
91
|
+
logger.info('VoiceService', `Connecting via SDK (model: ${model})`);
|
|
74
92
|
|
|
75
|
-
|
|
76
|
-
|
|
93
|
+
try {
|
|
94
|
+
const ai = new GoogleGenAI({ apiKey: this.config.apiKey });
|
|
77
95
|
|
|
78
|
-
|
|
79
|
-
logger.info('VoiceService', 'WebSocket connected, sending setup...');
|
|
80
|
-
this.sendSetup();
|
|
81
|
-
};
|
|
96
|
+
const toolDeclarations = this.buildToolDeclarations();
|
|
82
97
|
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
};
|
|
98
|
+
// Build SDK config matching the official docs pattern
|
|
99
|
+
const sdkConfig: Record<string, any> = {
|
|
100
|
+
responseModalities: [Modality.AUDIO],
|
|
101
|
+
};
|
|
88
102
|
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
};
|
|
103
|
+
// Enable transcription for debugging and UX
|
|
104
|
+
sdkConfig.inputAudioTranscription = {};
|
|
105
|
+
sdkConfig.outputAudioTranscription = {};
|
|
106
|
+
logger.info('VoiceService', 'Transcription enabled');
|
|
94
107
|
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
108
|
+
if (this.config.systemPrompt) {
|
|
109
|
+
sdkConfig.systemInstruction = {
|
|
110
|
+
parts: [{ text: this.config.systemPrompt }],
|
|
111
|
+
};
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
if (toolDeclarations.length > 0) {
|
|
115
|
+
sdkConfig.tools = [{ functionDeclarations: toolDeclarations }];
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
// FULL CONFIG DUMP — see exactly what we send to SDK
|
|
119
|
+
const configDump = JSON.stringify({
|
|
120
|
+
...sdkConfig,
|
|
121
|
+
systemInstruction: sdkConfig.systemInstruction ? '(present)' : '(none)',
|
|
122
|
+
tools: sdkConfig.tools ? `${toolDeclarations.length} declarations` : '(none)',
|
|
123
|
+
});
|
|
124
|
+
logger.info('VoiceService', `📋 SDK config: ${configDump}`);
|
|
125
|
+
logger.info('VoiceService', `📋 Tool names: ${toolDeclarations.map((t: any) => t.name).join(', ')}`);
|
|
126
|
+
|
|
127
|
+
const session = await ai.live.connect({
|
|
128
|
+
model: model,
|
|
129
|
+
config: sdkConfig,
|
|
130
|
+
callbacks: {
|
|
131
|
+
onopen: () => {
|
|
132
|
+
logger.info('VoiceService', '✅ SDK session connected');
|
|
133
|
+
this.setStatus('connected');
|
|
134
|
+
},
|
|
135
|
+
onmessage: (message: any) => {
|
|
136
|
+
this.handleSDKMessage(message);
|
|
137
|
+
},
|
|
138
|
+
onerror: (error: any) => {
|
|
139
|
+
const errDetail = error
|
|
140
|
+
? JSON.stringify(error, Object.getOwnPropertyNames(error)).substring(0, 500)
|
|
141
|
+
: 'null';
|
|
142
|
+
logger.error('VoiceService', `SDK error: ${errDetail}`);
|
|
143
|
+
this.setStatus('error');
|
|
144
|
+
this.callbacks.onError?.(error?.message || 'SDK connection error');
|
|
145
|
+
},
|
|
146
|
+
onclose: (event: any) => {
|
|
147
|
+
const closeDetail = event
|
|
148
|
+
? JSON.stringify(event, Object.getOwnPropertyNames(event)).substring(0, 500)
|
|
149
|
+
: 'null';
|
|
150
|
+
if (this.intentionalDisconnect) {
|
|
151
|
+
logger.info('VoiceService', `SDK session closed (intentional)`);
|
|
152
|
+
} else {
|
|
153
|
+
logger.error('VoiceService', `SDK session closed UNEXPECTEDLY — code: ${event?.code}, reason: ${event?.reason}, detail: ${closeDetail}`);
|
|
154
|
+
this.callbacks.onError?.(`Connection lost (code: ${event?.code || 'unknown'})`);
|
|
155
|
+
}
|
|
156
|
+
this.session = null;
|
|
157
|
+
this.setStatus('disconnected');
|
|
158
|
+
},
|
|
159
|
+
},
|
|
160
|
+
});
|
|
161
|
+
|
|
162
|
+
this.session = session;
|
|
163
|
+
logger.info('VoiceService', 'SDK session established');
|
|
164
|
+
|
|
165
|
+
} catch (error: any) {
|
|
166
|
+
logger.error('VoiceService', `Connection failed: ${error.message}`);
|
|
167
|
+
this.setStatus('error');
|
|
168
|
+
this.callbacks.onError?.(error.message || 'Failed to connect');
|
|
169
|
+
}
|
|
98
170
|
}
|
|
99
171
|
|
|
100
172
|
disconnect(): void {
|
|
101
|
-
if (this.
|
|
102
|
-
logger.info('VoiceService', 'Disconnecting...');
|
|
103
|
-
this.
|
|
104
|
-
this.
|
|
105
|
-
this.
|
|
173
|
+
if (this.session) {
|
|
174
|
+
logger.info('VoiceService', 'Disconnecting (intentional)...');
|
|
175
|
+
this.intentionalDisconnect = true;
|
|
176
|
+
this.session.close();
|
|
177
|
+
this.session = null;
|
|
106
178
|
this.setStatus('disconnected');
|
|
107
179
|
}
|
|
108
180
|
}
|
|
109
181
|
|
|
110
182
|
get isConnected(): boolean {
|
|
111
|
-
return this.
|
|
183
|
+
return this.session !== null && this._status === 'connected';
|
|
112
184
|
}
|
|
113
185
|
|
|
114
186
|
get currentStatus(): VoiceStatus {
|
|
@@ -117,275 +189,240 @@ export class VoiceService {
|
|
|
117
189
|
|
|
118
190
|
// ─── Send Audio ────────────────────────────────────────────
|
|
119
191
|
|
|
120
|
-
/** Send PCM audio chunk (base64 encoded)
|
|
192
|
+
/** Send PCM audio chunk (base64 encoded) via SDK's sendRealtimeInput */
|
|
121
193
|
private sendCount = 0;
|
|
122
194
|
sendAudio(base64Audio: string): void {
|
|
123
195
|
this.sendCount++;
|
|
124
|
-
if (!this.isConnected) {
|
|
125
|
-
|
|
196
|
+
if (!this.isConnected || !this.session) {
|
|
197
|
+
if (this.sendCount % 20 === 0) {
|
|
198
|
+
logger.warn('VoiceService', `sendAudio #${this.sendCount} DROPPED — not connected`);
|
|
199
|
+
}
|
|
126
200
|
return;
|
|
127
201
|
}
|
|
128
202
|
|
|
129
|
-
const
|
|
130
|
-
realtimeInput: {
|
|
131
|
-
audio: {
|
|
132
|
-
mimeType: `audio/pcm;rate=${this.config.inputSampleRate || DEFAULT_INPUT_SAMPLE_RATE}`,
|
|
133
|
-
data: base64Audio,
|
|
134
|
-
},
|
|
135
|
-
},
|
|
136
|
-
};
|
|
203
|
+
const mimeType = `audio/pcm;rate=${this.config.inputSampleRate || DEFAULT_INPUT_SAMPLE_RATE}`;
|
|
137
204
|
|
|
138
|
-
|
|
139
|
-
this.
|
|
205
|
+
// DEBUG: log the first 5 send calls, then every 10th
|
|
206
|
+
if (this.sendCount <= 5 || this.sendCount % 10 === 0) {
|
|
207
|
+
logger.info('VoiceService', `📡 sendAudio #${this.sendCount}: len=${base64Audio.length}, mime=${mimeType}, preview=${base64Audio.substring(0, 30)}...`);
|
|
208
|
+
}
|
|
209
|
+
|
|
210
|
+
try {
|
|
211
|
+
this.session.sendRealtimeInput({
|
|
212
|
+
audio: { data: base64Audio, mimeType },
|
|
213
|
+
});
|
|
214
|
+
// Log every 50th successful send to confirm data is reaching WebSocket
|
|
215
|
+
if (this.sendCount % 50 === 0) {
|
|
216
|
+
logger.info('VoiceService', `✅ sendAudio #${this.sendCount} OK — session.isOpen=${!!this.session}`);
|
|
217
|
+
}
|
|
218
|
+
} catch (error: any) {
|
|
219
|
+
logger.error('VoiceService', `❌ sendAudio EXCEPTION: ${error.message}\n${error.stack?.substring(0, 300)}`);
|
|
220
|
+
this.session = null;
|
|
221
|
+
this.setStatus('disconnected');
|
|
222
|
+
}
|
|
140
223
|
}
|
|
141
224
|
|
|
142
225
|
// ─── Send Text ─────────────────────────────────────────────
|
|
143
226
|
|
|
144
|
-
/** Send text message via
|
|
227
|
+
/** Send text message via SDK's sendClientContent */
|
|
145
228
|
sendText(text: string): void {
|
|
146
|
-
if (!this.isConnected) return;
|
|
147
|
-
|
|
148
|
-
const message = {
|
|
149
|
-
realtimeInput: { text },
|
|
150
|
-
};
|
|
229
|
+
if (!this.isConnected || !this.session) return;
|
|
151
230
|
|
|
152
|
-
|
|
231
|
+
logger.info('VoiceService', `🗣️ USER (text): "${text}"`);
|
|
232
|
+
try {
|
|
233
|
+
this.session.sendClientContent({
|
|
234
|
+
turns: [{ role: 'user', parts: [{ text }] }],
|
|
235
|
+
turnComplete: true,
|
|
236
|
+
});
|
|
237
|
+
} catch (error: any) {
|
|
238
|
+
logger.error('VoiceService', `sendText failed: ${error.message}`);
|
|
239
|
+
}
|
|
153
240
|
}
|
|
154
241
|
|
|
155
|
-
/**
|
|
156
|
-
*
|
|
157
|
-
* Uses
|
|
158
|
-
* WITHOUT triggering a model response. This is the "incremental content
|
|
159
|
-
* updates" pattern from the Gemini docs for establishing session context.
|
|
160
|
-
*
|
|
161
|
-
* Called once at connect + after each tool call (not on a timer).
|
|
162
|
-
* Screenshots are handled separately via the capture_screenshot tool.
|
|
242
|
+
/**
|
|
243
|
+
* Send DOM tree as passive context during live conversation.
|
|
244
|
+
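* NOTE(review): documented as using turnComplete: false (the model receives context without responding), but the call below passes turnComplete: true — confirm which is intended.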
|
|
163
245
|
*/
|
|
164
246
|
sendScreenContext(domText: string): void {
|
|
165
|
-
if (!this.isConnected) return;
|
|
247
|
+
if (!this.isConnected || !this.session) return;
|
|
166
248
|
|
|
167
|
-
|
|
168
|
-
|
|
249
|
+
try {
|
|
250
|
+
this.session.sendClientContent({
|
|
169
251
|
turns: [{ role: 'user', parts: [{ text: domText }] }],
|
|
170
|
-
turnComplete:
|
|
171
|
-
}
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
252
|
+
turnComplete: true,
|
|
253
|
+
});
|
|
254
|
+
logger.info('VoiceService', `📤 Screen context sent (${domText.length} chars)`);
|
|
255
|
+
} catch (error: any) {
|
|
256
|
+
logger.error('VoiceService', `sendScreenContext failed: ${error.message}`);
|
|
257
|
+
}
|
|
176
258
|
}
|
|
177
259
|
|
|
178
260
|
// ─── Send Function Response ────────────────────────────────
|
|
179
261
|
|
|
180
|
-
/** Send function call result back
|
|
262
|
+
/** Send function call result back via SDK's sendToolResponse */
|
|
181
263
|
sendFunctionResponse(name: string, id: string, result: any): void {
|
|
182
|
-
if (!this.isConnected) return;
|
|
183
|
-
|
|
184
|
-
const message = {
|
|
185
|
-
toolResponse: {
|
|
186
|
-
functionResponses: [{
|
|
187
|
-
name,
|
|
188
|
-
id,
|
|
189
|
-
response: result,
|
|
190
|
-
}],
|
|
191
|
-
},
|
|
192
|
-
};
|
|
264
|
+
if (!this.isConnected || !this.session) return;
|
|
193
265
|
|
|
194
266
|
logger.info('VoiceService', `📤 Sending tool response for ${name} (id=${id})`);
|
|
195
|
-
|
|
267
|
+
|
|
268
|
+
try {
|
|
269
|
+
this.session.sendToolResponse({
|
|
270
|
+
functionResponses: [{ name, id, response: result }],
|
|
271
|
+
});
|
|
272
|
+
} catch (error: any) {
|
|
273
|
+
logger.error('VoiceService', `sendFunctionResponse failed: ${error.message}`);
|
|
274
|
+
}
|
|
196
275
|
}
|
|
197
276
|
|
|
198
|
-
// ─── Internal:
|
|
277
|
+
// ─── Internal: Tool Declarations ───────────────────────────
|
|
199
278
|
|
|
200
|
-
|
|
201
|
-
|
|
279
|
+
/**
|
|
280
|
+
* Builds function declarations from configured tools.
|
|
281
|
+
* Converts BOOLEAN params to STRING (native audio model limitation).
|
|
282
|
+
*/
|
|
283
|
+
private buildToolDeclarations(): any[] {
|
|
284
|
+
if (!this.config.tools?.length) return [];
|
|
202
285
|
|
|
203
|
-
const
|
|
286
|
+
const validTools = this.config.tools.filter(t => t.name !== 'capture_screenshot');
|
|
287
|
+
if (validTools.length === 0) return [];
|
|
204
288
|
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
};
|
|
211
|
-
|
|
212
|
-
// For native audio models, language is enforced via system instructions.
|
|
213
|
-
// Explicitly setting speechConfig.languageCode causes silent API failures
|
|
214
|
-
// without a prebuiltVoiceConfig.
|
|
215
|
-
|
|
216
|
-
// Add system instruction if provided
|
|
217
|
-
if (this.config.systemPrompt) {
|
|
218
|
-
setup.systemInstruction = {
|
|
219
|
-
parts: [{ text: this.config.systemPrompt }],
|
|
289
|
+
return validTools.map(tool => {
|
|
290
|
+
const hasParams = Object.keys(tool.parameters || {}).length > 0;
|
|
291
|
+
const functionDecl: any = {
|
|
292
|
+
name: tool.name,
|
|
293
|
+
description: tool.description,
|
|
220
294
|
};
|
|
221
|
-
}
|
|
222
295
|
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
),
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
})),
|
|
245
|
-
}];
|
|
246
|
-
}
|
|
247
|
-
|
|
248
|
-
// Enable transcription
|
|
249
|
-
setup.inputAudioTranscription = {};
|
|
250
|
-
setup.outputAudioTranscription = {};
|
|
251
|
-
|
|
252
|
-
const setupMessage = { setup };
|
|
253
|
-
logger.info('VoiceService', `Sending setup (model: ${model}, tools: ${this.config.tools?.length || 0})`);
|
|
254
|
-
this.ws.send(JSON.stringify(setupMessage));
|
|
296
|
+
if (hasParams) {
|
|
297
|
+
functionDecl.parameters = {
|
|
298
|
+
type: 'OBJECT',
|
|
299
|
+
properties: Object.fromEntries(
|
|
300
|
+
Object.entries(tool.parameters).map(([key, param]) => {
|
|
301
|
+
let paramType = param.type.toUpperCase();
|
|
302
|
+
let desc = param.description;
|
|
303
|
+
if (paramType === 'BOOLEAN') {
|
|
304
|
+
paramType = 'STRING';
|
|
305
|
+
desc = `${desc} (use "true" or "false")`;
|
|
306
|
+
}
|
|
307
|
+
return [key, { type: paramType, description: desc }];
|
|
308
|
+
})
|
|
309
|
+
),
|
|
310
|
+
required: Object.entries(tool.parameters)
|
|
311
|
+
.filter(([, param]) => param.required)
|
|
312
|
+
.map(([key]) => key),
|
|
313
|
+
};
|
|
314
|
+
}
|
|
315
|
+
return functionDecl;
|
|
316
|
+
});
|
|
255
317
|
}
|
|
256
318
|
|
|
257
319
|
// ─── Internal: Message Handling ────────────────────────────
|
|
258
320
|
|
|
259
|
-
|
|
321
|
+
/**
|
|
322
|
+
* Handle messages from the SDK's onmessage callback.
|
|
323
|
+
* The SDK parses binary/JSON automatically — we get clean objects.
|
|
324
|
+
*
|
|
325
|
+
* Per official docs, tool calls come at the top level as
|
|
326
|
+
* `response.toolCall.functionCalls`.
|
|
327
|
+
*/
|
|
328
|
+
private handleSDKMessage(message: any): void {
|
|
260
329
|
try {
|
|
261
|
-
|
|
262
|
-
const
|
|
263
|
-
logger.info('VoiceService',
|
|
264
|
-
|
|
265
|
-
//
|
|
266
|
-
if (
|
|
267
|
-
|
|
268
|
-
|
|
330
|
+
// RAW MESSAGE DUMP — full session visibility
|
|
331
|
+
const msgKeys = Object.keys(message || {}).join(', ');
|
|
332
|
+
logger.info('VoiceService', `📨 SDK message keys: [${msgKeys}]`);
|
|
333
|
+
|
|
334
|
+
// Full raw dump for non-audio messages (audio is too large)
|
|
335
|
+
if (!message.serverContent?.modelTurn?.parts?.some((p: any) => p.inlineData)) {
|
|
336
|
+
const rawDump = JSON.stringify(message).substring(0, 1000);
|
|
337
|
+
logger.info('VoiceService', `📨 RAW: ${rawDump}`);
|
|
338
|
+
}
|
|
339
|
+
|
|
340
|
+
// Tool calls — top-level (per official docs)
|
|
341
|
+
if (message.toolCall?.functionCalls) {
|
|
342
|
+
this.handleToolCalls(message.toolCall.functionCalls);
|
|
269
343
|
return;
|
|
270
344
|
}
|
|
271
345
|
|
|
272
|
-
//
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
} catch (error: any) {
|
|
277
|
-
logger.error('VoiceService', `Error handling message: ${error.message}`);
|
|
278
|
-
}
|
|
279
|
-
}
|
|
346
|
+
// Server content (audio, text, transcripts, turn events)
|
|
347
|
+
if (message.serverContent) {
|
|
348
|
+
this.handleServerContent(message.serverContent);
|
|
349
|
+
}
|
|
280
350
|
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
if (data instanceof ArrayBuffer) {
|
|
286
|
-
bytes = new Uint8Array(data);
|
|
287
|
-
} else if (data instanceof Blob) {
|
|
288
|
-
// Blob handling — read as ArrayBuffer
|
|
289
|
-
const reader = new FileReader();
|
|
290
|
-
reader.onload = () => {
|
|
291
|
-
if (reader.result instanceof ArrayBuffer) {
|
|
292
|
-
this.processBinaryBytes(new Uint8Array(reader.result));
|
|
293
|
-
}
|
|
294
|
-
};
|
|
295
|
-
reader.readAsArrayBuffer(data);
|
|
296
|
-
return;
|
|
297
|
-
} else {
|
|
298
|
-
return;
|
|
351
|
+
// Setup complete acknowledgment
|
|
352
|
+
if (message.setupComplete !== undefined) {
|
|
353
|
+
logger.info('VoiceService', '✅ Setup complete — ready for audio');
|
|
354
|
+
this.callbacks.onSetupComplete?.();
|
|
299
355
|
}
|
|
300
356
|
|
|
301
|
-
|
|
357
|
+
// Error messages
|
|
358
|
+
if (message.error) {
|
|
359
|
+
logger.error('VoiceService', `Server error: ${JSON.stringify(message.error)}`);
|
|
360
|
+
this.callbacks.onError?.(message.error.message || 'Server error');
|
|
361
|
+
}
|
|
302
362
|
} catch (error: any) {
|
|
303
|
-
logger.error('VoiceService', `Error handling
|
|
363
|
+
logger.error('VoiceService', `Error handling SDK message: ${error.message}`);
|
|
304
364
|
}
|
|
305
365
|
}
|
|
306
366
|
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
} catch {
|
|
317
|
-
// Not JSON — treat as raw PCM audio
|
|
318
|
-
this.callbacks.onAudioResponse?.(this.arrayBufferToBase64(bytes.buffer as ArrayBuffer));
|
|
319
|
-
}
|
|
320
|
-
} else {
|
|
321
|
-
// Raw PCM audio data
|
|
322
|
-
this.callbacks.onAudioResponse?.(this.arrayBufferToBase64(bytes.buffer as ArrayBuffer));
|
|
367
|
+
/** Process tool calls from the model */
|
|
368
|
+
private handleToolCalls(functionCalls: any[]): void {
|
|
369
|
+
for (const fn of functionCalls) {
|
|
370
|
+
logger.info('VoiceService', `🎯 Tool call: ${fn.name}(${JSON.stringify(fn.args)}) [id=${fn.id}]`);
|
|
371
|
+
this.callbacks.onToolCall?.({
|
|
372
|
+
name: fn.name,
|
|
373
|
+
args: fn.args || {},
|
|
374
|
+
id: fn.id,
|
|
375
|
+
});
|
|
323
376
|
}
|
|
324
377
|
}
|
|
325
378
|
|
|
326
|
-
private
|
|
327
|
-
// Setup complete acknowledgment
|
|
328
|
-
if (message.setupComplete !== undefined) {
|
|
329
|
-
logger.info('VoiceService', '✅ Setup complete — ready for audio exchange');
|
|
330
|
-
this.setupComplete = true;
|
|
331
|
-
this.setStatus('connected');
|
|
332
|
-
return;
|
|
333
|
-
}
|
|
379
|
+
private audioResponseCount = 0;
|
|
334
380
|
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
381
|
+
/** Process server content (audio responses, transcripts, turn events) */
|
|
382
|
+
private handleServerContent(content: any): void {
|
|
383
|
+
// Log all keys for full visibility
|
|
384
|
+
const contentKeys = Object.keys(content || {}).join(', ');
|
|
385
|
+
logger.debug('VoiceService', `📦 serverContent keys: [${contentKeys}]`);
|
|
339
386
|
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
if (content.modelTurn?.parts) {
|
|
347
|
-
for (const part of content.modelTurn.parts) {
|
|
348
|
-
// Audio response
|
|
349
|
-
if (part.inlineData?.data) {
|
|
350
|
-
logger.info('VoiceService', `🔊 Audio response: ${part.inlineData.data.length} chars`);
|
|
351
|
-
this.callbacks.onAudioResponse?.(part.inlineData.data);
|
|
352
|
-
}
|
|
387
|
+
// Turn complete
|
|
388
|
+
if (content.turnComplete) {
|
|
389
|
+
logger.info('VoiceService', `🏁 Turn complete (audioChunks sent: ${this.audioResponseCount})`);
|
|
390
|
+
this.audioResponseCount = 0;
|
|
391
|
+
this.callbacks.onTurnComplete?.();
|
|
392
|
+
}
|
|
353
393
|
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
394
|
+
// Model output parts (audio + optional thinking text)
|
|
395
|
+
if (content.modelTurn?.parts) {
|
|
396
|
+
for (const part of content.modelTurn.parts) {
|
|
397
|
+
if (part.inlineData?.data) {
|
|
398
|
+
this.audioResponseCount++;
|
|
399
|
+
if (this.audioResponseCount <= 3 || this.audioResponseCount % 20 === 0) {
|
|
400
|
+
logger.info('VoiceService', `🔊 Audio chunk #${this.audioResponseCount}: ${part.inlineData.data.length} b64 chars, mime=${part.inlineData.mimeType || 'unknown'}`);
|
|
358
401
|
}
|
|
402
|
+
this.callbacks.onAudioResponse?.(part.inlineData.data);
|
|
403
|
+
}
|
|
404
|
+
if (part.text) {
|
|
405
|
+
logger.info('VoiceService', `🤖 MODEL: "${part.text}"`);
|
|
406
|
+
this.callbacks.onTranscript?.(part.text, true, 'model');
|
|
359
407
|
}
|
|
360
408
|
}
|
|
409
|
+
}
|
|
361
410
|
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
// Output transcription (model's speech-to-text)
|
|
368
|
-
if (content.outputTranscription?.text) {
|
|
369
|
-
this.callbacks.onTranscript?.(content.outputTranscription.text, true, 'model');
|
|
370
|
-
}
|
|
411
|
+
// Input transcription (user's speech-to-text)
|
|
412
|
+
if (content.inputTranscription?.text) {
|
|
413
|
+
logger.info('VoiceService', `🗣️ USER (voice): "${content.inputTranscription.text}"`);
|
|
414
|
+
this.callbacks.onTranscript?.(content.inputTranscription.text, true, 'user');
|
|
371
415
|
}
|
|
372
416
|
|
|
373
|
-
//
|
|
374
|
-
if (
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
this.callbacks.onToolCall?.({
|
|
378
|
-
name: fn.name,
|
|
379
|
-
args: fn.args || {},
|
|
380
|
-
id: fn.id,
|
|
381
|
-
});
|
|
382
|
-
}
|
|
417
|
+
// Output transcription (model's speech-to-text)
|
|
418
|
+
if (content.outputTranscription?.text) {
|
|
419
|
+
logger.info('VoiceService', `🤖 MODEL (voice): "${content.outputTranscription.text}"`);
|
|
420
|
+
this.callbacks.onTranscript?.(content.outputTranscription.text, true, 'model');
|
|
383
421
|
}
|
|
384
422
|
|
|
385
|
-
//
|
|
386
|
-
if (
|
|
387
|
-
|
|
388
|
-
this.callbacks.onError?.(message.error.message || 'Server error');
|
|
423
|
+
// Tool calls inside serverContent (some SDK versions deliver here)
|
|
424
|
+
if (content.toolCall?.functionCalls) {
|
|
425
|
+
this.handleToolCalls(content.toolCall.functionCalls);
|
|
389
426
|
}
|
|
390
427
|
}
|
|
391
428
|
|
|
@@ -395,13 +432,4 @@ export class VoiceService {
|
|
|
395
432
|
this._status = newStatus;
|
|
396
433
|
this.callbacks.onStatusChange?.(newStatus);
|
|
397
434
|
}
|
|
398
|
-
|
|
399
|
-
private arrayBufferToBase64(buffer: ArrayBuffer): string {
|
|
400
|
-
const bytes = new Uint8Array(buffer);
|
|
401
|
-
let binary = '';
|
|
402
|
-
for (let i = 0; i < bytes.byteLength; i++) {
|
|
403
|
-
binary += String.fromCharCode(bytes[i]!);
|
|
404
|
-
}
|
|
405
|
-
return btoa(binary);
|
|
406
|
-
}
|
|
407
435
|
}
|