@mobileai/react-native 0.4.5 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +80 -15
- package/lib/module/components/AIAgent.js +181 -38
- package/lib/module/components/AIAgent.js.map +1 -1
- package/lib/module/components/AgentChatBar.js +53 -29
- package/lib/module/components/AgentChatBar.js.map +1 -1
- package/lib/module/components/Icons.js +337 -0
- package/lib/module/components/Icons.js.map +1 -0
- package/lib/module/core/AgentRuntime.js +74 -3
- package/lib/module/core/AgentRuntime.js.map +1 -1
- package/lib/module/core/systemPrompt.js +87 -34
- package/lib/module/core/systemPrompt.js.map +1 -1
- package/lib/module/services/AudioInputService.js +73 -2
- package/lib/module/services/AudioInputService.js.map +1 -1
- package/lib/module/services/AudioOutputService.js +58 -5
- package/lib/module/services/AudioOutputService.js.map +1 -1
- package/lib/module/services/VoiceService.js +284 -239
- package/lib/module/services/VoiceService.js.map +1 -1
- package/lib/typescript/src/components/AIAgent.d.ts.map +1 -1
- package/lib/typescript/src/components/AgentChatBar.d.ts.map +1 -1
- package/lib/typescript/src/components/Icons.d.ts +43 -0
- package/lib/typescript/src/components/Icons.d.ts.map +1 -0
- package/lib/typescript/src/core/AgentRuntime.d.ts +12 -0
- package/lib/typescript/src/core/AgentRuntime.d.ts.map +1 -1
- package/lib/typescript/src/core/systemPrompt.d.ts +7 -4
- package/lib/typescript/src/core/systemPrompt.d.ts.map +1 -1
- package/lib/typescript/src/services/AudioInputService.d.ts +13 -0
- package/lib/typescript/src/services/AudioInputService.d.ts.map +1 -1
- package/lib/typescript/src/services/AudioOutputService.d.ts.map +1 -1
- package/lib/typescript/src/services/VoiceService.d.ts +41 -24
- package/lib/typescript/src/services/VoiceService.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/components/AIAgent.tsx +194 -38
- package/src/components/AgentChatBar.tsx +44 -25
- package/src/components/Icons.tsx +253 -0
- package/src/core/AgentRuntime.ts +70 -3
- package/src/core/systemPrompt.ts +87 -34
- package/src/services/AudioInputService.ts +77 -2
- package/src/services/AudioOutputService.ts +59 -5
- package/src/services/VoiceService.ts +280 -252
|
@@ -1,81 +1,151 @@
|
|
|
1
1
|
"use strict";
|
|
2
2
|
|
|
3
3
|
/**
|
|
4
|
-
* VoiceService —
|
|
4
|
+
* VoiceService — @google/genai SDK Live API connection.
|
|
5
|
+
*
|
|
6
|
+
* Uses the official `ai.live.connect()` method instead of raw WebSocket.
|
|
7
|
+
* This fixes function calling reliability: the SDK handles protocol details
|
|
8
|
+
* (binary framing, message transforms, model name prefixes) that our
|
|
9
|
+
* previous raw WebSocket implementation missed.
|
|
5
10
|
*
|
|
6
11
|
* Handles bidirectional audio streaming between the app and Gemini:
|
|
7
12
|
* - Sends PCM 16kHz 16-bit audio chunks (mic input)
|
|
8
13
|
* - Receives PCM 24kHz 16-bit audio chunks (AI responses)
|
|
9
14
|
* - Receives function calls (tap, navigate, etc.) for agentic actions
|
|
10
|
-
* - Sends screen context (DOM text
|
|
11
|
-
*
|
|
12
|
-
* Protocol: wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent
|
|
15
|
+
* - Sends screen context (DOM text) for live mode
|
|
13
16
|
*/
|
|
14
17
|
|
|
18
|
+
// Platform-specific import: Metro can't resolve '@google/genai/web' sub-path
|
|
19
|
+
// export, so we use the full path to the web bundle. This is what the SDK
|
|
20
|
+
// recommends ('use a platform specific import') — RN's WebSocket API is
|
|
21
|
+
// browser-compatible so the web bundle works correctly.
|
|
22
|
+
// @ts-ignore — TS can't find declarations for the deep path
|
|
23
|
+
import { GoogleGenAI, Modality } from '@google/genai/dist/web/index.mjs';
|
|
24
|
+
// @ts-ignore
|
|
25
|
+
|
|
15
26
|
import { logger } from "../utils/logger.js";
|
|
16
27
|
|
|
17
28
|
// ─── Types ─────────────────────────────────────────────────────
|
|
18
29
|
|
|
19
30
|
// ─── Constants ─────────────────────────────────────────────────
|
|
20
31
|
|
|
21
|
-
const WS_HOST = 'generativelanguage.googleapis.com';
|
|
22
|
-
const WS_PATH = '/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent';
|
|
23
32
|
const DEFAULT_MODEL = 'gemini-2.5-flash-native-audio-preview-12-2025';
|
|
24
33
|
const DEFAULT_INPUT_SAMPLE_RATE = 16000;
|
|
25
34
|
|
|
26
35
|
// ─── Service ───────────────────────────────────────────────────
|
|
27
36
|
|
|
28
37
|
export class VoiceService {
|
|
29
|
-
|
|
38
|
+
session = null;
|
|
30
39
|
callbacks = {};
|
|
31
|
-
|
|
40
|
+
lastCallbacks = null;
|
|
32
41
|
_status = 'disconnected';
|
|
42
|
+
intentionalDisconnect = false;
|
|
33
43
|
constructor(config) {
|
|
34
44
|
this.config = config;
|
|
35
45
|
}
|
|
36
46
|
|
|
37
47
|
// ─── Connection ────────────────────────────────────────────
|
|
38
48
|
|
|
39
|
-
|
|
40
|
-
|
|
49
|
+
/**
|
|
50
|
+
* Connect to Gemini Live API via the official SDK.
|
|
51
|
+
* Now async because `ai.live.connect()` returns a Promise.
|
|
52
|
+
*/
|
|
53
|
+
async connect(callbacks) {
|
|
54
|
+
if (this.session) {
|
|
41
55
|
logger.info('VoiceService', 'Already connected');
|
|
42
56
|
return;
|
|
43
57
|
}
|
|
44
58
|
this.callbacks = callbacks;
|
|
59
|
+
this.lastCallbacks = callbacks;
|
|
45
60
|
this.setStatus('connecting');
|
|
61
|
+
this.intentionalDisconnect = false;
|
|
46
62
|
const model = this.config.model || DEFAULT_MODEL;
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
this.
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
63
|
+
logger.info('VoiceService', `Connecting via SDK (model: ${model})`);
|
|
64
|
+
try {
|
|
65
|
+
const ai = new GoogleGenAI({
|
|
66
|
+
apiKey: this.config.apiKey
|
|
67
|
+
});
|
|
68
|
+
const toolDeclarations = this.buildToolDeclarations();
|
|
69
|
+
|
|
70
|
+
// Build SDK config matching the official docs pattern
|
|
71
|
+
const sdkConfig = {
|
|
72
|
+
responseModalities: [Modality.AUDIO]
|
|
73
|
+
};
|
|
74
|
+
|
|
75
|
+
// Enable transcription for debugging and UX
|
|
76
|
+
sdkConfig.inputAudioTranscription = {};
|
|
77
|
+
sdkConfig.outputAudioTranscription = {};
|
|
78
|
+
logger.info('VoiceService', 'Transcription enabled');
|
|
79
|
+
if (this.config.systemPrompt) {
|
|
80
|
+
sdkConfig.systemInstruction = {
|
|
81
|
+
parts: [{
|
|
82
|
+
text: this.config.systemPrompt
|
|
83
|
+
}]
|
|
84
|
+
};
|
|
85
|
+
}
|
|
86
|
+
if (toolDeclarations.length > 0) {
|
|
87
|
+
sdkConfig.tools = [{
|
|
88
|
+
functionDeclarations: toolDeclarations
|
|
89
|
+
}];
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
// FULL CONFIG DUMP — see exactly what we send to SDK
|
|
93
|
+
const configDump = JSON.stringify({
|
|
94
|
+
...sdkConfig,
|
|
95
|
+
systemInstruction: sdkConfig.systemInstruction ? '(present)' : '(none)',
|
|
96
|
+
tools: sdkConfig.tools ? `${toolDeclarations.length} declarations` : '(none)'
|
|
97
|
+
});
|
|
98
|
+
logger.info('VoiceService', `📋 SDK config: ${configDump}`);
|
|
99
|
+
logger.info('VoiceService', `📋 Tool names: ${toolDeclarations.map(t => t.name).join(', ')}`);
|
|
100
|
+
const session = await ai.live.connect({
|
|
101
|
+
model: model,
|
|
102
|
+
config: sdkConfig,
|
|
103
|
+
callbacks: {
|
|
104
|
+
onopen: () => {
|
|
105
|
+
logger.info('VoiceService', '✅ SDK session connected');
|
|
106
|
+
this.setStatus('connected');
|
|
107
|
+
},
|
|
108
|
+
onmessage: message => {
|
|
109
|
+
this.handleSDKMessage(message);
|
|
110
|
+
},
|
|
111
|
+
onerror: error => {
|
|
112
|
+
const errDetail = error ? JSON.stringify(error, Object.getOwnPropertyNames(error)).substring(0, 500) : 'null';
|
|
113
|
+
logger.error('VoiceService', `SDK error: ${errDetail}`);
|
|
114
|
+
this.setStatus('error');
|
|
115
|
+
this.callbacks.onError?.(error?.message || 'SDK connection error');
|
|
116
|
+
},
|
|
117
|
+
onclose: event => {
|
|
118
|
+
const closeDetail = event ? JSON.stringify(event, Object.getOwnPropertyNames(event)).substring(0, 500) : 'null';
|
|
119
|
+
if (this.intentionalDisconnect) {
|
|
120
|
+
logger.info('VoiceService', `SDK session closed (intentional)`);
|
|
121
|
+
} else {
|
|
122
|
+
logger.error('VoiceService', `SDK session closed UNEXPECTEDLY — code: ${event?.code}, reason: ${event?.reason}, detail: ${closeDetail}`);
|
|
123
|
+
this.callbacks.onError?.(`Connection lost (code: ${event?.code || 'unknown'})`);
|
|
124
|
+
}
|
|
125
|
+
this.session = null;
|
|
126
|
+
this.setStatus('disconnected');
|
|
127
|
+
}
|
|
128
|
+
}
|
|
129
|
+
});
|
|
130
|
+
this.session = session;
|
|
131
|
+
logger.info('VoiceService', 'SDK session established');
|
|
132
|
+
} catch (error) {
|
|
133
|
+
logger.error('VoiceService', `Connection failed: ${error.message}`);
|
|
61
134
|
this.setStatus('error');
|
|
62
|
-
this.callbacks.onError?.(error.message || '
|
|
63
|
-
}
|
|
64
|
-
this.ws.onmessage = event => {
|
|
65
|
-
this.handleMessage(event);
|
|
66
|
-
};
|
|
135
|
+
this.callbacks.onError?.(error.message || 'Failed to connect');
|
|
136
|
+
}
|
|
67
137
|
}
|
|
68
138
|
disconnect() {
|
|
69
|
-
if (this.
|
|
70
|
-
logger.info('VoiceService', 'Disconnecting...');
|
|
71
|
-
this.
|
|
72
|
-
this.
|
|
73
|
-
this.
|
|
139
|
+
if (this.session) {
|
|
140
|
+
logger.info('VoiceService', 'Disconnecting (intentional)...');
|
|
141
|
+
this.intentionalDisconnect = true;
|
|
142
|
+
this.session.close();
|
|
143
|
+
this.session = null;
|
|
74
144
|
this.setStatus('disconnected');
|
|
75
145
|
}
|
|
76
146
|
}
|
|
77
147
|
get isConnected() {
|
|
78
|
-
return this.
|
|
148
|
+
return this.session !== null && this._status === 'connected';
|
|
79
149
|
}
|
|
80
150
|
get currentStatus() {
|
|
81
151
|
return this._status;
|
|
@@ -83,263 +153,246 @@ export class VoiceService {
|
|
|
83
153
|
|
|
84
154
|
// ─── Send Audio ────────────────────────────────────────────
|
|
85
155
|
|
|
86
|
-
/** Send PCM audio chunk (base64 encoded)
|
|
156
|
+
/** Send PCM audio chunk (base64 encoded) via SDK's sendRealtimeInput */
|
|
87
157
|
sendCount = 0;
|
|
88
158
|
sendAudio(base64Audio) {
|
|
89
159
|
this.sendCount++;
|
|
90
|
-
if (!this.isConnected) {
|
|
91
|
-
|
|
160
|
+
if (!this.isConnected || !this.session) {
|
|
161
|
+
if (this.sendCount % 20 === 0) {
|
|
162
|
+
logger.warn('VoiceService', `sendAudio #${this.sendCount} DROPPED — not connected`);
|
|
163
|
+
}
|
|
92
164
|
return;
|
|
93
165
|
}
|
|
94
|
-
const
|
|
95
|
-
|
|
166
|
+
const mimeType = `audio/pcm;rate=${this.config.inputSampleRate || DEFAULT_INPUT_SAMPLE_RATE}`;
|
|
167
|
+
|
|
168
|
+
// DEBUG: log the first 5 send calls, then every 10th
|
|
169
|
+
if (this.sendCount <= 5 || this.sendCount % 10 === 0) {
|
|
170
|
+
logger.info('VoiceService', `📡 sendAudio #${this.sendCount}: len=${base64Audio.length}, mime=${mimeType}, preview=${base64Audio.substring(0, 30)}...`);
|
|
171
|
+
}
|
|
172
|
+
try {
|
|
173
|
+
this.session.sendRealtimeInput({
|
|
96
174
|
audio: {
|
|
97
|
-
|
|
98
|
-
|
|
175
|
+
data: base64Audio,
|
|
176
|
+
mimeType
|
|
99
177
|
}
|
|
178
|
+
});
|
|
179
|
+
// Log every 50th successful send to confirm data is reaching WebSocket
|
|
180
|
+
if (this.sendCount % 50 === 0) {
|
|
181
|
+
logger.info('VoiceService', `✅ sendAudio #${this.sendCount} OK — session.isOpen=${!!this.session}`);
|
|
100
182
|
}
|
|
101
|
-
}
|
|
102
|
-
|
|
103
|
-
|
|
183
|
+
} catch (error) {
|
|
184
|
+
logger.error('VoiceService', `❌ sendAudio EXCEPTION: ${error.message}\n${error.stack?.substring(0, 300)}`);
|
|
185
|
+
this.session = null;
|
|
186
|
+
this.setStatus('disconnected');
|
|
187
|
+
}
|
|
104
188
|
}
|
|
105
189
|
|
|
106
190
|
// ─── Send Text ─────────────────────────────────────────────
|
|
107
191
|
|
|
108
|
-
/** Send text message via
|
|
192
|
+
/** Send text message via SDK's sendClientContent */
|
|
109
193
|
sendText(text) {
|
|
110
|
-
if (!this.isConnected) return;
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
194
|
+
if (!this.isConnected || !this.session) return;
|
|
195
|
+
logger.info('VoiceService', `🗣️ USER (text): "${text}"`);
|
|
196
|
+
try {
|
|
197
|
+
this.session.sendClientContent({
|
|
198
|
+
turns: [{
|
|
199
|
+
role: 'user',
|
|
200
|
+
parts: [{
|
|
201
|
+
text
|
|
202
|
+
}]
|
|
203
|
+
}],
|
|
204
|
+
turnComplete: true
|
|
205
|
+
});
|
|
206
|
+
} catch (error) {
|
|
207
|
+
logger.error('VoiceService', `sendText failed: ${error.message}`);
|
|
208
|
+
}
|
|
117
209
|
}
|
|
118
210
|
|
|
119
|
-
/**
|
|
120
|
-
*
|
|
121
|
-
* Uses
|
|
122
|
-
* WITHOUT triggering a model response. This is the "incremental content
|
|
123
|
-
* updates" pattern from the Gemini docs for establishing session context.
|
|
124
|
-
*
|
|
125
|
-
* Called once at connect + after each tool call (not on a timer).
|
|
126
|
-
* Screenshots are handled separately via the capture_screenshot tool.
|
|
211
|
+
/**
|
|
212
|
+
* Send DOM tree as passive context during live conversation.
|
|
213
|
+
* NOTE(review): the call below passes turnComplete: true, so this is sent as a completed user turn — confirm the model is still expected not to respond.
|
|
127
214
|
*/
|
|
128
215
|
sendScreenContext(domText) {
|
|
129
|
-
if (!this.isConnected) return;
|
|
130
|
-
|
|
131
|
-
|
|
216
|
+
if (!this.isConnected || !this.session) return;
|
|
217
|
+
try {
|
|
218
|
+
this.session.sendClientContent({
|
|
132
219
|
turns: [{
|
|
133
220
|
role: 'user',
|
|
134
221
|
parts: [{
|
|
135
222
|
text: domText
|
|
136
223
|
}]
|
|
137
224
|
}],
|
|
138
|
-
turnComplete:
|
|
139
|
-
}
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
225
|
+
turnComplete: true
|
|
226
|
+
});
|
|
227
|
+
logger.info('VoiceService', `📤 Screen context sent (${domText.length} chars)`);
|
|
228
|
+
} catch (error) {
|
|
229
|
+
logger.error('VoiceService', `sendScreenContext failed: ${error.message}`);
|
|
230
|
+
}
|
|
143
231
|
}
|
|
144
232
|
|
|
145
233
|
// ─── Send Function Response ────────────────────────────────
|
|
146
234
|
|
|
147
|
-
/** Send function call result back
|
|
235
|
+
/** Send function call result back via SDK's sendToolResponse */
|
|
148
236
|
sendFunctionResponse(name, id, result) {
|
|
149
|
-
if (!this.isConnected) return;
|
|
150
|
-
|
|
151
|
-
|
|
237
|
+
if (!this.isConnected || !this.session) return;
|
|
238
|
+
logger.info('VoiceService', `📤 Sending tool response for ${name} (id=${id})`);
|
|
239
|
+
try {
|
|
240
|
+
this.session.sendToolResponse({
|
|
152
241
|
functionResponses: [{
|
|
153
242
|
name,
|
|
154
243
|
id,
|
|
155
244
|
response: result
|
|
156
245
|
}]
|
|
157
|
-
}
|
|
158
|
-
}
|
|
159
|
-
|
|
160
|
-
|
|
246
|
+
});
|
|
247
|
+
} catch (error) {
|
|
248
|
+
logger.error('VoiceService', `sendFunctionResponse failed: ${error.message}`);
|
|
249
|
+
}
|
|
161
250
|
}
|
|
162
251
|
|
|
163
|
-
// ─── Internal:
|
|
252
|
+
// ─── Internal: Tool Declarations ───────────────────────────
|
|
164
253
|
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
// Add system instruction if provided
|
|
180
|
-
if (this.config.systemPrompt) {
|
|
181
|
-
setup.systemInstruction = {
|
|
182
|
-
parts: [{
|
|
183
|
-
text: this.config.systemPrompt
|
|
184
|
-
}]
|
|
254
|
+
/**
|
|
255
|
+
* Builds function declarations from configured tools.
|
|
256
|
+
* Converts BOOLEAN params to STRING (native audio model limitation).
|
|
257
|
+
*/
|
|
258
|
+
buildToolDeclarations() {
|
|
259
|
+
if (!this.config.tools?.length) return [];
|
|
260
|
+
const validTools = this.config.tools.filter(t => t.name !== 'capture_screenshot');
|
|
261
|
+
if (validTools.length === 0) return [];
|
|
262
|
+
return validTools.map(tool => {
|
|
263
|
+
const hasParams = Object.keys(tool.parameters || {}).length > 0;
|
|
264
|
+
const functionDecl = {
|
|
265
|
+
name: tool.name,
|
|
266
|
+
description: tool.description
|
|
185
267
|
};
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
type:
|
|
198
|
-
description:
|
|
199
|
-
}]
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
}
|
|
203
|
-
}
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
// Enable transcription
|
|
207
|
-
setup.inputAudioTranscription = {};
|
|
208
|
-
setup.outputAudioTranscription = {};
|
|
209
|
-
const setupMessage = {
|
|
210
|
-
setup
|
|
211
|
-
};
|
|
212
|
-
logger.info('VoiceService', `Sending setup (model: ${model}, tools: ${this.config.tools?.length || 0})`);
|
|
213
|
-
this.ws.send(JSON.stringify(setupMessage));
|
|
268
|
+
if (hasParams) {
|
|
269
|
+
functionDecl.parameters = {
|
|
270
|
+
type: 'OBJECT',
|
|
271
|
+
properties: Object.fromEntries(Object.entries(tool.parameters).map(([key, param]) => {
|
|
272
|
+
let paramType = param.type.toUpperCase();
|
|
273
|
+
let desc = param.description;
|
|
274
|
+
if (paramType === 'BOOLEAN') {
|
|
275
|
+
paramType = 'STRING';
|
|
276
|
+
desc = `${desc} (use "true" or "false")`;
|
|
277
|
+
}
|
|
278
|
+
return [key, {
|
|
279
|
+
type: paramType,
|
|
280
|
+
description: desc
|
|
281
|
+
}];
|
|
282
|
+
})),
|
|
283
|
+
required: Object.entries(tool.parameters).filter(([, param]) => param.required).map(([key]) => key)
|
|
284
|
+
};
|
|
285
|
+
}
|
|
286
|
+
return functionDecl;
|
|
287
|
+
});
|
|
214
288
|
}
|
|
215
289
|
|
|
216
290
|
// ─── Internal: Message Handling ────────────────────────────
|
|
217
291
|
|
|
218
|
-
|
|
292
|
+
/**
|
|
293
|
+
* Handle messages from the SDK's onmessage callback.
|
|
294
|
+
* The SDK parses binary/JSON automatically — we get clean objects.
|
|
295
|
+
*
|
|
296
|
+
* Per official docs, tool calls come at the top level as
|
|
297
|
+
* `response.toolCall.functionCalls`.
|
|
298
|
+
*/
|
|
299
|
+
handleSDKMessage(message) {
|
|
219
300
|
try {
|
|
220
|
-
|
|
221
|
-
const
|
|
222
|
-
logger.info('VoiceService',
|
|
223
|
-
|
|
224
|
-
//
|
|
225
|
-
if (
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
return;
|
|
301
|
+
// RAW MESSAGE DUMP — full session visibility
|
|
302
|
+
const msgKeys = Object.keys(message || {}).join(', ');
|
|
303
|
+
logger.info('VoiceService', `📨 SDK message keys: [${msgKeys}]`);
|
|
304
|
+
|
|
305
|
+
// Full raw dump for non-audio messages (audio is too large)
|
|
306
|
+
if (!message.serverContent?.modelTurn?.parts?.some(p => p.inlineData)) {
|
|
307
|
+
const rawDump = JSON.stringify(message).substring(0, 1000);
|
|
308
|
+
logger.info('VoiceService', `📨 RAW: ${rawDump}`);
|
|
229
309
|
}
|
|
230
310
|
|
|
231
|
-
//
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
this.processMessage(message);
|
|
235
|
-
} catch (error) {
|
|
236
|
-
logger.error('VoiceService', `Error handling message: ${error.message}`);
|
|
237
|
-
}
|
|
238
|
-
}
|
|
239
|
-
handleBinaryMessage(data) {
|
|
240
|
-
try {
|
|
241
|
-
// Try to decode as JSON first
|
|
242
|
-
let bytes;
|
|
243
|
-
if (data instanceof ArrayBuffer) {
|
|
244
|
-
bytes = new Uint8Array(data);
|
|
245
|
-
} else if (data instanceof Blob) {
|
|
246
|
-
// Blob handling — read as ArrayBuffer
|
|
247
|
-
const reader = new FileReader();
|
|
248
|
-
reader.onload = () => {
|
|
249
|
-
if (reader.result instanceof ArrayBuffer) {
|
|
250
|
-
this.processBinaryBytes(new Uint8Array(reader.result));
|
|
251
|
-
}
|
|
252
|
-
};
|
|
253
|
-
reader.readAsArrayBuffer(data);
|
|
254
|
-
return;
|
|
255
|
-
} else {
|
|
311
|
+
// Tool calls — top-level (per official docs)
|
|
312
|
+
if (message.toolCall?.functionCalls) {
|
|
313
|
+
this.handleToolCalls(message.toolCall.functionCalls);
|
|
256
314
|
return;
|
|
257
315
|
}
|
|
258
|
-
|
|
316
|
+
|
|
317
|
+
// Server content (audio, text, transcripts, turn events)
|
|
318
|
+
if (message.serverContent) {
|
|
319
|
+
this.handleServerContent(message.serverContent);
|
|
320
|
+
}
|
|
321
|
+
|
|
322
|
+
// Setup complete acknowledgment
|
|
323
|
+
if (message.setupComplete !== undefined) {
|
|
324
|
+
logger.info('VoiceService', '✅ Setup complete — ready for audio');
|
|
325
|
+
this.callbacks.onSetupComplete?.();
|
|
326
|
+
}
|
|
327
|
+
|
|
328
|
+
// Error messages
|
|
329
|
+
if (message.error) {
|
|
330
|
+
logger.error('VoiceService', `Server error: ${JSON.stringify(message.error)}`);
|
|
331
|
+
this.callbacks.onError?.(message.error.message || 'Server error');
|
|
332
|
+
}
|
|
259
333
|
} catch (error) {
|
|
260
|
-
logger.error('VoiceService', `Error handling
|
|
334
|
+
logger.error('VoiceService', `Error handling SDK message: ${error.message}`);
|
|
261
335
|
}
|
|
262
336
|
}
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
this.callbacks.onAudioResponse?.(this.arrayBufferToBase64(bytes.buffer));
|
|
274
|
-
}
|
|
275
|
-
} else {
|
|
276
|
-
// Raw PCM audio data
|
|
277
|
-
this.callbacks.onAudioResponse?.(this.arrayBufferToBase64(bytes.buffer));
|
|
337
|
+
|
|
338
|
+
/** Process tool calls from the model */
|
|
339
|
+
handleToolCalls(functionCalls) {
|
|
340
|
+
for (const fn of functionCalls) {
|
|
341
|
+
logger.info('VoiceService', `🎯 Tool call: ${fn.name}(${JSON.stringify(fn.args)}) [id=${fn.id}]`);
|
|
342
|
+
this.callbacks.onToolCall?.({
|
|
343
|
+
name: fn.name,
|
|
344
|
+
args: fn.args || {},
|
|
345
|
+
id: fn.id
|
|
346
|
+
});
|
|
278
347
|
}
|
|
279
348
|
}
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
349
|
+
audioResponseCount = 0;
|
|
350
|
+
|
|
351
|
+
/** Process server content (audio responses, transcripts, turn events) */
|
|
352
|
+
handleServerContent(content) {
|
|
353
|
+
// Log all keys for full visibility
|
|
354
|
+
const contentKeys = Object.keys(content || {}).join(', ');
|
|
355
|
+
logger.debug('VoiceService', `📦 serverContent keys: [${contentKeys}]`);
|
|
356
|
+
|
|
357
|
+
// Turn complete
|
|
358
|
+
if (content.turnComplete) {
|
|
359
|
+
logger.info('VoiceService', `🏁 Turn complete (audioChunks sent: ${this.audioResponseCount})`);
|
|
360
|
+
this.audioResponseCount = 0;
|
|
361
|
+
this.callbacks.onTurnComplete?.();
|
|
287
362
|
}
|
|
288
363
|
|
|
289
|
-
//
|
|
290
|
-
if (
|
|
291
|
-
const
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
this.callbacks.onTurnComplete?.();
|
|
297
|
-
}
|
|
298
|
-
|
|
299
|
-
// Process model output parts
|
|
300
|
-
if (content.modelTurn?.parts) {
|
|
301
|
-
for (const part of content.modelTurn.parts) {
|
|
302
|
-
// Audio response
|
|
303
|
-
if (part.inlineData?.data) {
|
|
304
|
-
logger.info('VoiceService', `🔊 Audio response: ${part.inlineData.data.length} chars`);
|
|
305
|
-
this.callbacks.onAudioResponse?.(part.inlineData.data);
|
|
306
|
-
}
|
|
307
|
-
|
|
308
|
-
// Text response (transcript)
|
|
309
|
-
if (part.text) {
|
|
310
|
-
logger.info('VoiceService', `💬 Text response: "${part.text}"`);
|
|
311
|
-
this.callbacks.onTranscript?.(part.text, true, 'model');
|
|
364
|
+
// Model output parts (audio + optional thinking text)
|
|
365
|
+
if (content.modelTurn?.parts) {
|
|
366
|
+
for (const part of content.modelTurn.parts) {
|
|
367
|
+
if (part.inlineData?.data) {
|
|
368
|
+
this.audioResponseCount++;
|
|
369
|
+
if (this.audioResponseCount <= 3 || this.audioResponseCount % 20 === 0) {
|
|
370
|
+
logger.info('VoiceService', `🔊 Audio chunk #${this.audioResponseCount}: ${part.inlineData.data.length} b64 chars, mime=${part.inlineData.mimeType || 'unknown'}`);
|
|
312
371
|
}
|
|
372
|
+
this.callbacks.onAudioResponse?.(part.inlineData.data);
|
|
373
|
+
}
|
|
374
|
+
if (part.text) {
|
|
375
|
+
logger.info('VoiceService', `🤖 MODEL: "${part.text}"`);
|
|
376
|
+
this.callbacks.onTranscript?.(part.text, true, 'model');
|
|
313
377
|
}
|
|
314
378
|
}
|
|
379
|
+
}
|
|
315
380
|
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
// Output transcription (model's speech-to-text)
|
|
322
|
-
if (content.outputTranscription?.text) {
|
|
323
|
-
this.callbacks.onTranscript?.(content.outputTranscription.text, true, 'model');
|
|
324
|
-
}
|
|
381
|
+
// Input transcription (user's speech-to-text)
|
|
382
|
+
if (content.inputTranscription?.text) {
|
|
383
|
+
logger.info('VoiceService', `🗣️ USER (voice): "${content.inputTranscription.text}"`);
|
|
384
|
+
this.callbacks.onTranscript?.(content.inputTranscription.text, true, 'user');
|
|
325
385
|
}
|
|
326
386
|
|
|
327
|
-
//
|
|
328
|
-
if (
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
this.callbacks.onToolCall?.({
|
|
332
|
-
name: fn.name,
|
|
333
|
-
args: fn.args || {},
|
|
334
|
-
id: fn.id
|
|
335
|
-
});
|
|
336
|
-
}
|
|
387
|
+
// Output transcription (model's speech-to-text)
|
|
388
|
+
if (content.outputTranscription?.text) {
|
|
389
|
+
logger.info('VoiceService', `🤖 MODEL (voice): "${content.outputTranscription.text}"`);
|
|
390
|
+
this.callbacks.onTranscript?.(content.outputTranscription.text, true, 'model');
|
|
337
391
|
}
|
|
338
392
|
|
|
339
|
-
//
|
|
340
|
-
if (
|
|
341
|
-
|
|
342
|
-
this.callbacks.onError?.(message.error.message || 'Server error');
|
|
393
|
+
// Tool calls inside serverContent (some SDK versions deliver here)
|
|
394
|
+
if (content.toolCall?.functionCalls) {
|
|
395
|
+
this.handleToolCalls(content.toolCall.functionCalls);
|
|
343
396
|
}
|
|
344
397
|
}
|
|
345
398
|
|
|
@@ -349,13 +402,5 @@ export class VoiceService {
|
|
|
349
402
|
this._status = newStatus;
|
|
350
403
|
this.callbacks.onStatusChange?.(newStatus);
|
|
351
404
|
}
|
|
352
|
-
arrayBufferToBase64(buffer) {
|
|
353
|
-
const bytes = new Uint8Array(buffer);
|
|
354
|
-
let binary = '';
|
|
355
|
-
for (let i = 0; i < bytes.byteLength; i++) {
|
|
356
|
-
binary += String.fromCharCode(bytes[i]);
|
|
357
|
-
}
|
|
358
|
-
return btoa(binary);
|
|
359
|
-
}
|
|
360
405
|
}
|
|
361
406
|
//# sourceMappingURL=VoiceService.js.map
|