react-native-agentic-ai 0.4.6 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +80 -4
- package/lib/module/components/AIAgent.js +179 -38
- package/lib/module/components/AIAgent.js.map +1 -1
- package/lib/module/components/AgentChatBar.js +53 -29
- package/lib/module/components/AgentChatBar.js.map +1 -1
- package/lib/module/components/Icons.js +337 -0
- package/lib/module/components/Icons.js.map +1 -0
- package/lib/module/core/AgentRuntime.js +74 -3
- package/lib/module/core/AgentRuntime.js.map +1 -1
- package/lib/module/core/systemPrompt.js +66 -39
- package/lib/module/core/systemPrompt.js.map +1 -1
- package/lib/module/index.js +3 -9
- package/lib/module/index.js.map +1 -1
- package/lib/module/services/AudioInputService.js +73 -2
- package/lib/module/services/AudioInputService.js.map +1 -1
- package/lib/module/services/AudioOutputService.js +58 -5
- package/lib/module/services/AudioOutputService.js.map +1 -1
- package/lib/module/services/VoiceService.js +281 -275
- package/lib/module/services/VoiceService.js.map +1 -1
- package/lib/typescript/src/components/AIAgent.d.ts.map +1 -1
- package/lib/typescript/src/components/AgentChatBar.d.ts.map +1 -1
- package/lib/typescript/src/components/Icons.d.ts +43 -0
- package/lib/typescript/src/components/Icons.d.ts.map +1 -0
- package/lib/typescript/src/core/AgentRuntime.d.ts +12 -0
- package/lib/typescript/src/core/AgentRuntime.d.ts.map +1 -1
- package/lib/typescript/src/core/systemPrompt.d.ts.map +1 -1
- package/lib/typescript/src/index.d.ts +4 -0
- package/lib/typescript/src/index.d.ts.map +1 -1
- package/lib/typescript/src/services/AudioInputService.d.ts +13 -0
- package/lib/typescript/src/services/AudioInputService.d.ts.map +1 -1
- package/lib/typescript/src/services/AudioOutputService.d.ts.map +1 -1
- package/lib/typescript/src/services/VoiceService.d.ts +38 -29
- package/lib/typescript/src/services/VoiceService.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/components/AIAgent.tsx +192 -39
- package/src/components/AgentChatBar.tsx +44 -25
- package/src/components/Icons.tsx +253 -0
- package/src/core/AgentRuntime.ts +70 -3
- package/src/core/systemPrompt.ts +66 -39
- package/src/index.ts +8 -8
- package/src/services/AudioInputService.ts +77 -2
- package/src/services/AudioOutputService.ts +59 -5
- package/src/services/VoiceService.ts +278 -290
|
@@ -1,85 +1,151 @@
|
|
|
1
1
|
"use strict";
|
|
2
2
|
|
|
3
3
|
/**
|
|
4
|
-
* VoiceService —
|
|
4
|
+
* VoiceService — @google/genai SDK Live API connection.
|
|
5
|
+
*
|
|
6
|
+
* Uses the official `ai.live.connect()` method instead of raw WebSocket.
|
|
7
|
+
* This fixes function calling reliability: the SDK handles protocol details
|
|
8
|
+
* (binary framing, message transforms, model name prefixes) that our
|
|
9
|
+
* previous raw WebSocket implementation missed.
|
|
5
10
|
*
|
|
6
11
|
* Handles bidirectional audio streaming between the app and Gemini:
|
|
7
12
|
* - Sends PCM 16kHz 16-bit audio chunks (mic input)
|
|
8
13
|
* - Receives PCM 24kHz 16-bit audio chunks (AI responses)
|
|
9
14
|
* - Receives function calls (tap, navigate, etc.) for agentic actions
|
|
10
|
-
* - Sends screen context (DOM text
|
|
11
|
-
*
|
|
12
|
-
* Protocol: wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent
|
|
15
|
+
* - Sends screen context (DOM text) for live mode
|
|
13
16
|
*/
|
|
14
17
|
|
|
18
|
+
// Platform-specific import: Metro can't resolve '@google/genai/web' sub-path
|
|
19
|
+
// export, so we use the full path to the web bundle. This is what the SDK
|
|
20
|
+
// recommends ('use a platform specific import') — RN's WebSocket API is
|
|
21
|
+
// browser-compatible so the web bundle works correctly.
|
|
22
|
+
// @ts-ignore — TS can't find declarations for the deep path
|
|
23
|
+
import { GoogleGenAI, Modality } from '@google/genai/dist/web/index.mjs';
|
|
24
|
+
// @ts-ignore
|
|
25
|
+
|
|
15
26
|
import { logger } from "../utils/logger.js";
|
|
16
27
|
|
|
17
28
|
// ─── Types ─────────────────────────────────────────────────────
|
|
18
29
|
|
|
19
30
|
// ─── Constants ─────────────────────────────────────────────────
|
|
20
31
|
|
|
21
|
-
const
|
|
22
|
-
const WS_PATH = '/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent';
|
|
23
|
-
// Use -09-2025: Google's own cookbook uses this model for Live API tool use.
|
|
24
|
-
// The -12-2025 model had server-side regressions with function calling
|
|
25
|
-
// and was deprecated March 19, 2026. The -09-2025 version has
|
|
26
|
-
// "improved function calling and better handling of speech cut-offs."
|
|
27
|
-
const DEFAULT_MODEL = 'gemini-2.5-flash-native-audio-preview-09-2025';
|
|
32
|
+
const DEFAULT_MODEL = 'gemini-2.5-flash-native-audio-preview-12-2025';
|
|
28
33
|
const DEFAULT_INPUT_SAMPLE_RATE = 16000;
|
|
29
34
|
|
|
30
35
|
// ─── Service ───────────────────────────────────────────────────
|
|
31
36
|
|
|
32
37
|
export class VoiceService {
|
|
33
|
-
|
|
38
|
+
session = null;
|
|
34
39
|
callbacks = {};
|
|
35
|
-
|
|
40
|
+
lastCallbacks = null;
|
|
36
41
|
_status = 'disconnected';
|
|
42
|
+
intentionalDisconnect = false;
|
|
37
43
|
constructor(config) {
|
|
38
44
|
this.config = config;
|
|
39
45
|
}
|
|
40
46
|
|
|
41
47
|
// ─── Connection ────────────────────────────────────────────
|
|
42
48
|
|
|
43
|
-
|
|
44
|
-
|
|
49
|
+
/**
|
|
50
|
+
* Connect to Gemini Live API via the official SDK.
|
|
51
|
+
* Now async because `ai.live.connect()` returns a Promise.
|
|
52
|
+
*/
|
|
53
|
+
async connect(callbacks) {
|
|
54
|
+
if (this.session) {
|
|
45
55
|
logger.info('VoiceService', 'Already connected');
|
|
46
56
|
return;
|
|
47
57
|
}
|
|
48
58
|
this.callbacks = callbacks;
|
|
59
|
+
this.lastCallbacks = callbacks;
|
|
49
60
|
this.setStatus('connecting');
|
|
61
|
+
this.intentionalDisconnect = false;
|
|
50
62
|
const model = this.config.model || DEFAULT_MODEL;
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
this.
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
63
|
+
logger.info('VoiceService', `Connecting via SDK (model: ${model})`);
|
|
64
|
+
try {
|
|
65
|
+
const ai = new GoogleGenAI({
|
|
66
|
+
apiKey: this.config.apiKey
|
|
67
|
+
});
|
|
68
|
+
const toolDeclarations = this.buildToolDeclarations();
|
|
69
|
+
|
|
70
|
+
// Build SDK config matching the official docs pattern
|
|
71
|
+
const sdkConfig = {
|
|
72
|
+
responseModalities: [Modality.AUDIO]
|
|
73
|
+
};
|
|
74
|
+
|
|
75
|
+
// Enable transcription for debugging and UX
|
|
76
|
+
sdkConfig.inputAudioTranscription = {};
|
|
77
|
+
sdkConfig.outputAudioTranscription = {};
|
|
78
|
+
logger.info('VoiceService', 'Transcription enabled');
|
|
79
|
+
if (this.config.systemPrompt) {
|
|
80
|
+
sdkConfig.systemInstruction = {
|
|
81
|
+
parts: [{
|
|
82
|
+
text: this.config.systemPrompt
|
|
83
|
+
}]
|
|
84
|
+
};
|
|
85
|
+
}
|
|
86
|
+
if (toolDeclarations.length > 0) {
|
|
87
|
+
sdkConfig.tools = [{
|
|
88
|
+
functionDeclarations: toolDeclarations
|
|
89
|
+
}];
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
// FULL CONFIG DUMP — see exactly what we send to SDK
|
|
93
|
+
const configDump = JSON.stringify({
|
|
94
|
+
...sdkConfig,
|
|
95
|
+
systemInstruction: sdkConfig.systemInstruction ? '(present)' : '(none)',
|
|
96
|
+
tools: sdkConfig.tools ? `${toolDeclarations.length} declarations` : '(none)'
|
|
97
|
+
});
|
|
98
|
+
logger.info('VoiceService', `📋 SDK config: ${configDump}`);
|
|
99
|
+
logger.info('VoiceService', `📋 Tool names: ${toolDeclarations.map(t => t.name).join(', ')}`);
|
|
100
|
+
const session = await ai.live.connect({
|
|
101
|
+
model: model,
|
|
102
|
+
config: sdkConfig,
|
|
103
|
+
callbacks: {
|
|
104
|
+
onopen: () => {
|
|
105
|
+
logger.info('VoiceService', '✅ SDK session connected');
|
|
106
|
+
this.setStatus('connected');
|
|
107
|
+
},
|
|
108
|
+
onmessage: message => {
|
|
109
|
+
this.handleSDKMessage(message);
|
|
110
|
+
},
|
|
111
|
+
onerror: error => {
|
|
112
|
+
const errDetail = error ? JSON.stringify(error, Object.getOwnPropertyNames(error)).substring(0, 500) : 'null';
|
|
113
|
+
logger.error('VoiceService', `SDK error: ${errDetail}`);
|
|
114
|
+
this.setStatus('error');
|
|
115
|
+
this.callbacks.onError?.(error?.message || 'SDK connection error');
|
|
116
|
+
},
|
|
117
|
+
onclose: event => {
|
|
118
|
+
const closeDetail = event ? JSON.stringify(event, Object.getOwnPropertyNames(event)).substring(0, 500) : 'null';
|
|
119
|
+
if (this.intentionalDisconnect) {
|
|
120
|
+
logger.info('VoiceService', `SDK session closed (intentional)`);
|
|
121
|
+
} else {
|
|
122
|
+
logger.error('VoiceService', `SDK session closed UNEXPECTEDLY — code: ${event?.code}, reason: ${event?.reason}, detail: ${closeDetail}`);
|
|
123
|
+
this.callbacks.onError?.(`Connection lost (code: ${event?.code || 'unknown'})`);
|
|
124
|
+
}
|
|
125
|
+
this.session = null;
|
|
126
|
+
this.setStatus('disconnected');
|
|
127
|
+
}
|
|
128
|
+
}
|
|
129
|
+
});
|
|
130
|
+
this.session = session;
|
|
131
|
+
logger.info('VoiceService', 'SDK session established');
|
|
132
|
+
} catch (error) {
|
|
133
|
+
logger.error('VoiceService', `Connection failed: ${error.message}`);
|
|
65
134
|
this.setStatus('error');
|
|
66
|
-
this.callbacks.onError?.(error.message || '
|
|
67
|
-
}
|
|
68
|
-
this.ws.onmessage = event => {
|
|
69
|
-
this.handleMessage(event);
|
|
70
|
-
};
|
|
135
|
+
this.callbacks.onError?.(error.message || 'Failed to connect');
|
|
136
|
+
}
|
|
71
137
|
}
|
|
72
138
|
disconnect() {
|
|
73
|
-
if (this.
|
|
74
|
-
logger.info('VoiceService', 'Disconnecting...');
|
|
75
|
-
this.
|
|
76
|
-
this.
|
|
77
|
-
this.
|
|
139
|
+
if (this.session) {
|
|
140
|
+
logger.info('VoiceService', 'Disconnecting (intentional)...');
|
|
141
|
+
this.intentionalDisconnect = true;
|
|
142
|
+
this.session.close();
|
|
143
|
+
this.session = null;
|
|
78
144
|
this.setStatus('disconnected');
|
|
79
145
|
}
|
|
80
146
|
}
|
|
81
147
|
get isConnected() {
|
|
82
|
-
return this.
|
|
148
|
+
return this.session !== null && this._status === 'connected';
|
|
83
149
|
}
|
|
84
150
|
get currentStatus() {
|
|
85
151
|
return this._status;
|
|
@@ -87,298 +153,246 @@ export class VoiceService {
|
|
|
87
153
|
|
|
88
154
|
// ─── Send Audio ────────────────────────────────────────────
|
|
89
155
|
|
|
90
|
-
/** Send PCM audio chunk (base64 encoded)
|
|
156
|
+
/** Send PCM audio chunk (base64 encoded) via SDK's sendRealtimeInput */
|
|
91
157
|
sendCount = 0;
|
|
92
158
|
sendAudio(base64Audio) {
|
|
93
159
|
this.sendCount++;
|
|
94
|
-
if (!this.isConnected) {
|
|
95
|
-
|
|
160
|
+
if (!this.isConnected || !this.session) {
|
|
161
|
+
if (this.sendCount % 20 === 0) {
|
|
162
|
+
logger.warn('VoiceService', `sendAudio #${this.sendCount} DROPPED — not connected`);
|
|
163
|
+
}
|
|
96
164
|
return;
|
|
97
165
|
}
|
|
98
|
-
const
|
|
99
|
-
|
|
166
|
+
const mimeType = `audio/pcm;rate=${this.config.inputSampleRate || DEFAULT_INPUT_SAMPLE_RATE}`;
|
|
167
|
+
|
|
168
|
+
// DEBUG: log the first 5 sends, then every 10th send
|
|
169
|
+
if (this.sendCount <= 5 || this.sendCount % 10 === 0) {
|
|
170
|
+
logger.info('VoiceService', `📡 sendAudio #${this.sendCount}: len=${base64Audio.length}, mime=${mimeType}, preview=${base64Audio.substring(0, 30)}...`);
|
|
171
|
+
}
|
|
172
|
+
try {
|
|
173
|
+
this.session.sendRealtimeInput({
|
|
100
174
|
audio: {
|
|
101
|
-
|
|
102
|
-
|
|
175
|
+
data: base64Audio,
|
|
176
|
+
mimeType
|
|
103
177
|
}
|
|
178
|
+
});
|
|
179
|
+
// Log every 50th successful send to confirm data is reaching WebSocket
|
|
180
|
+
if (this.sendCount % 50 === 0) {
|
|
181
|
+
logger.info('VoiceService', `✅ sendAudio #${this.sendCount} OK — session.isOpen=${!!this.session}`);
|
|
104
182
|
}
|
|
105
|
-
}
|
|
106
|
-
|
|
107
|
-
|
|
183
|
+
} catch (error) {
|
|
184
|
+
logger.error('VoiceService', `❌ sendAudio EXCEPTION: ${error.message}\n${error.stack?.substring(0, 300)}`);
|
|
185
|
+
this.session = null;
|
|
186
|
+
this.setStatus('disconnected');
|
|
187
|
+
}
|
|
108
188
|
}
|
|
109
189
|
|
|
110
190
|
// ─── Send Text ─────────────────────────────────────────────
|
|
111
191
|
|
|
112
|
-
/** Send text message via
|
|
192
|
+
/** Send text message via SDK's sendClientContent */
|
|
113
193
|
sendText(text) {
|
|
114
|
-
if (!this.isConnected) return;
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
194
|
+
if (!this.isConnected || !this.session) return;
|
|
195
|
+
logger.info('VoiceService', `🗣️ USER (text): "${text}"`);
|
|
196
|
+
try {
|
|
197
|
+
this.session.sendClientContent({
|
|
198
|
+
turns: [{
|
|
199
|
+
role: 'user',
|
|
200
|
+
parts: [{
|
|
201
|
+
text
|
|
202
|
+
}]
|
|
203
|
+
}],
|
|
204
|
+
turnComplete: true
|
|
205
|
+
});
|
|
206
|
+
} catch (error) {
|
|
207
|
+
logger.error('VoiceService', `sendText failed: ${error.message}`);
|
|
208
|
+
}
|
|
121
209
|
}
|
|
122
210
|
|
|
123
|
-
/**
|
|
124
|
-
*
|
|
125
|
-
* Uses
|
|
126
|
-
* WITHOUT triggering a model response. This is the "incremental content
|
|
127
|
-
* updates" pattern from the Gemini docs for establishing session context.
|
|
128
|
-
*
|
|
129
|
-
* Called once at connect + after each tool call (not on a timer).
|
|
130
|
-
* Screenshots are handled separately via the capture_screenshot tool.
|
|
211
|
+
/**
|
|
212
|
+
* Send DOM tree as passive context during live conversation.
|
|
213
|
+
* NOTE: the call below passes turnComplete: true, so this is sent as a completed user turn, not as silent context.
|
|
131
214
|
*/
|
|
132
215
|
sendScreenContext(domText) {
|
|
133
|
-
if (!this.isConnected) return;
|
|
134
|
-
|
|
135
|
-
|
|
216
|
+
if (!this.isConnected || !this.session) return;
|
|
217
|
+
try {
|
|
218
|
+
this.session.sendClientContent({
|
|
136
219
|
turns: [{
|
|
137
220
|
role: 'user',
|
|
138
221
|
parts: [{
|
|
139
222
|
text: domText
|
|
140
223
|
}]
|
|
141
224
|
}],
|
|
142
|
-
turnComplete:
|
|
143
|
-
}
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
225
|
+
turnComplete: true
|
|
226
|
+
});
|
|
227
|
+
logger.info('VoiceService', `📤 Screen context sent (${domText.length} chars)`);
|
|
228
|
+
} catch (error) {
|
|
229
|
+
logger.error('VoiceService', `sendScreenContext failed: ${error.message}`);
|
|
230
|
+
}
|
|
148
231
|
}
|
|
149
232
|
|
|
150
233
|
// ─── Send Function Response ────────────────────────────────
|
|
151
234
|
|
|
152
|
-
/** Send function call result back
|
|
235
|
+
/** Send function call result back via SDK's sendToolResponse */
|
|
153
236
|
sendFunctionResponse(name, id, result) {
|
|
154
|
-
if (!this.isConnected) return;
|
|
155
|
-
|
|
156
|
-
|
|
237
|
+
if (!this.isConnected || !this.session) return;
|
|
238
|
+
logger.info('VoiceService', `📤 Sending tool response for ${name} (id=${id})`);
|
|
239
|
+
try {
|
|
240
|
+
this.session.sendToolResponse({
|
|
157
241
|
functionResponses: [{
|
|
158
242
|
name,
|
|
159
243
|
id,
|
|
160
244
|
response: result
|
|
161
245
|
}]
|
|
162
|
-
}
|
|
163
|
-
}
|
|
164
|
-
|
|
165
|
-
|
|
246
|
+
});
|
|
247
|
+
} catch (error) {
|
|
248
|
+
logger.error('VoiceService', `sendFunctionResponse failed: ${error.message}`);
|
|
249
|
+
}
|
|
166
250
|
}
|
|
167
251
|
|
|
168
|
-
// ─── Internal:
|
|
252
|
+
// ─── Internal: Tool Declarations ───────────────────────────
|
|
169
253
|
|
|
170
254
|
/**
|
|
171
|
-
* Builds
|
|
172
|
-
*
|
|
173
|
-
*
|
|
174
|
-
* The agent_step tool flattens reasoning fields (previous_goal_eval,
|
|
175
|
-
* memory, plan) + action_name enum + all action parameters into a single
|
|
176
|
-
* function — matching GeminiProvider.buildAgentStepDeclaration exactly.
|
|
255
|
+
* Builds function declarations from configured tools.
|
|
256
|
+
* Converts BOOLEAN params to STRING (native audio model limitation).
|
|
177
257
|
*/
|
|
178
|
-
|
|
179
|
-
if (!this.
|
|
180
|
-
const
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
// The text thinking blocks are a trade-off for working function calling.
|
|
188
|
-
}
|
|
189
|
-
};
|
|
190
|
-
|
|
191
|
-
// Add system instruction if provided
|
|
192
|
-
if (this.config.systemPrompt) {
|
|
193
|
-
setup.systemInstruction = {
|
|
194
|
-
parts: [{
|
|
195
|
-
text: this.config.systemPrompt
|
|
196
|
-
}]
|
|
258
|
+
buildToolDeclarations() {
|
|
259
|
+
if (!this.config.tools?.length) return [];
|
|
260
|
+
const validTools = this.config.tools.filter(t => t.name !== 'capture_screenshot');
|
|
261
|
+
if (validTools.length === 0) return [];
|
|
262
|
+
return validTools.map(tool => {
|
|
263
|
+
const hasParams = Object.keys(tool.parameters || {}).length > 0;
|
|
264
|
+
const functionDecl = {
|
|
265
|
+
name: tool.name,
|
|
266
|
+
description: tool.description
|
|
197
267
|
};
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
const validTools = this.config.tools.filter(t => t.name !== 'capture_screenshot');
|
|
208
|
-
if (validTools.length > 0) {
|
|
209
|
-
setup.tools = [{
|
|
210
|
-
functionDeclarations: validTools.map(tool => {
|
|
211
|
-
const hasParams = Object.keys(tool.parameters || {}).length > 0;
|
|
212
|
-
const functionDecl = {
|
|
213
|
-
name: tool.name,
|
|
214
|
-
description: tool.description
|
|
215
|
-
};
|
|
216
|
-
if (hasParams) {
|
|
217
|
-
functionDecl.parameters = {
|
|
218
|
-
type: 'OBJECT',
|
|
219
|
-
properties: Object.fromEntries(Object.entries(tool.parameters).map(([key, param]) => {
|
|
220
|
-
// Native audio model crashes with BOOLEAN/ENUM types (error 1008)
|
|
221
|
-
// Convert to STRING as a workaround
|
|
222
|
-
let paramType = param.type.toUpperCase();
|
|
223
|
-
let desc = param.description;
|
|
224
|
-
if (paramType === 'BOOLEAN') {
|
|
225
|
-
paramType = 'STRING';
|
|
226
|
-
desc = `${desc} (use "true" or "false")`;
|
|
227
|
-
}
|
|
228
|
-
return [key, {
|
|
229
|
-
type: paramType,
|
|
230
|
-
description: desc
|
|
231
|
-
}];
|
|
232
|
-
})),
|
|
233
|
-
required: Object.entries(tool.parameters).filter(([, param]) => param.required).map(([key]) => key)
|
|
234
|
-
};
|
|
268
|
+
if (hasParams) {
|
|
269
|
+
functionDecl.parameters = {
|
|
270
|
+
type: 'OBJECT',
|
|
271
|
+
properties: Object.fromEntries(Object.entries(tool.parameters).map(([key, param]) => {
|
|
272
|
+
let paramType = param.type.toUpperCase();
|
|
273
|
+
let desc = param.description;
|
|
274
|
+
if (paramType === 'BOOLEAN') {
|
|
275
|
+
paramType = 'STRING';
|
|
276
|
+
desc = `${desc} (use "true" or "false")`;
|
|
235
277
|
}
|
|
236
|
-
return
|
|
237
|
-
|
|
238
|
-
|
|
278
|
+
return [key, {
|
|
279
|
+
type: paramType,
|
|
280
|
+
description: desc
|
|
281
|
+
}];
|
|
282
|
+
})),
|
|
283
|
+
required: Object.entries(tool.parameters).filter(([, param]) => param.required).map(([key]) => key)
|
|
284
|
+
};
|
|
239
285
|
}
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
setup
|
|
243
|
-
};
|
|
244
|
-
logger.info('VoiceService', `Sending setup (model: ${model}, ${this.config.tools?.length || 0} tools)`);
|
|
245
|
-
try {
|
|
246
|
-
const payload = JSON.stringify(setupMessage);
|
|
247
|
-
logger.info('VoiceService', `📤 Raw Setup Payload: ${payload}`);
|
|
248
|
-
this.ws.send(payload);
|
|
249
|
-
} catch (err) {
|
|
250
|
-
logger.error('VoiceService', `❌ Error stringifying setup message: ${err.message}`);
|
|
251
|
-
}
|
|
286
|
+
return functionDecl;
|
|
287
|
+
});
|
|
252
288
|
}
|
|
253
289
|
|
|
254
290
|
// ─── Internal: Message Handling ────────────────────────────
|
|
255
291
|
|
|
256
|
-
|
|
292
|
+
/**
|
|
293
|
+
* Handle messages from the SDK's onmessage callback.
|
|
294
|
+
* The SDK parses binary/JSON automatically — we get clean objects.
|
|
295
|
+
*
|
|
296
|
+
* Per official docs, tool calls come at the top level as
|
|
297
|
+
* `response.toolCall.functionCalls`.
|
|
298
|
+
*/
|
|
299
|
+
handleSDKMessage(message) {
|
|
257
300
|
try {
|
|
258
|
-
|
|
259
|
-
const
|
|
260
|
-
logger.info('VoiceService',
|
|
261
|
-
|
|
262
|
-
//
|
|
263
|
-
if (
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
return;
|
|
301
|
+
// RAW MESSAGE DUMP — full session visibility
|
|
302
|
+
const msgKeys = Object.keys(message || {}).join(', ');
|
|
303
|
+
logger.info('VoiceService', `📨 SDK message keys: [${msgKeys}]`);
|
|
304
|
+
|
|
305
|
+
// Full raw dump for non-audio messages (audio is too large)
|
|
306
|
+
if (!message.serverContent?.modelTurn?.parts?.some(p => p.inlineData)) {
|
|
307
|
+
const rawDump = JSON.stringify(message).substring(0, 1000);
|
|
308
|
+
logger.info('VoiceService', `📨 RAW: ${rawDump}`);
|
|
267
309
|
}
|
|
268
310
|
|
|
269
|
-
//
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
logger.info('VoiceService', `📥 Raw JSON Message: ${event.data.substring(0, 1000)}`);
|
|
273
|
-
this.processMessage(message);
|
|
274
|
-
} catch (error) {
|
|
275
|
-
logger.error('VoiceService', `Error handling message: ${error.message}`);
|
|
276
|
-
}
|
|
277
|
-
}
|
|
278
|
-
handleBinaryMessage(data) {
|
|
279
|
-
try {
|
|
280
|
-
// Try to decode as JSON first
|
|
281
|
-
let bytes;
|
|
282
|
-
if (data instanceof ArrayBuffer) {
|
|
283
|
-
bytes = new Uint8Array(data);
|
|
284
|
-
} else if (data instanceof Blob) {
|
|
285
|
-
// Blob handling — read as ArrayBuffer
|
|
286
|
-
const reader = new FileReader();
|
|
287
|
-
reader.onload = () => {
|
|
288
|
-
if (reader.result instanceof ArrayBuffer) {
|
|
289
|
-
this.processBinaryBytes(new Uint8Array(reader.result));
|
|
290
|
-
}
|
|
291
|
-
};
|
|
292
|
-
reader.readAsArrayBuffer(data);
|
|
293
|
-
return;
|
|
294
|
-
} else {
|
|
311
|
+
// Tool calls — top-level (per official docs)
|
|
312
|
+
if (message.toolCall?.functionCalls) {
|
|
313
|
+
this.handleToolCalls(message.toolCall.functionCalls);
|
|
295
314
|
return;
|
|
296
315
|
}
|
|
297
|
-
|
|
316
|
+
|
|
317
|
+
// Server content (audio, text, transcripts, turn events)
|
|
318
|
+
if (message.serverContent) {
|
|
319
|
+
this.handleServerContent(message.serverContent);
|
|
320
|
+
}
|
|
321
|
+
|
|
322
|
+
// Setup complete acknowledgment
|
|
323
|
+
if (message.setupComplete !== undefined) {
|
|
324
|
+
logger.info('VoiceService', '✅ Setup complete — ready for audio');
|
|
325
|
+
this.callbacks.onSetupComplete?.();
|
|
326
|
+
}
|
|
327
|
+
|
|
328
|
+
// Error messages
|
|
329
|
+
if (message.error) {
|
|
330
|
+
logger.error('VoiceService', `Server error: ${JSON.stringify(message.error)}`);
|
|
331
|
+
this.callbacks.onError?.(message.error.message || 'Server error');
|
|
332
|
+
}
|
|
298
333
|
} catch (error) {
|
|
299
|
-
logger.error('VoiceService', `Error handling
|
|
334
|
+
logger.error('VoiceService', `Error handling SDK message: ${error.message}`);
|
|
300
335
|
}
|
|
301
336
|
}
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
this.callbacks.onAudioResponse?.(this.arrayBufferToBase64(bytes.buffer));
|
|
313
|
-
}
|
|
314
|
-
} else {
|
|
315
|
-
// Raw PCM audio data
|
|
316
|
-
this.callbacks.onAudioResponse?.(this.arrayBufferToBase64(bytes.buffer));
|
|
337
|
+
|
|
338
|
+
/** Process tool calls from the model */
|
|
339
|
+
handleToolCalls(functionCalls) {
|
|
340
|
+
for (const fn of functionCalls) {
|
|
341
|
+
logger.info('VoiceService', `🎯 Tool call: ${fn.name}(${JSON.stringify(fn.args)}) [id=${fn.id}]`);
|
|
342
|
+
this.callbacks.onToolCall?.({
|
|
343
|
+
name: fn.name,
|
|
344
|
+
args: fn.args || {},
|
|
345
|
+
id: fn.id
|
|
346
|
+
});
|
|
317
347
|
}
|
|
318
348
|
}
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
349
|
+
audioResponseCount = 0;
|
|
350
|
+
|
|
351
|
+
/** Process server content (audio responses, transcripts, turn events) */
|
|
352
|
+
handleServerContent(content) {
|
|
353
|
+
// Log all keys for full visibility
|
|
354
|
+
const contentKeys = Object.keys(content || {}).join(', ');
|
|
355
|
+
logger.debug('VoiceService', `📦 serverContent keys: [${contentKeys}]`);
|
|
356
|
+
|
|
357
|
+
// Turn complete
|
|
358
|
+
if (content.turnComplete) {
|
|
359
|
+
logger.info('VoiceService', `🏁 Turn complete (audioChunks sent: ${this.audioResponseCount})`);
|
|
360
|
+
this.audioResponseCount = 0;
|
|
361
|
+
this.callbacks.onTurnComplete?.();
|
|
326
362
|
}
|
|
327
363
|
|
|
328
|
-
//
|
|
329
|
-
if (
|
|
330
|
-
const
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
this.callbacks.onTurnComplete?.();
|
|
336
|
-
}
|
|
337
|
-
|
|
338
|
-
// Process model output parts
|
|
339
|
-
if (content.modelTurn?.parts) {
|
|
340
|
-
for (const part of content.modelTurn.parts) {
|
|
341
|
-
// Audio response
|
|
342
|
-
if (part.inlineData?.data) {
|
|
343
|
-
logger.info('VoiceService', `🔊 Audio response: ${part.inlineData.data.length} chars`);
|
|
344
|
-
this.callbacks.onAudioResponse?.(part.inlineData.data);
|
|
345
|
-
}
|
|
346
|
-
|
|
347
|
-
// Text response (transcript)
|
|
348
|
-
if (part.text) {
|
|
349
|
-
logger.info('VoiceService', `💬 Text response: "${part.text}"`);
|
|
350
|
-
this.callbacks.onTranscript?.(part.text, true, 'model');
|
|
364
|
+
// Model output parts (audio + optional thinking text)
|
|
365
|
+
if (content.modelTurn?.parts) {
|
|
366
|
+
for (const part of content.modelTurn.parts) {
|
|
367
|
+
if (part.inlineData?.data) {
|
|
368
|
+
this.audioResponseCount++;
|
|
369
|
+
if (this.audioResponseCount <= 3 || this.audioResponseCount % 20 === 0) {
|
|
370
|
+
logger.info('VoiceService', `🔊 Audio chunk #${this.audioResponseCount}: ${part.inlineData.data.length} b64 chars, mime=${part.inlineData.mimeType || 'unknown'}`);
|
|
351
371
|
}
|
|
372
|
+
this.callbacks.onAudioResponse?.(part.inlineData.data);
|
|
373
|
+
}
|
|
374
|
+
if (part.text) {
|
|
375
|
+
logger.info('VoiceService', `🤖 MODEL: "${part.text}"`);
|
|
376
|
+
this.callbacks.onTranscript?.(part.text, true, 'model');
|
|
352
377
|
}
|
|
353
378
|
}
|
|
379
|
+
}
|
|
354
380
|
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
// Output transcription (model's speech-to-text)
|
|
361
|
-
if (content.outputTranscription?.text) {
|
|
362
|
-
this.callbacks.onTranscript?.(content.outputTranscription.text, true, 'model');
|
|
363
|
-
}
|
|
381
|
+
// Input transcription (user's speech-to-text)
|
|
382
|
+
if (content.inputTranscription?.text) {
|
|
383
|
+
logger.info('VoiceService', `🗣️ USER (voice): "${content.inputTranscription.text}"`);
|
|
384
|
+
this.callbacks.onTranscript?.(content.inputTranscription.text, true, 'user');
|
|
364
385
|
}
|
|
365
386
|
|
|
366
|
-
//
|
|
367
|
-
if (
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
this.callbacks.onToolCall?.({
|
|
371
|
-
name: fn.name,
|
|
372
|
-
args: fn.args || {},
|
|
373
|
-
id: fn.id
|
|
374
|
-
});
|
|
375
|
-
}
|
|
387
|
+
// Output transcription (model's speech-to-text)
|
|
388
|
+
if (content.outputTranscription?.text) {
|
|
389
|
+
logger.info('VoiceService', `🤖 MODEL (voice): "${content.outputTranscription.text}"`);
|
|
390
|
+
this.callbacks.onTranscript?.(content.outputTranscription.text, true, 'model');
|
|
376
391
|
}
|
|
377
392
|
|
|
378
|
-
//
|
|
379
|
-
if (
|
|
380
|
-
|
|
381
|
-
this.callbacks.onError?.(message.error.message || 'Server error');
|
|
393
|
+
// Tool calls inside serverContent (some SDK versions deliver here)
|
|
394
|
+
if (content.toolCall?.functionCalls) {
|
|
395
|
+
this.handleToolCalls(content.toolCall.functionCalls);
|
|
382
396
|
}
|
|
383
397
|
}
|
|
384
398
|
|
|
@@ -388,13 +402,5 @@ export class VoiceService {
|
|
|
388
402
|
this._status = newStatus;
|
|
389
403
|
this.callbacks.onStatusChange?.(newStatus);
|
|
390
404
|
}
|
|
391
|
-
arrayBufferToBase64(buffer) {
|
|
392
|
-
const bytes = new Uint8Array(buffer);
|
|
393
|
-
let binary = '';
|
|
394
|
-
for (let i = 0; i < bytes.byteLength; i++) {
|
|
395
|
-
binary += String.fromCharCode(bytes[i]);
|
|
396
|
-
}
|
|
397
|
-
return btoa(binary);
|
|
398
|
-
}
|
|
399
405
|
}
|
|
400
406
|
//# sourceMappingURL=VoiceService.js.map
|