@mobileai/react-native 0.4.6 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. package/README.md +80 -4
  2. package/lib/module/components/AIAgent.js +179 -38
  3. package/lib/module/components/AIAgent.js.map +1 -1
  4. package/lib/module/components/AgentChatBar.js +53 -29
  5. package/lib/module/components/AgentChatBar.js.map +1 -1
  6. package/lib/module/components/Icons.js +337 -0
  7. package/lib/module/components/Icons.js.map +1 -0
  8. package/lib/module/core/AgentRuntime.js +74 -3
  9. package/lib/module/core/AgentRuntime.js.map +1 -1
  10. package/lib/module/core/systemPrompt.js +57 -38
  11. package/lib/module/core/systemPrompt.js.map +1 -1
  12. package/lib/module/index.js +3 -9
  13. package/lib/module/index.js.map +1 -1
  14. package/lib/module/services/AudioInputService.js +73 -2
  15. package/lib/module/services/AudioInputService.js.map +1 -1
  16. package/lib/module/services/AudioOutputService.js +58 -5
  17. package/lib/module/services/AudioOutputService.js.map +1 -1
  18. package/lib/module/services/VoiceService.js +281 -275
  19. package/lib/module/services/VoiceService.js.map +1 -1
  20. package/lib/typescript/src/components/AIAgent.d.ts.map +1 -1
  21. package/lib/typescript/src/components/AgentChatBar.d.ts.map +1 -1
  22. package/lib/typescript/src/components/Icons.d.ts +43 -0
  23. package/lib/typescript/src/components/Icons.d.ts.map +1 -0
  24. package/lib/typescript/src/core/AgentRuntime.d.ts +12 -0
  25. package/lib/typescript/src/core/AgentRuntime.d.ts.map +1 -1
  26. package/lib/typescript/src/core/systemPrompt.d.ts.map +1 -1
  27. package/lib/typescript/src/index.d.ts +4 -0
  28. package/lib/typescript/src/index.d.ts.map +1 -1
  29. package/lib/typescript/src/services/AudioInputService.d.ts +13 -0
  30. package/lib/typescript/src/services/AudioInputService.d.ts.map +1 -1
  31. package/lib/typescript/src/services/AudioOutputService.d.ts.map +1 -1
  32. package/lib/typescript/src/services/VoiceService.d.ts +38 -29
  33. package/lib/typescript/src/services/VoiceService.d.ts.map +1 -1
  34. package/package.json +1 -1
  35. package/src/components/AIAgent.tsx +192 -39
  36. package/src/components/AgentChatBar.tsx +44 -25
  37. package/src/components/Icons.tsx +253 -0
  38. package/src/core/AgentRuntime.ts +70 -3
  39. package/src/core/systemPrompt.ts +57 -38
  40. package/src/index.ts +8 -8
  41. package/src/services/AudioInputService.ts +77 -2
  42. package/src/services/AudioOutputService.ts +59 -5
  43. package/src/services/VoiceService.ts +278 -290
@@ -1,85 +1,151 @@
1
1
  "use strict";
2
2
 
3
3
  /**
4
- * VoiceService — WebSocket connection to Gemini Live API.
4
+ * VoiceService — @google/genai SDK Live API connection.
5
+ *
6
+ * Uses the official `ai.live.connect()` method instead of raw WebSocket.
7
+ * This fixes function calling reliability: the SDK handles protocol details
8
+ * (binary framing, message transforms, model name prefixes) that our
9
+ * previous raw WebSocket implementation missed.
5
10
  *
6
11
  * Handles bidirectional audio streaming between the app and Gemini:
7
12
  * - Sends PCM 16kHz 16-bit audio chunks (mic input)
8
13
  * - Receives PCM 24kHz 16-bit audio chunks (AI responses)
9
14
  * - Receives function calls (tap, navigate, etc.) for agentic actions
10
- * - Sends screen context (DOM text + optional screenshot) for live mode
11
- *
12
- * Protocol: wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent
15
+ * - Sends screen context (DOM text) for live mode
13
16
  */
14
17
 
18
+ // Platform-specific import: Metro can't resolve '@google/genai/web' sub-path
19
+ // export, so we use the full path to the web bundle. This is what the SDK
20
+ // recommends ('use a platform specific import') — RN's WebSocket API is
21
+ // browser-compatible so the web bundle works correctly.
22
+ // @ts-ignore — TS can't find declarations for the deep path
23
+ import { GoogleGenAI, Modality } from '@google/genai/dist/web/index.mjs';
24
+ // @ts-ignore
25
+
15
26
  import { logger } from "../utils/logger.js";
16
27
 
17
28
  // ─── Types ─────────────────────────────────────────────────────
18
29
 
19
30
  // ─── Constants ─────────────────────────────────────────────────
20
31
 
21
- const WS_HOST = 'generativelanguage.googleapis.com';
22
- const WS_PATH = '/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent';
23
- // Use -09-2025: Google's own cookbook uses this model for Live API tool use.
24
- // The -12-2025 model had server-side regressions with function calling
25
- // and was deprecated March 19, 2026. The -09-2025 version has
26
- // "improved function calling and better handling of speech cut-offs."
27
- const DEFAULT_MODEL = 'gemini-2.5-flash-native-audio-preview-09-2025';
32
+ const DEFAULT_MODEL = 'gemini-2.5-flash-native-audio-preview-12-2025';
28
33
  const DEFAULT_INPUT_SAMPLE_RATE = 16000;
29
34
 
30
35
  // ─── Service ───────────────────────────────────────────────────
31
36
 
32
37
  export class VoiceService {
33
- ws = null;
38
+ session = null;
34
39
  callbacks = {};
35
- setupComplete = false;
40
+ lastCallbacks = null;
36
41
  _status = 'disconnected';
42
+ intentionalDisconnect = false;
37
43
  constructor(config) {
38
44
  this.config = config;
39
45
  }
40
46
 
41
47
  // ─── Connection ────────────────────────────────────────────
42
48
 
43
- connect(callbacks) {
44
- if (this.ws?.readyState === WebSocket.OPEN) {
49
+ /**
50
+ * Connect to Gemini Live API via the official SDK.
51
+ * Now async because `ai.live.connect()` returns a Promise.
52
+ */
53
+ async connect(callbacks) {
54
+ if (this.session) {
45
55
  logger.info('VoiceService', 'Already connected');
46
56
  return;
47
57
  }
48
58
  this.callbacks = callbacks;
59
+ this.lastCallbacks = callbacks;
49
60
  this.setStatus('connecting');
61
+ this.intentionalDisconnect = false;
50
62
  const model = this.config.model || DEFAULT_MODEL;
51
- const url = `wss://${WS_HOST}${WS_PATH}?key=${this.config.apiKey}`;
52
- logger.info('VoiceService', `Connecting to Gemini Live API (model: ${model})`);
53
- this.ws = new WebSocket(url);
54
- this.ws.onopen = () => {
55
- logger.info('VoiceService', 'WebSocket connected, sending setup...');
56
- this.sendSetup();
57
- };
58
- this.ws.onclose = event => {
59
- logger.info('VoiceService', `WebSocket closed: ${event.code} ${event.reason}`);
60
- this.setStatus('disconnected');
61
- this.setupComplete = false;
62
- };
63
- this.ws.onerror = error => {
64
- logger.error('VoiceService', `WebSocket error: ${error.message || 'Unknown'}`);
63
+ logger.info('VoiceService', `Connecting via SDK (model: ${model})`);
64
+ try {
65
+ const ai = new GoogleGenAI({
66
+ apiKey: this.config.apiKey
67
+ });
68
+ const toolDeclarations = this.buildToolDeclarations();
69
+
70
+ // Build SDK config matching the official docs pattern
71
+ const sdkConfig = {
72
+ responseModalities: [Modality.AUDIO]
73
+ };
74
+
75
+ // Enable transcription for debugging and UX
76
+ sdkConfig.inputAudioTranscription = {};
77
+ sdkConfig.outputAudioTranscription = {};
78
+ logger.info('VoiceService', 'Transcription enabled');
79
+ if (this.config.systemPrompt) {
80
+ sdkConfig.systemInstruction = {
81
+ parts: [{
82
+ text: this.config.systemPrompt
83
+ }]
84
+ };
85
+ }
86
+ if (toolDeclarations.length > 0) {
87
+ sdkConfig.tools = [{
88
+ functionDeclarations: toolDeclarations
89
+ }];
90
+ }
91
+
92
+ // FULL CONFIG DUMP — see exactly what we send to SDK
93
+ const configDump = JSON.stringify({
94
+ ...sdkConfig,
95
+ systemInstruction: sdkConfig.systemInstruction ? '(present)' : '(none)',
96
+ tools: sdkConfig.tools ? `${toolDeclarations.length} declarations` : '(none)'
97
+ });
98
+ logger.info('VoiceService', `📋 SDK config: ${configDump}`);
99
+ logger.info('VoiceService', `📋 Tool names: ${toolDeclarations.map(t => t.name).join(', ')}`);
100
+ const session = await ai.live.connect({
101
+ model: model,
102
+ config: sdkConfig,
103
+ callbacks: {
104
+ onopen: () => {
105
+ logger.info('VoiceService', '✅ SDK session connected');
106
+ this.setStatus('connected');
107
+ },
108
+ onmessage: message => {
109
+ this.handleSDKMessage(message);
110
+ },
111
+ onerror: error => {
112
+ const errDetail = error ? JSON.stringify(error, Object.getOwnPropertyNames(error)).substring(0, 500) : 'null';
113
+ logger.error('VoiceService', `SDK error: ${errDetail}`);
114
+ this.setStatus('error');
115
+ this.callbacks.onError?.(error?.message || 'SDK connection error');
116
+ },
117
+ onclose: event => {
118
+ const closeDetail = event ? JSON.stringify(event, Object.getOwnPropertyNames(event)).substring(0, 500) : 'null';
119
+ if (this.intentionalDisconnect) {
120
+ logger.info('VoiceService', `SDK session closed (intentional)`);
121
+ } else {
122
+ logger.error('VoiceService', `SDK session closed UNEXPECTEDLY — code: ${event?.code}, reason: ${event?.reason}, detail: ${closeDetail}`);
123
+ this.callbacks.onError?.(`Connection lost (code: ${event?.code || 'unknown'})`);
124
+ }
125
+ this.session = null;
126
+ this.setStatus('disconnected');
127
+ }
128
+ }
129
+ });
130
+ this.session = session;
131
+ logger.info('VoiceService', 'SDK session established');
132
+ } catch (error) {
133
+ logger.error('VoiceService', `Connection failed: ${error.message}`);
65
134
  this.setStatus('error');
66
- this.callbacks.onError?.(error.message || 'WebSocket connection error');
67
- };
68
- this.ws.onmessage = event => {
69
- this.handleMessage(event);
70
- };
135
+ this.callbacks.onError?.(error.message || 'Failed to connect');
136
+ }
71
137
  }
72
138
  disconnect() {
73
- if (this.ws) {
74
- logger.info('VoiceService', 'Disconnecting...');
75
- this.ws.close();
76
- this.ws = null;
77
- this.setupComplete = false;
139
+ if (this.session) {
140
+ logger.info('VoiceService', 'Disconnecting (intentional)...');
141
+ this.intentionalDisconnect = true;
142
+ this.session.close();
143
+ this.session = null;
78
144
  this.setStatus('disconnected');
79
145
  }
80
146
  }
81
147
  get isConnected() {
82
- return this.ws?.readyState === WebSocket.OPEN && this.setupComplete;
148
+ return this.session !== null && this._status === 'connected';
83
149
  }
84
150
  get currentStatus() {
85
151
  return this._status;
@@ -87,298 +153,246 @@ export class VoiceService {
87
153
 
88
154
  // ─── Send Audio ────────────────────────────────────────────
89
155
 
90
- /** Send PCM audio chunk (base64 encoded) to Gemini */
156
+ /** Send PCM audio chunk (base64 encoded) via SDK's sendRealtimeInput */
91
157
  sendCount = 0;
92
158
  sendAudio(base64Audio) {
93
159
  this.sendCount++;
94
- if (!this.isConnected) {
95
- logger.warn('VoiceService', `sendAudio #${this.sendCount} DROPPED not connected (ws=${this.ws?.readyState}, setup=${this.setupComplete})`);
160
+ if (!this.isConnected || !this.session) {
161
+ if (this.sendCount % 20 === 0) {
162
+ logger.warn('VoiceService', `sendAudio #${this.sendCount} DROPPED — not connected`);
163
+ }
96
164
  return;
97
165
  }
98
- const message = {
99
- realtimeInput: {
166
+ const mimeType = `audio/pcm;rate=${this.config.inputSampleRate || DEFAULT_INPUT_SAMPLE_RATE}`;
167
+
168
+ // DEBUG: log every send call
169
+ if (this.sendCount <= 5 || this.sendCount % 10 === 0) {
170
+ logger.info('VoiceService', `📡 sendAudio #${this.sendCount}: len=${base64Audio.length}, mime=${mimeType}, preview=${base64Audio.substring(0, 30)}...`);
171
+ }
172
+ try {
173
+ this.session.sendRealtimeInput({
100
174
  audio: {
101
- mimeType: `audio/pcm;rate=${this.config.inputSampleRate || DEFAULT_INPUT_SAMPLE_RATE}`,
102
- data: base64Audio
175
+ data: base64Audio,
176
+ mimeType
103
177
  }
178
+ });
179
+ // Log every 50th successful send to confirm data is reaching WebSocket
180
+ if (this.sendCount % 50 === 0) {
181
+ logger.info('VoiceService', `✅ sendAudio #${this.sendCount} OK — session.isOpen=${!!this.session}`);
104
182
  }
105
- };
106
- logger.info('VoiceService', `📤 #${this.sendCount} sending ${base64Audio.length} chars (ws=${this.ws?.readyState})`);
107
- this.ws.send(JSON.stringify(message));
183
+ } catch (error) {
184
+ logger.error('VoiceService', `❌ sendAudio EXCEPTION: ${error.message}\n${error.stack?.substring(0, 300)}`);
185
+ this.session = null;
186
+ this.setStatus('disconnected');
187
+ }
108
188
  }
109
189
 
110
190
  // ─── Send Text ─────────────────────────────────────────────
111
191
 
112
- /** Send text message via realtimeInput (same channel as audio) */
192
+ /** Send text message via SDK's sendClientContent */
113
193
  sendText(text) {
114
- if (!this.isConnected) return;
115
- const message = {
116
- realtimeInput: {
117
- text
118
- }
119
- };
120
- this.ws.send(JSON.stringify(message));
194
+ if (!this.isConnected || !this.session) return;
195
+ logger.info('VoiceService', `🗣️ USER (text): "${text}"`);
196
+ try {
197
+ this.session.sendClientContent({
198
+ turns: [{
199
+ role: 'user',
200
+ parts: [{
201
+ text
202
+ }]
203
+ }],
204
+ turnComplete: true
205
+ });
206
+ } catch (error) {
207
+ logger.error('VoiceService', `sendText failed: ${error.message}`);
208
+ }
121
209
  }
122
210
 
123
- /** Send DOM tree as passive context during live conversation.
124
- *
125
- * Uses `clientContent` with `turnComplete: false` to inject context
126
- * WITHOUT triggering a model response. This is the "incremental content
127
- * updates" pattern from the Gemini docs for establishing session context.
128
- *
129
- * Called once at connect + after each tool call (not on a timer).
130
- * Screenshots are handled separately via the capture_screenshot tool.
211
+ /**
212
+ * Send DOM tree as passive context during live conversation.
213
+ * Uses turnComplete: false the model receives context without responding.
131
214
  */
132
215
  sendScreenContext(domText) {
133
- if (!this.isConnected) return;
134
- const message = {
135
- clientContent: {
216
+ if (!this.isConnected || !this.session) return;
217
+ try {
218
+ this.session.sendClientContent({
136
219
  turns: [{
137
220
  role: 'user',
138
221
  parts: [{
139
222
  text: domText
140
223
  }]
141
224
  }],
142
- turnComplete: false // Passive context — don't trigger a response
143
- }
144
- };
145
- logger.debug('VoiceService', `📤 Screen context sent (${domText.length} chars)`);
146
- logger.debug('VoiceService', `📤 Raw Screen Context Payload: ${JSON.stringify(message).substring(0, 500)}...`);
147
- this.ws.send(JSON.stringify(message));
225
+ turnComplete: true
226
+ });
227
+ logger.info('VoiceService', `📤 Screen context sent (${domText.length} chars)`);
228
+ } catch (error) {
229
+ logger.error('VoiceService', `sendScreenContext failed: ${error.message}`);
230
+ }
148
231
  }
149
232
 
150
233
  // ─── Send Function Response ────────────────────────────────
151
234
 
152
- /** Send function call result back to Gemini */
235
+ /** Send function call result back via SDK's sendToolResponse */
153
236
  sendFunctionResponse(name, id, result) {
154
- if (!this.isConnected) return;
155
- const message = {
156
- toolResponse: {
237
+ if (!this.isConnected || !this.session) return;
238
+ logger.info('VoiceService', `📤 Sending tool response for ${name} (id=${id})`);
239
+ try {
240
+ this.session.sendToolResponse({
157
241
  functionResponses: [{
158
242
  name,
159
243
  id,
160
244
  response: result
161
245
  }]
162
- }
163
- };
164
- logger.info('VoiceService', `📤 Sending tool response for ${name} (id=${id})`);
165
- this.ws.send(JSON.stringify(message));
246
+ });
247
+ } catch (error) {
248
+ logger.error('VoiceService', `sendFunctionResponse failed: ${error.message}`);
249
+ }
166
250
  }
167
251
 
168
- // ─── Internal: Setup ───────────────────────────────────────
252
+ // ─── Internal: Tool Declarations ───────────────────────────
169
253
 
170
254
  /**
171
- * Builds and sends the setup message, replicating text mode's agent_step
172
- * compound tool so the model uses structured reasoning + actions.
173
- *
174
- * The agent_step tool flattens reasoning fields (previous_goal_eval,
175
- * memory, plan) + action_name enum + all action parameters into a single
176
- * function — matching GeminiProvider.buildAgentStepDeclaration exactly.
255
+ * Builds function declarations from configured tools.
256
+ * Converts BOOLEAN params to STRING (native audio model limitation).
177
257
  */
178
- sendSetup() {
179
- if (!this.ws || this.ws.readyState !== WebSocket.OPEN) return;
180
- const model = this.config.model || DEFAULT_MODEL;
181
- const setup = {
182
- model: `models/${model}`,
183
- generationConfig: {
184
- responseModalities: ['AUDIO']
185
- // Note: Do NOT set thinkingBudget: 0 — it completely disables
186
- // the model's ability to reason about when to call tools.
187
- // The text thinking blocks are a trade-off for working function calling.
188
- }
189
- };
190
-
191
- // Add system instruction if provided
192
- if (this.config.systemPrompt) {
193
- setup.systemInstruction = {
194
- parts: [{
195
- text: this.config.systemPrompt
196
- }]
258
+ buildToolDeclarations() {
259
+ if (!this.config.tools?.length) return [];
260
+ const validTools = this.config.tools.filter(t => t.name !== 'capture_screenshot');
261
+ if (validTools.length === 0) return [];
262
+ return validTools.map(tool => {
263
+ const hasParams = Object.keys(tool.parameters || {}).length > 0;
264
+ const functionDecl = {
265
+ name: tool.name,
266
+ description: tool.description
197
267
  };
198
- }
199
-
200
- // Add individual tool declarations for function calling
201
- // NOTE: We use individual tools (tap, type, navigate, done, ask_user)
202
- // instead of the compound agent_step used in text mode.
203
- // The native audio model in real-time can call simple tools but struggles
204
- // with the complex agent_step schema (it speaks about calling tools
205
- // instead of actually calling them).
206
- if (this.config.tools?.length) {
207
- const validTools = this.config.tools.filter(t => t.name !== 'capture_screenshot');
208
- if (validTools.length > 0) {
209
- setup.tools = [{
210
- functionDeclarations: validTools.map(tool => {
211
- const hasParams = Object.keys(tool.parameters || {}).length > 0;
212
- const functionDecl = {
213
- name: tool.name,
214
- description: tool.description
215
- };
216
- if (hasParams) {
217
- functionDecl.parameters = {
218
- type: 'OBJECT',
219
- properties: Object.fromEntries(Object.entries(tool.parameters).map(([key, param]) => {
220
- // Native audio model crashes with BOOLEAN/ENUM types (error 1008)
221
- // Convert to STRING as a workaround
222
- let paramType = param.type.toUpperCase();
223
- let desc = param.description;
224
- if (paramType === 'BOOLEAN') {
225
- paramType = 'STRING';
226
- desc = `${desc} (use "true" or "false")`;
227
- }
228
- return [key, {
229
- type: paramType,
230
- description: desc
231
- }];
232
- })),
233
- required: Object.entries(tool.parameters).filter(([, param]) => param.required).map(([key]) => key)
234
- };
268
+ if (hasParams) {
269
+ functionDecl.parameters = {
270
+ type: 'OBJECT',
271
+ properties: Object.fromEntries(Object.entries(tool.parameters).map(([key, param]) => {
272
+ let paramType = param.type.toUpperCase();
273
+ let desc = param.description;
274
+ if (paramType === 'BOOLEAN') {
275
+ paramType = 'STRING';
276
+ desc = `${desc} (use "true" or "false")`;
235
277
  }
236
- return functionDecl;
237
- })
238
- }];
278
+ return [key, {
279
+ type: paramType,
280
+ description: desc
281
+ }];
282
+ })),
283
+ required: Object.entries(tool.parameters).filter(([, param]) => param.required).map(([key]) => key)
284
+ };
239
285
  }
240
- }
241
- const setupMessage = {
242
- setup
243
- };
244
- logger.info('VoiceService', `Sending setup (model: ${model}, ${this.config.tools?.length || 0} tools)`);
245
- try {
246
- const payload = JSON.stringify(setupMessage);
247
- logger.info('VoiceService', `📤 Raw Setup Payload: ${payload}`);
248
- this.ws.send(payload);
249
- } catch (err) {
250
- logger.error('VoiceService', `❌ Error stringifying setup message: ${err.message}`);
251
- }
286
+ return functionDecl;
287
+ });
252
288
  }
253
289
 
254
290
  // ─── Internal: Message Handling ────────────────────────────
255
291
 
256
- handleMessage(event) {
292
+ /**
293
+ * Handle messages from the SDK's onmessage callback.
294
+ * The SDK parses binary/JSON automatically — we get clean objects.
295
+ *
296
+ * Per official docs, tool calls come at the top level as
297
+ * `response.toolCall.functionCalls`.
298
+ */
299
+ handleSDKMessage(message) {
257
300
  try {
258
- const dataType = typeof event.data;
259
- const dataLen = typeof event.data === 'string' ? event.data.length : event.data?.byteLength || 'unknown';
260
- logger.info('VoiceService', `📥 WS message received: type=${dataType}, length=${dataLen}`);
261
-
262
- // Handle binary data (could be JSON or raw PCM)
263
- if (typeof event.data !== 'string') {
264
- logger.info('VoiceService', '📥 Binary message — processing...');
265
- this.handleBinaryMessage(event.data);
266
- return;
301
+ // RAW MESSAGE DUMP — full session visibility
302
+ const msgKeys = Object.keys(message || {}).join(', ');
303
+ logger.info('VoiceService', `📨 SDK message keys: [${msgKeys}]`);
304
+
305
+ // Full raw dump for non-audio messages (audio is too large)
306
+ if (!message.serverContent?.modelTurn?.parts?.some(p => p.inlineData)) {
307
+ const rawDump = JSON.stringify(message).substring(0, 1000);
308
+ logger.info('VoiceService', `📨 RAW: ${rawDump}`);
267
309
  }
268
310
 
269
- // Handle JSON text messages
270
- const message = JSON.parse(event.data);
271
- logger.info('VoiceService', `📥 JSON message keys: ${Object.keys(message).join(', ')}`);
272
- logger.info('VoiceService', `📥 Raw JSON Message: ${event.data.substring(0, 1000)}`);
273
- this.processMessage(message);
274
- } catch (error) {
275
- logger.error('VoiceService', `Error handling message: ${error.message}`);
276
- }
277
- }
278
- handleBinaryMessage(data) {
279
- try {
280
- // Try to decode as JSON first
281
- let bytes;
282
- if (data instanceof ArrayBuffer) {
283
- bytes = new Uint8Array(data);
284
- } else if (data instanceof Blob) {
285
- // Blob handling — read as ArrayBuffer
286
- const reader = new FileReader();
287
- reader.onload = () => {
288
- if (reader.result instanceof ArrayBuffer) {
289
- this.processBinaryBytes(new Uint8Array(reader.result));
290
- }
291
- };
292
- reader.readAsArrayBuffer(data);
293
- return;
294
- } else {
311
+ // Tool calls top-level (per official docs)
312
+ if (message.toolCall?.functionCalls) {
313
+ this.handleToolCalls(message.toolCall.functionCalls);
295
314
  return;
296
315
  }
297
- this.processBinaryBytes(bytes);
316
+
317
+ // Server content (audio, text, transcripts, turn events)
318
+ if (message.serverContent) {
319
+ this.handleServerContent(message.serverContent);
320
+ }
321
+
322
+ // Setup complete acknowledgment
323
+ if (message.setupComplete !== undefined) {
324
+ logger.info('VoiceService', '✅ Setup complete — ready for audio');
325
+ this.callbacks.onSetupComplete?.();
326
+ }
327
+
328
+ // Error messages
329
+ if (message.error) {
330
+ logger.error('VoiceService', `Server error: ${JSON.stringify(message.error)}`);
331
+ this.callbacks.onError?.(message.error.message || 'Server error');
332
+ }
298
333
  } catch (error) {
299
- logger.error('VoiceService', `Error handling binary message: ${error.message}`);
334
+ logger.error('VoiceService', `Error handling SDK message: ${error.message}`);
300
335
  }
301
336
  }
302
- processBinaryBytes(bytes) {
303
- // Check if it looks like JSON (starts with '{' or '[')
304
- const looksLikeJson = bytes.length > 0 && (bytes[0] === 123 || bytes[0] === 91);
305
- if (looksLikeJson) {
306
- try {
307
- const text = new TextDecoder('utf-8').decode(bytes);
308
- const message = JSON.parse(text);
309
- this.processMessage(message);
310
- } catch {
311
- // Not JSON — treat as raw PCM audio
312
- this.callbacks.onAudioResponse?.(this.arrayBufferToBase64(bytes.buffer));
313
- }
314
- } else {
315
- // Raw PCM audio data
316
- this.callbacks.onAudioResponse?.(this.arrayBufferToBase64(bytes.buffer));
337
+
338
+ /** Process tool calls from the model */
339
+ handleToolCalls(functionCalls) {
340
+ for (const fn of functionCalls) {
341
+ logger.info('VoiceService', `🎯 Tool call: ${fn.name}(${JSON.stringify(fn.args)}) [id=${fn.id}]`);
342
+ this.callbacks.onToolCall?.({
343
+ name: fn.name,
344
+ args: fn.args || {},
345
+ id: fn.id
346
+ });
317
347
  }
318
348
  }
319
- processMessage(message) {
320
- // Setup complete acknowledgment
321
- if (message.setupComplete !== undefined) {
322
- logger.info('VoiceService', '✅ Setup complete — ready for audio exchange');
323
- this.setupComplete = true;
324
- this.setStatus('connected');
325
- return;
349
+ audioResponseCount = 0;
350
+
351
+ /** Process server content (audio responses, transcripts, turn events) */
352
+ handleServerContent(content) {
353
+ // Log all keys for full visibility
354
+ const contentKeys = Object.keys(content || {}).join(', ');
355
+ logger.debug('VoiceService', `📦 serverContent keys: [${contentKeys}]`);
356
+
357
+ // Turn complete
358
+ if (content.turnComplete) {
359
+ logger.info('VoiceService', `🏁 Turn complete (audioChunks sent: ${this.audioResponseCount})`);
360
+ this.audioResponseCount = 0;
361
+ this.callbacks.onTurnComplete?.();
326
362
  }
327
363
 
328
- // Server content (audio response + transcripts)
329
- if (message.serverContent) {
330
- const content = message.serverContent;
331
- logger.info('VoiceService', `📥 serverContent received — turnComplete=${content.turnComplete}, hasParts=${!!content.modelTurn?.parts}, inputTranscription=${!!content.inputTranscription}, outputTranscription=${!!content.outputTranscription}`);
332
-
333
- // Check for turn complete
334
- if (content.turnComplete) {
335
- this.callbacks.onTurnComplete?.();
336
- }
337
-
338
- // Process model output parts
339
- if (content.modelTurn?.parts) {
340
- for (const part of content.modelTurn.parts) {
341
- // Audio response
342
- if (part.inlineData?.data) {
343
- logger.info('VoiceService', `🔊 Audio response: ${part.inlineData.data.length} chars`);
344
- this.callbacks.onAudioResponse?.(part.inlineData.data);
345
- }
346
-
347
- // Text response (transcript)
348
- if (part.text) {
349
- logger.info('VoiceService', `💬 Text response: "${part.text}"`);
350
- this.callbacks.onTranscript?.(part.text, true, 'model');
364
+ // Model output parts (audio + optional thinking text)
365
+ if (content.modelTurn?.parts) {
366
+ for (const part of content.modelTurn.parts) {
367
+ if (part.inlineData?.data) {
368
+ this.audioResponseCount++;
369
+ if (this.audioResponseCount <= 3 || this.audioResponseCount % 20 === 0) {
370
+ logger.info('VoiceService', `🔊 Audio chunk #${this.audioResponseCount}: ${part.inlineData.data.length} b64 chars, mime=${part.inlineData.mimeType || 'unknown'}`);
351
371
  }
372
+ this.callbacks.onAudioResponse?.(part.inlineData.data);
373
+ }
374
+ if (part.text) {
375
+ logger.info('VoiceService', `🤖 MODEL: "${part.text}"`);
376
+ this.callbacks.onTranscript?.(part.text, true, 'model');
352
377
  }
353
378
  }
379
+ }
354
380
 
355
- // Input transcription (user's speech)
356
- if (content.inputTranscription?.text) {
357
- this.callbacks.onTranscript?.(content.inputTranscription.text, true, 'user');
358
- }
359
-
360
- // Output transcription (model's speech-to-text)
361
- if (content.outputTranscription?.text) {
362
- this.callbacks.onTranscript?.(content.outputTranscription.text, true, 'model');
363
- }
381
+ // Input transcription (user's speech-to-text)
382
+ if (content.inputTranscription?.text) {
383
+ logger.info('VoiceService', `🗣️ USER (voice): "${content.inputTranscription.text}"`);
384
+ this.callbacks.onTranscript?.(content.inputTranscription.text, true, 'user');
364
385
  }
365
386
 
366
- // Tool calls from the model
367
- if (message.toolCall?.functionCalls) {
368
- for (const fn of message.toolCall.functionCalls) {
369
- logger.info('VoiceService', `🎯 Tool call: ${fn.name}(${JSON.stringify(fn.args)})`);
370
- this.callbacks.onToolCall?.({
371
- name: fn.name,
372
- args: fn.args || {},
373
- id: fn.id
374
- });
375
- }
387
+ // Output transcription (model's speech-to-text)
388
+ if (content.outputTranscription?.text) {
389
+ logger.info('VoiceService', `🤖 MODEL (voice): "${content.outputTranscription.text}"`);
390
+ this.callbacks.onTranscript?.(content.outputTranscription.text, true, 'model');
376
391
  }
377
392
 
378
- // Error messages
379
- if (message.error) {
380
- logger.error('VoiceService', `Server error: ${JSON.stringify(message.error)}`);
381
- this.callbacks.onError?.(message.error.message || 'Server error');
393
+ // Tool calls inside serverContent (some SDK versions deliver here)
394
+ if (content.toolCall?.functionCalls) {
395
+ this.handleToolCalls(content.toolCall.functionCalls);
382
396
  }
383
397
  }
384
398
 
@@ -388,13 +402,5 @@ export class VoiceService {
388
402
  this._status = newStatus;
389
403
  this.callbacks.onStatusChange?.(newStatus);
390
404
  }
391
- arrayBufferToBase64(buffer) {
392
- const bytes = new Uint8Array(buffer);
393
- let binary = '';
394
- for (let i = 0; i < bytes.byteLength; i++) {
395
- binary += String.fromCharCode(bytes[i]);
396
- }
397
- return btoa(binary);
398
- }
399
405
  }
400
406
  //# sourceMappingURL=VoiceService.js.map