react-native-agentic-ai 0.4.6 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. package/README.md +80 -4
  2. package/lib/module/components/AIAgent.js +179 -38
  3. package/lib/module/components/AIAgent.js.map +1 -1
  4. package/lib/module/components/AgentChatBar.js +53 -29
  5. package/lib/module/components/AgentChatBar.js.map +1 -1
  6. package/lib/module/components/Icons.js +337 -0
  7. package/lib/module/components/Icons.js.map +1 -0
  8. package/lib/module/core/AgentRuntime.js +74 -3
  9. package/lib/module/core/AgentRuntime.js.map +1 -1
  10. package/lib/module/core/systemPrompt.js +66 -39
  11. package/lib/module/core/systemPrompt.js.map +1 -1
  12. package/lib/module/index.js +3 -9
  13. package/lib/module/index.js.map +1 -1
  14. package/lib/module/services/AudioInputService.js +73 -2
  15. package/lib/module/services/AudioInputService.js.map +1 -1
  16. package/lib/module/services/AudioOutputService.js +58 -5
  17. package/lib/module/services/AudioOutputService.js.map +1 -1
  18. package/lib/module/services/VoiceService.js +281 -275
  19. package/lib/module/services/VoiceService.js.map +1 -1
  20. package/lib/typescript/src/components/AIAgent.d.ts.map +1 -1
  21. package/lib/typescript/src/components/AgentChatBar.d.ts.map +1 -1
  22. package/lib/typescript/src/components/Icons.d.ts +43 -0
  23. package/lib/typescript/src/components/Icons.d.ts.map +1 -0
  24. package/lib/typescript/src/core/AgentRuntime.d.ts +12 -0
  25. package/lib/typescript/src/core/AgentRuntime.d.ts.map +1 -1
  26. package/lib/typescript/src/core/systemPrompt.d.ts.map +1 -1
  27. package/lib/typescript/src/index.d.ts +4 -0
  28. package/lib/typescript/src/index.d.ts.map +1 -1
  29. package/lib/typescript/src/services/AudioInputService.d.ts +13 -0
  30. package/lib/typescript/src/services/AudioInputService.d.ts.map +1 -1
  31. package/lib/typescript/src/services/AudioOutputService.d.ts.map +1 -1
  32. package/lib/typescript/src/services/VoiceService.d.ts +38 -29
  33. package/lib/typescript/src/services/VoiceService.d.ts.map +1 -1
  34. package/package.json +1 -1
  35. package/src/components/AIAgent.tsx +192 -39
  36. package/src/components/AgentChatBar.tsx +44 -25
  37. package/src/components/Icons.tsx +253 -0
  38. package/src/core/AgentRuntime.ts +70 -3
  39. package/src/core/systemPrompt.ts +66 -39
  40. package/src/index.ts +8 -8
  41. package/src/services/AudioInputService.ts +77 -2
  42. package/src/services/AudioOutputService.ts +59 -5
  43. package/src/services/VoiceService.ts +278 -290
@@ -1,15 +1,26 @@
1
1
  /**
2
- * VoiceService — WebSocket connection to Gemini Live API.
2
+ * VoiceService — @google/genai SDK Live API connection.
3
+ *
4
+ * Uses the official `ai.live.connect()` method instead of raw WebSocket.
5
+ * This fixes function calling reliability: the SDK handles protocol details
6
+ * (binary framing, message transforms, model name prefixes) that our
7
+ * previous raw WebSocket implementation missed.
3
8
  *
4
9
  * Handles bidirectional audio streaming between the app and Gemini:
5
10
  * - Sends PCM 16kHz 16-bit audio chunks (mic input)
6
11
  * - Receives PCM 24kHz 16-bit audio chunks (AI responses)
7
12
  * - Receives function calls (tap, navigate, etc.) for agentic actions
8
- * - Sends screen context (DOM text + optional screenshot) for live mode
9
- *
10
- * Protocol: wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent
13
+ * - Sends screen context (DOM text) for live mode
11
14
  */
12
15
 
16
+ // Platform-specific import: Metro can't resolve '@google/genai/web' sub-path
17
+ // export, so we use the full path to the web bundle. This is what the SDK
18
+ // recommends ('use a platform specific import') — RN's WebSocket API is
19
+ // browser-compatible so the web bundle works correctly.
20
+ // @ts-ignore — TS can't find declarations for the deep path
21
+ import { GoogleGenAI, Modality } from '@google/genai/dist/web/index.mjs';
22
+ // @ts-ignore
23
+ import type { Session } from '@google/genai/dist/web/index.mjs';
13
24
  import { logger } from '../utils/logger';
14
25
  import type { ToolDefinition } from '../core/types';
15
26
 
@@ -34,29 +45,26 @@ export interface VoiceServiceCallbacks {
34
45
  onError?: (error: string) => void;
35
46
  /** Called when AI turn is complete (all audio sent) */
36
47
  onTurnComplete?: () => void;
48
+ /** Called when SDK setup is complete — safe to send screen context */
49
+ onSetupComplete?: () => void;
37
50
  }
38
51
 
39
52
  export type VoiceStatus = 'disconnected' | 'connecting' | 'connected' | 'error';
40
53
 
41
54
  // ─── Constants ─────────────────────────────────────────────────
42
55
 
43
- const WS_HOST = 'generativelanguage.googleapis.com';
44
- const WS_PATH = '/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent';
45
- // Use -09-2025: Google's own cookbook uses this model for Live API tool use.
46
- // The -12-2025 model had server-side regressions with function calling
47
- // and was deprecated March 19, 2026. The -09-2025 version has
48
- // "improved function calling and better handling of speech cut-offs."
49
- const DEFAULT_MODEL = 'gemini-2.5-flash-native-audio-preview-09-2025';
56
+ const DEFAULT_MODEL = 'gemini-2.5-flash-native-audio-preview-12-2025';
50
57
  const DEFAULT_INPUT_SAMPLE_RATE = 16000;
51
58
 
52
59
  // ─── Service ───────────────────────────────────────────────────
53
60
 
54
61
  export class VoiceService {
55
- private ws: WebSocket | null = null;
62
+ private session: Session | null = null;
56
63
  private config: VoiceServiceConfig;
57
64
  private callbacks: VoiceServiceCallbacks = {};
58
- private setupComplete = false;
65
+ public lastCallbacks: VoiceServiceCallbacks | null = null;
59
66
  private _status: VoiceStatus = 'disconnected';
67
+ public intentionalDisconnect = false;
60
68
 
61
69
  constructor(config: VoiceServiceConfig) {
62
70
  this.config = config;
@@ -64,55 +72,115 @@ export class VoiceService {
64
72
 
65
73
  // ─── Connection ────────────────────────────────────────────
66
74
 
67
- connect(callbacks: VoiceServiceCallbacks): void {
68
- if (this.ws?.readyState === WebSocket.OPEN) {
75
+ /**
76
+ * Connect to Gemini Live API via the official SDK.
77
+ * Now async because `ai.live.connect()` returns a Promise.
78
+ */
79
+ async connect(callbacks: VoiceServiceCallbacks): Promise<void> {
80
+ if (this.session) {
69
81
  logger.info('VoiceService', 'Already connected');
70
82
  return;
71
83
  }
72
84
 
73
85
  this.callbacks = callbacks;
86
+ this.lastCallbacks = callbacks;
74
87
  this.setStatus('connecting');
88
+ this.intentionalDisconnect = false;
75
89
 
76
90
  const model = this.config.model || DEFAULT_MODEL;
77
- const url = `wss://${WS_HOST}${WS_PATH}?key=${this.config.apiKey}`;
91
+ logger.info('VoiceService', `Connecting via SDK (model: ${model})`);
78
92
 
79
- logger.info('VoiceService', `Connecting to Gemini Live API (model: ${model})`);
80
- this.ws = new WebSocket(url);
93
+ try {
94
+ const ai = new GoogleGenAI({ apiKey: this.config.apiKey });
81
95
 
82
- this.ws.onopen = () => {
83
- logger.info('VoiceService', 'WebSocket connected, sending setup...');
84
- this.sendSetup();
85
- };
96
+ const toolDeclarations = this.buildToolDeclarations();
86
97
 
87
- this.ws.onclose = (event) => {
88
- logger.info('VoiceService', `WebSocket closed: ${event.code} ${event.reason}`);
89
- this.setStatus('disconnected');
90
- this.setupComplete = false;
91
- };
98
+ // Build SDK config matching the official docs pattern
99
+ const sdkConfig: Record<string, any> = {
100
+ responseModalities: [Modality.AUDIO],
101
+ };
92
102
 
93
- this.ws.onerror = (error: any) => {
94
- logger.error('VoiceService', `WebSocket error: ${error.message || 'Unknown'}`);
95
- this.setStatus('error');
96
- this.callbacks.onError?.(error.message || 'WebSocket connection error');
97
- };
103
+ // Enable transcription for debugging and UX
104
+ sdkConfig.inputAudioTranscription = {};
105
+ sdkConfig.outputAudioTranscription = {};
106
+ logger.info('VoiceService', 'Transcription enabled');
107
+
108
+ if (this.config.systemPrompt) {
109
+ sdkConfig.systemInstruction = {
110
+ parts: [{ text: this.config.systemPrompt }],
111
+ };
112
+ }
98
113
 
99
- this.ws.onmessage = (event) => {
100
- this.handleMessage(event);
101
- };
114
+ if (toolDeclarations.length > 0) {
115
+ sdkConfig.tools = [{ functionDeclarations: toolDeclarations }];
116
+ }
117
+
118
+ // FULL CONFIG DUMP — see exactly what we send to SDK
119
+ const configDump = JSON.stringify({
120
+ ...sdkConfig,
121
+ systemInstruction: sdkConfig.systemInstruction ? '(present)' : '(none)',
122
+ tools: sdkConfig.tools ? `${toolDeclarations.length} declarations` : '(none)',
123
+ });
124
+ logger.info('VoiceService', `📋 SDK config: ${configDump}`);
125
+ logger.info('VoiceService', `📋 Tool names: ${toolDeclarations.map((t: any) => t.name).join(', ')}`);
126
+
127
+ const session = await ai.live.connect({
128
+ model: model,
129
+ config: sdkConfig,
130
+ callbacks: {
131
+ onopen: () => {
132
+ logger.info('VoiceService', '✅ SDK session connected');
133
+ this.setStatus('connected');
134
+ },
135
+ onmessage: (message: any) => {
136
+ this.handleSDKMessage(message);
137
+ },
138
+ onerror: (error: any) => {
139
+ const errDetail = error
140
+ ? JSON.stringify(error, Object.getOwnPropertyNames(error)).substring(0, 500)
141
+ : 'null';
142
+ logger.error('VoiceService', `SDK error: ${errDetail}`);
143
+ this.setStatus('error');
144
+ this.callbacks.onError?.(error?.message || 'SDK connection error');
145
+ },
146
+ onclose: (event: any) => {
147
+ const closeDetail = event
148
+ ? JSON.stringify(event, Object.getOwnPropertyNames(event)).substring(0, 500)
149
+ : 'null';
150
+ if (this.intentionalDisconnect) {
151
+ logger.info('VoiceService', `SDK session closed (intentional)`);
152
+ } else {
153
+ logger.error('VoiceService', `SDK session closed UNEXPECTEDLY — code: ${event?.code}, reason: ${event?.reason}, detail: ${closeDetail}`);
154
+ this.callbacks.onError?.(`Connection lost (code: ${event?.code || 'unknown'})`);
155
+ }
156
+ this.session = null;
157
+ this.setStatus('disconnected');
158
+ },
159
+ },
160
+ });
161
+
162
+ this.session = session;
163
+ logger.info('VoiceService', 'SDK session established');
164
+
165
+ } catch (error: any) {
166
+ logger.error('VoiceService', `Connection failed: ${error.message}`);
167
+ this.setStatus('error');
168
+ this.callbacks.onError?.(error.message || 'Failed to connect');
169
+ }
102
170
  }
103
171
 
104
172
  disconnect(): void {
105
- if (this.ws) {
106
- logger.info('VoiceService', 'Disconnecting...');
107
- this.ws.close();
108
- this.ws = null;
109
- this.setupComplete = false;
173
+ if (this.session) {
174
+ logger.info('VoiceService', 'Disconnecting (intentional)...');
175
+ this.intentionalDisconnect = true;
176
+ this.session.close();
177
+ this.session = null;
110
178
  this.setStatus('disconnected');
111
179
  }
112
180
  }
113
181
 
114
182
  get isConnected(): boolean {
115
- return this.ws?.readyState === WebSocket.OPEN && this.setupComplete;
183
+ return this.session !== null && this._status === 'connected';
116
184
  }
117
185
 
118
186
  get currentStatus(): VoiceStatus {
@@ -121,311 +189,240 @@ export class VoiceService {
121
189
 
122
190
  // ─── Send Audio ────────────────────────────────────────────
123
191
 
124
- /** Send PCM audio chunk (base64 encoded) to Gemini */
192
+ /** Send PCM audio chunk (base64 encoded) via SDK's sendRealtimeInput */
125
193
  private sendCount = 0;
126
194
  sendAudio(base64Audio: string): void {
127
195
  this.sendCount++;
128
- if (!this.isConnected) {
129
- logger.warn('VoiceService', `sendAudio #${this.sendCount} DROPPED not connected (ws=${this.ws?.readyState}, setup=${this.setupComplete})`);
196
+ if (!this.isConnected || !this.session) {
197
+ if (this.sendCount % 20 === 0) {
198
+ logger.warn('VoiceService', `sendAudio #${this.sendCount} DROPPED — not connected`);
199
+ }
130
200
  return;
131
201
  }
132
202
 
133
- const message = {
134
- realtimeInput: {
135
- audio: {
136
- mimeType: `audio/pcm;rate=${this.config.inputSampleRate || DEFAULT_INPUT_SAMPLE_RATE}`,
137
- data: base64Audio,
138
- },
139
- },
140
- };
203
+ const mimeType = `audio/pcm;rate=${this.config.inputSampleRate || DEFAULT_INPUT_SAMPLE_RATE}`;
204
+
205
+ // DEBUG: log every send call
206
+ if (this.sendCount <= 5 || this.sendCount % 10 === 0) {
207
+ logger.info('VoiceService', `📡 sendAudio #${this.sendCount}: len=${base64Audio.length}, mime=${mimeType}, preview=${base64Audio.substring(0, 30)}...`);
208
+ }
141
209
 
142
- logger.info('VoiceService', `📤 #${this.sendCount} sending ${base64Audio.length} chars (ws=${this.ws?.readyState})`);
143
- this.ws!.send(JSON.stringify(message));
210
+ try {
211
+ this.session.sendRealtimeInput({
212
+ audio: { data: base64Audio, mimeType },
213
+ });
214
+ // Log every 50th successful send to confirm data is reaching WebSocket
215
+ if (this.sendCount % 50 === 0) {
216
+ logger.info('VoiceService', `✅ sendAudio #${this.sendCount} OK — session.isOpen=${!!this.session}`);
217
+ }
218
+ } catch (error: any) {
219
+ logger.error('VoiceService', `❌ sendAudio EXCEPTION: ${error.message}\n${error.stack?.substring(0, 300)}`);
220
+ this.session = null;
221
+ this.setStatus('disconnected');
222
+ }
144
223
  }
145
224
 
146
225
  // ─── Send Text ─────────────────────────────────────────────
147
226
 
148
- /** Send text message via realtimeInput (same channel as audio) */
227
+ /** Send text message via SDK's sendClientContent */
149
228
  sendText(text: string): void {
150
- if (!this.isConnected) return;
151
-
152
- const message = {
153
- realtimeInput: { text },
154
- };
229
+ if (!this.isConnected || !this.session) return;
155
230
 
156
- this.ws!.send(JSON.stringify(message));
231
+ logger.info('VoiceService', `🗣️ USER (text): "${text}"`);
232
+ try {
233
+ this.session.sendClientContent({
234
+ turns: [{ role: 'user', parts: [{ text }] }],
235
+ turnComplete: true,
236
+ });
237
+ } catch (error: any) {
238
+ logger.error('VoiceService', `sendText failed: ${error.message}`);
239
+ }
157
240
  }
158
241
 
159
- /** Send DOM tree as passive context during live conversation.
160
- *
161
- * Uses `clientContent` with `turnComplete: false` to inject context
162
- * WITHOUT triggering a model response. This is the "incremental content
163
- * updates" pattern from the Gemini docs for establishing session context.
164
- *
165
- * Called once at connect + after each tool call (not on a timer).
166
- * Screenshots are handled separately via the capture_screenshot tool.
242
+ /**
243
+ * Send DOM tree as passive context during live conversation.
244
+ * Uses turnComplete: false so the model receives context without responding.
167
245
  */
168
246
  sendScreenContext(domText: string): void {
169
- if (!this.isConnected) return;
247
+ if (!this.isConnected || !this.session) return;
170
248
 
171
- const message = {
172
- clientContent: {
249
+ try {
250
+ this.session.sendClientContent({
173
251
  turns: [{ role: 'user', parts: [{ text: domText }] }],
174
- turnComplete: false, // Passive context — don't trigger a response
175
- },
176
- };
177
-
178
- logger.debug('VoiceService', `📤 Screen context sent (${domText.length} chars)`);
179
- logger.debug('VoiceService', `📤 Raw Screen Context Payload: ${JSON.stringify(message).substring(0, 500)}...`);
180
- this.ws!.send(JSON.stringify(message));
252
+ turnComplete: true,
253
+ });
254
+ logger.info('VoiceService', `📤 Screen context sent (${domText.length} chars)`);
255
+ } catch (error: any) {
256
+ logger.error('VoiceService', `sendScreenContext failed: ${error.message}`);
257
+ }
181
258
  }
182
259
 
183
260
  // ─── Send Function Response ────────────────────────────────
184
261
 
185
- /** Send function call result back to Gemini */
262
+ /** Send function call result back via SDK's sendToolResponse */
186
263
  sendFunctionResponse(name: string, id: string, result: any): void {
187
- if (!this.isConnected) return;
188
-
189
- const message = {
190
- toolResponse: {
191
- functionResponses: [{
192
- name,
193
- id,
194
- response: result,
195
- }],
196
- },
197
- };
264
+ if (!this.isConnected || !this.session) return;
198
265
 
199
266
  logger.info('VoiceService', `📤 Sending tool response for ${name} (id=${id})`);
200
- this.ws!.send(JSON.stringify(message));
267
+
268
+ try {
269
+ this.session.sendToolResponse({
270
+ functionResponses: [{ name, id, response: result }],
271
+ });
272
+ } catch (error: any) {
273
+ logger.error('VoiceService', `sendFunctionResponse failed: ${error.message}`);
274
+ }
201
275
  }
202
276
 
203
- // ─── Internal: Setup ───────────────────────────────────────
277
+ // ─── Internal: Tool Declarations ───────────────────────────
204
278
 
205
279
  /**
206
- * Builds and sends the setup message, replicating text mode's agent_step
207
- * compound tool so the model uses structured reasoning + actions.
208
- *
209
- * The agent_step tool flattens reasoning fields (previous_goal_eval,
210
- * memory, plan) + action_name enum + all action parameters into a single
211
- * function — matching GeminiProvider.buildAgentStepDeclaration exactly.
280
+ * Builds function declarations from configured tools.
281
+ * Converts BOOLEAN params to STRING (native audio model limitation).
212
282
  */
213
- private sendSetup(): void {
214
- if (!this.ws || this.ws.readyState !== WebSocket.OPEN) return;
283
+ private buildToolDeclarations(): any[] {
284
+ if (!this.config.tools?.length) return [];
215
285
 
216
- const model = this.config.model || DEFAULT_MODEL;
286
+ const validTools = this.config.tools.filter(t => t.name !== 'capture_screenshot');
287
+ if (validTools.length === 0) return [];
217
288
 
218
- const setup: any = {
219
- model: `models/${model}`,
220
- generationConfig: {
221
- responseModalities: ['AUDIO'],
222
- // Note: Do NOT set thinkingBudget: 0 — it completely disables
223
- // the model's ability to reason about when to call tools.
224
- // The text thinking blocks are a trade-off for working function calling.
225
- },
226
- };
227
-
228
- // Add system instruction if provided
229
- if (this.config.systemPrompt) {
230
- setup.systemInstruction = {
231
- parts: [{ text: this.config.systemPrompt }],
289
+ return validTools.map(tool => {
290
+ const hasParams = Object.keys(tool.parameters || {}).length > 0;
291
+ const functionDecl: any = {
292
+ name: tool.name,
293
+ description: tool.description,
232
294
  };
233
- }
234
295
 
235
- // Add individual tool declarations for function calling
236
- // NOTE: We use individual tools (tap, type, navigate, done, ask_user)
237
- // instead of the compound agent_step used in text mode.
238
- // The native audio model in real-time can call simple tools but struggles
239
- // with the complex agent_step schema (it speaks about calling tools
240
- // instead of actually calling them).
241
- if (this.config.tools?.length) {
242
- const validTools = this.config.tools.filter(t => t.name !== 'capture_screenshot');
243
- if (validTools.length > 0) {
244
- setup.tools = [{
245
- functionDeclarations: validTools.map(tool => {
246
- const hasParams = Object.keys(tool.parameters || {}).length > 0;
247
- const functionDecl: any = {
248
- name: tool.name,
249
- description: tool.description,
250
- };
251
-
252
- if (hasParams) {
253
- functionDecl.parameters = {
254
- type: 'OBJECT',
255
- properties: Object.fromEntries(
256
- Object.entries(tool.parameters).map(([key, param]) => {
257
- // Native audio model crashes with BOOLEAN/ENUM types (error 1008)
258
- // Convert to STRING as a workaround
259
- let paramType = param.type.toUpperCase();
260
- let desc = param.description;
261
- if (paramType === 'BOOLEAN') {
262
- paramType = 'STRING';
263
- desc = `${desc} (use "true" or "false")`;
264
- }
265
- return [
266
- key,
267
- {
268
- type: paramType,
269
- description: desc,
270
- },
271
- ];
272
- })
273
- ),
274
- required: Object.entries(tool.parameters)
275
- .filter(([, param]) => param.required)
276
- .map(([key]) => key),
277
- };
278
- }
279
- return functionDecl;
280
- }),
281
- }];
296
+ if (hasParams) {
297
+ functionDecl.parameters = {
298
+ type: 'OBJECT',
299
+ properties: Object.fromEntries(
300
+ Object.entries(tool.parameters).map(([key, param]) => {
301
+ let paramType = param.type.toUpperCase();
302
+ let desc = param.description;
303
+ if (paramType === 'BOOLEAN') {
304
+ paramType = 'STRING';
305
+ desc = `${desc} (use "true" or "false")`;
306
+ }
307
+ return [key, { type: paramType, description: desc }];
308
+ })
309
+ ),
310
+ required: Object.entries(tool.parameters)
311
+ .filter(([, param]) => param.required)
312
+ .map(([key]) => key),
313
+ };
282
314
  }
283
- }
284
-
285
- const setupMessage = { setup };
286
- logger.info('VoiceService', `Sending setup (model: ${model}, ${this.config.tools?.length || 0} tools)`);
287
- try {
288
- const payload = JSON.stringify(setupMessage);
289
- logger.info('VoiceService', `📤 Raw Setup Payload: ${payload}`);
290
- this.ws.send(payload);
291
- } catch (err: any) {
292
- logger.error('VoiceService', `❌ Error stringifying setup message: ${err.message}`);
293
- }
315
+ return functionDecl;
316
+ });
294
317
  }
295
318
 
296
319
  // ─── Internal: Message Handling ────────────────────────────
297
320
 
298
- private handleMessage(event: WebSocketMessageEvent): void {
321
+ /**
322
+ * Handle messages from the SDK's onmessage callback.
323
+ * The SDK parses binary/JSON automatically — we get clean objects.
324
+ *
325
+ * Per official docs, tool calls come at the top level as
326
+ * `response.toolCall.functionCalls`.
327
+ */
328
+ private handleSDKMessage(message: any): void {
299
329
  try {
300
- const dataType = typeof event.data;
301
- const dataLen = typeof event.data === 'string' ? event.data.length : (event.data?.byteLength || 'unknown');
302
- logger.info('VoiceService', `📥 WS message received: type=${dataType}, length=${dataLen}`);
303
-
304
- // Handle binary data (could be JSON or raw PCM)
305
- if (typeof event.data !== 'string') {
306
- logger.info('VoiceService', '📥 Binary message — processing...');
307
- this.handleBinaryMessage(event.data);
330
+ // RAW MESSAGE DUMP — full session visibility
331
+ const msgKeys = Object.keys(message || {}).join(', ');
332
+ logger.info('VoiceService', `📨 SDK message keys: [${msgKeys}]`);
333
+
334
+ // Full raw dump for non-audio messages (audio is too large)
335
+ if (!message.serverContent?.modelTurn?.parts?.some((p: any) => p.inlineData)) {
336
+ const rawDump = JSON.stringify(message).substring(0, 1000);
337
+ logger.info('VoiceService', `📨 RAW: ${rawDump}`);
338
+ }
339
+
340
+ // Tool calls — top-level (per official docs)
341
+ if (message.toolCall?.functionCalls) {
342
+ this.handleToolCalls(message.toolCall.functionCalls);
308
343
  return;
309
344
  }
310
345
 
311
- // Handle JSON text messages
312
- const message = JSON.parse(event.data);
313
- logger.info('VoiceService', `📥 JSON message keys: ${Object.keys(message).join(', ')}`);
314
- logger.info('VoiceService', `📥 Raw JSON Message: ${event.data.substring(0, 1000)}`);
315
- this.processMessage(message);
316
- } catch (error: any) {
317
- logger.error('VoiceService', `Error handling message: ${error.message}`);
318
- }
319
- }
346
+ // Server content (audio, text, transcripts, turn events)
347
+ if (message.serverContent) {
348
+ this.handleServerContent(message.serverContent);
349
+ }
320
350
 
321
- private handleBinaryMessage(data: any): void {
322
- try {
323
- // Try to decode as JSON first
324
- let bytes: Uint8Array;
325
- if (data instanceof ArrayBuffer) {
326
- bytes = new Uint8Array(data);
327
- } else if (data instanceof Blob) {
328
- // Blob handling — read as ArrayBuffer
329
- const reader = new FileReader();
330
- reader.onload = () => {
331
- if (reader.result instanceof ArrayBuffer) {
332
- this.processBinaryBytes(new Uint8Array(reader.result));
333
- }
334
- };
335
- reader.readAsArrayBuffer(data);
336
- return;
337
- } else {
338
- return;
351
+ // Setup complete acknowledgment
352
+ if (message.setupComplete !== undefined) {
353
+ logger.info('VoiceService', '✅ Setup complete ready for audio');
354
+ this.callbacks.onSetupComplete?.();
339
355
  }
340
356
 
341
- this.processBinaryBytes(bytes);
357
+ // Error messages
358
+ if (message.error) {
359
+ logger.error('VoiceService', `Server error: ${JSON.stringify(message.error)}`);
360
+ this.callbacks.onError?.(message.error.message || 'Server error');
361
+ }
342
362
  } catch (error: any) {
343
- logger.error('VoiceService', `Error handling binary message: ${error.message}`);
363
+ logger.error('VoiceService', `Error handling SDK message: ${error.message}`);
344
364
  }
345
365
  }
346
366
 
347
- private processBinaryBytes(bytes: Uint8Array): void {
348
- // Check if it looks like JSON (starts with '{' or '[')
349
- const looksLikeJson = bytes.length > 0 && (bytes[0] === 123 || bytes[0] === 91);
350
-
351
- if (looksLikeJson) {
352
- try {
353
- const text = new TextDecoder('utf-8').decode(bytes);
354
- const message = JSON.parse(text);
355
- this.processMessage(message);
356
- } catch {
357
- // Not JSON — treat as raw PCM audio
358
- this.callbacks.onAudioResponse?.(this.arrayBufferToBase64(bytes.buffer as ArrayBuffer));
359
- }
360
- } else {
361
- // Raw PCM audio data
362
- this.callbacks.onAudioResponse?.(this.arrayBufferToBase64(bytes.buffer as ArrayBuffer));
367
+ /** Process tool calls from the model */
368
+ private handleToolCalls(functionCalls: any[]): void {
369
+ for (const fn of functionCalls) {
370
+ logger.info('VoiceService', `🎯 Tool call: ${fn.name}(${JSON.stringify(fn.args)}) [id=${fn.id}]`);
371
+ this.callbacks.onToolCall?.({
372
+ name: fn.name,
373
+ args: fn.args || {},
374
+ id: fn.id,
375
+ });
363
376
  }
364
377
  }
365
378
 
366
- private processMessage(message: any): void {
367
- // Setup complete acknowledgment
368
- if (message.setupComplete !== undefined) {
369
- logger.info('VoiceService', '✅ Setup complete — ready for audio exchange');
370
- this.setupComplete = true;
371
- this.setStatus('connected');
372
- return;
373
- }
379
+ private audioResponseCount = 0;
374
380
 
375
- // Server content (audio response + transcripts)
376
- if (message.serverContent) {
377
- const content = message.serverContent;
378
- logger.info('VoiceService', `📥 serverContent received turnComplete=${content.turnComplete}, hasParts=${!!content.modelTurn?.parts}, inputTranscription=${!!content.inputTranscription}, outputTranscription=${!!content.outputTranscription}`);
381
+ /** Process server content (audio responses, transcripts, turn events) */
382
+ private handleServerContent(content: any): void {
383
+ // Log all keys for full visibility
384
+ const contentKeys = Object.keys(content || {}).join(', ');
385
+ logger.debug('VoiceService', `📦 serverContent keys: [${contentKeys}]`);
379
386
 
380
- // Check for turn complete
381
- if (content.turnComplete) {
382
- this.callbacks.onTurnComplete?.();
383
- }
384
-
385
- // Process model output parts
386
- if (content.modelTurn?.parts) {
387
- for (const part of content.modelTurn.parts) {
388
- // Audio response
389
- if (part.inlineData?.data) {
390
- logger.info('VoiceService', `🔊 Audio response: ${part.inlineData.data.length} chars`);
391
- this.callbacks.onAudioResponse?.(part.inlineData.data);
392
- }
387
+ // Turn complete
388
+ if (content.turnComplete) {
389
+ logger.info('VoiceService', `🏁 Turn complete (audioChunks sent: ${this.audioResponseCount})`);
390
+ this.audioResponseCount = 0;
391
+ this.callbacks.onTurnComplete?.();
392
+ }
393
393
 
394
- // Text response (transcript)
395
- if (part.text) {
396
- logger.info('VoiceService', `💬 Text response: "${part.text}"`);
397
- this.callbacks.onTranscript?.(part.text, true, 'model');
394
+ // Model output parts (audio + optional thinking text)
395
+ if (content.modelTurn?.parts) {
396
+ for (const part of content.modelTurn.parts) {
397
+ if (part.inlineData?.data) {
398
+ this.audioResponseCount++;
399
+ if (this.audioResponseCount <= 3 || this.audioResponseCount % 20 === 0) {
400
+ logger.info('VoiceService', `🔊 Audio chunk #${this.audioResponseCount}: ${part.inlineData.data.length} b64 chars, mime=${part.inlineData.mimeType || 'unknown'}`);
398
401
  }
402
+ this.callbacks.onAudioResponse?.(part.inlineData.data);
403
+ }
404
+ if (part.text) {
405
+ logger.info('VoiceService', `🤖 MODEL: "${part.text}"`);
406
+ this.callbacks.onTranscript?.(part.text, true, 'model');
399
407
  }
400
408
  }
409
+ }
401
410
 
402
- // Input transcription (user's speech)
403
- if (content.inputTranscription?.text) {
404
- this.callbacks.onTranscript?.(content.inputTranscription.text, true, 'user');
405
- }
406
-
407
- // Output transcription (model's speech-to-text)
408
- if (content.outputTranscription?.text) {
409
- this.callbacks.onTranscript?.(content.outputTranscription.text, true, 'model');
410
- }
411
+ // Input transcription (user's speech-to-text)
412
+ if (content.inputTranscription?.text) {
413
+ logger.info('VoiceService', `🗣️ USER (voice): "${content.inputTranscription.text}"`);
414
+ this.callbacks.onTranscript?.(content.inputTranscription.text, true, 'user');
411
415
  }
412
416
 
413
- // Tool calls from the model
414
- if (message.toolCall?.functionCalls) {
415
- for (const fn of message.toolCall.functionCalls) {
416
- logger.info('VoiceService', `🎯 Tool call: ${fn.name}(${JSON.stringify(fn.args)})`);
417
- this.callbacks.onToolCall?.({
418
- name: fn.name,
419
- args: fn.args || {},
420
- id: fn.id,
421
- });
422
- }
417
+ // Output transcription (model's speech-to-text)
418
+ if (content.outputTranscription?.text) {
419
+ logger.info('VoiceService', `🤖 MODEL (voice): "${content.outputTranscription.text}"`);
420
+ this.callbacks.onTranscript?.(content.outputTranscription.text, true, 'model');
423
421
  }
424
422
 
425
- // Error messages
426
- if (message.error) {
427
- logger.error('VoiceService', `Server error: ${JSON.stringify(message.error)}`);
428
- this.callbacks.onError?.(message.error.message || 'Server error');
423
+ // Tool calls inside serverContent (some SDK versions deliver here)
424
+ if (content.toolCall?.functionCalls) {
425
+ this.handleToolCalls(content.toolCall.functionCalls);
429
426
  }
430
427
  }
431
428
 
@@ -435,13 +432,4 @@ export class VoiceService {
435
432
  this._status = newStatus;
436
433
  this.callbacks.onStatusChange?.(newStatus);
437
434
  }
438
-
439
- private arrayBufferToBase64(buffer: ArrayBuffer): string {
440
- const bytes = new Uint8Array(buffer);
441
- let binary = '';
442
- for (let i = 0; i < bytes.byteLength; i++) {
443
- binary += String.fromCharCode(bytes[i]!);
444
- }
445
- return btoa(binary);
446
- }
447
435
  }