@mobileai/react-native 0.4.5 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. package/README.md +80 -15
  2. package/lib/module/components/AIAgent.js +181 -38
  3. package/lib/module/components/AIAgent.js.map +1 -1
  4. package/lib/module/components/AgentChatBar.js +53 -29
  5. package/lib/module/components/AgentChatBar.js.map +1 -1
  6. package/lib/module/components/Icons.js +337 -0
  7. package/lib/module/components/Icons.js.map +1 -0
  8. package/lib/module/core/AgentRuntime.js +74 -3
  9. package/lib/module/core/AgentRuntime.js.map +1 -1
  10. package/lib/module/core/systemPrompt.js +87 -34
  11. package/lib/module/core/systemPrompt.js.map +1 -1
  12. package/lib/module/services/AudioInputService.js +73 -2
  13. package/lib/module/services/AudioInputService.js.map +1 -1
  14. package/lib/module/services/AudioOutputService.js +58 -5
  15. package/lib/module/services/AudioOutputService.js.map +1 -1
  16. package/lib/module/services/VoiceService.js +284 -239
  17. package/lib/module/services/VoiceService.js.map +1 -1
  18. package/lib/typescript/src/components/AIAgent.d.ts.map +1 -1
  19. package/lib/typescript/src/components/AgentChatBar.d.ts.map +1 -1
  20. package/lib/typescript/src/components/Icons.d.ts +43 -0
  21. package/lib/typescript/src/components/Icons.d.ts.map +1 -0
  22. package/lib/typescript/src/core/AgentRuntime.d.ts +12 -0
  23. package/lib/typescript/src/core/AgentRuntime.d.ts.map +1 -1
  24. package/lib/typescript/src/core/systemPrompt.d.ts +7 -4
  25. package/lib/typescript/src/core/systemPrompt.d.ts.map +1 -1
  26. package/lib/typescript/src/services/AudioInputService.d.ts +13 -0
  27. package/lib/typescript/src/services/AudioInputService.d.ts.map +1 -1
  28. package/lib/typescript/src/services/AudioOutputService.d.ts.map +1 -1
  29. package/lib/typescript/src/services/VoiceService.d.ts +41 -24
  30. package/lib/typescript/src/services/VoiceService.d.ts.map +1 -1
  31. package/package.json +1 -1
  32. package/src/components/AIAgent.tsx +194 -38
  33. package/src/components/AgentChatBar.tsx +44 -25
  34. package/src/components/Icons.tsx +253 -0
  35. package/src/core/AgentRuntime.ts +70 -3
  36. package/src/core/systemPrompt.ts +87 -34
  37. package/src/services/AudioInputService.ts +77 -2
  38. package/src/services/AudioOutputService.ts +59 -5
  39. package/src/services/VoiceService.ts +280 -252
@@ -1,15 +1,26 @@
1
1
  /**
2
- * VoiceService — WebSocket connection to Gemini Live API.
2
+ * VoiceService — @google/genai SDK Live API connection.
3
+ *
4
+ * Uses the official `ai.live.connect()` method instead of raw WebSocket.
5
+ * This fixes function calling reliability: the SDK handles protocol details
6
+ * (binary framing, message transforms, model name prefixes) that our
7
+ * previous raw WebSocket implementation missed.
3
8
  *
4
9
  * Handles bidirectional audio streaming between the app and Gemini:
5
10
  * - Sends PCM 16kHz 16-bit audio chunks (mic input)
6
11
  * - Receives PCM 24kHz 16-bit audio chunks (AI responses)
7
12
  * - Receives function calls (tap, navigate, etc.) for agentic actions
8
- * - Sends screen context (DOM text + optional screenshot) for live mode
9
- *
10
- * Protocol: wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent
13
+ * - Sends screen context (DOM text) for live mode
11
14
  */
12
15
 
16
+ // Platform-specific import: Metro can't resolve '@google/genai/web' sub-path
17
+ // export, so we use the full path to the web bundle. This is what the SDK
18
+ // recommends ('use a platform specific import') — RN's WebSocket API is
19
+ // browser-compatible so the web bundle works correctly.
20
+ // @ts-ignore — TS can't find declarations for the deep path
21
+ import { GoogleGenAI, Modality } from '@google/genai/dist/web/index.mjs';
22
+ // @ts-ignore
23
+ import type { Session } from '@google/genai/dist/web/index.mjs';
13
24
  import { logger } from '../utils/logger';
14
25
  import type { ToolDefinition } from '../core/types';
15
26
 
@@ -34,25 +45,26 @@ export interface VoiceServiceCallbacks {
34
45
  onError?: (error: string) => void;
35
46
  /** Called when AI turn is complete (all audio sent) */
36
47
  onTurnComplete?: () => void;
48
+ /** Called when SDK setup is complete — safe to send screen context */
49
+ onSetupComplete?: () => void;
37
50
  }
38
51
 
39
52
  export type VoiceStatus = 'disconnected' | 'connecting' | 'connected' | 'error';
40
53
 
41
54
  // ─── Constants ─────────────────────────────────────────────────
42
55
 
43
- const WS_HOST = 'generativelanguage.googleapis.com';
44
- const WS_PATH = '/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent';
45
56
  const DEFAULT_MODEL = 'gemini-2.5-flash-native-audio-preview-12-2025';
46
57
  const DEFAULT_INPUT_SAMPLE_RATE = 16000;
47
58
 
48
59
  // ─── Service ───────────────────────────────────────────────────
49
60
 
50
61
  export class VoiceService {
51
- private ws: WebSocket | null = null;
62
+ private session: Session | null = null;
52
63
  private config: VoiceServiceConfig;
53
64
  private callbacks: VoiceServiceCallbacks = {};
54
- private setupComplete = false;
65
+ public lastCallbacks: VoiceServiceCallbacks | null = null;
55
66
  private _status: VoiceStatus = 'disconnected';
67
+ public intentionalDisconnect = false;
56
68
 
57
69
  constructor(config: VoiceServiceConfig) {
58
70
  this.config = config;
@@ -60,55 +72,115 @@ export class VoiceService {
60
72
 
61
73
  // ─── Connection ────────────────────────────────────────────
62
74
 
63
- connect(callbacks: VoiceServiceCallbacks): void {
64
- if (this.ws?.readyState === WebSocket.OPEN) {
75
+ /**
76
+ * Connect to Gemini Live API via the official SDK.
77
+ * Now async because `ai.live.connect()` returns a Promise.
78
+ */
79
+ async connect(callbacks: VoiceServiceCallbacks): Promise<void> {
80
+ if (this.session) {
65
81
  logger.info('VoiceService', 'Already connected');
66
82
  return;
67
83
  }
68
84
 
69
85
  this.callbacks = callbacks;
86
+ this.lastCallbacks = callbacks;
70
87
  this.setStatus('connecting');
88
+ this.intentionalDisconnect = false;
71
89
 
72
90
  const model = this.config.model || DEFAULT_MODEL;
73
- const url = `wss://${WS_HOST}${WS_PATH}?key=${this.config.apiKey}`;
91
+ logger.info('VoiceService', `Connecting via SDK (model: ${model})`);
74
92
 
75
- logger.info('VoiceService', `Connecting to Gemini Live API (model: ${model})`);
76
- this.ws = new WebSocket(url);
93
+ try {
94
+ const ai = new GoogleGenAI({ apiKey: this.config.apiKey });
77
95
 
78
- this.ws.onopen = () => {
79
- logger.info('VoiceService', 'WebSocket connected, sending setup...');
80
- this.sendSetup();
81
- };
96
+ const toolDeclarations = this.buildToolDeclarations();
82
97
 
83
- this.ws.onclose = (event) => {
84
- logger.info('VoiceService', `WebSocket closed: ${event.code} ${event.reason}`);
85
- this.setStatus('disconnected');
86
- this.setupComplete = false;
87
- };
98
+ // Build SDK config matching the official docs pattern
99
+ const sdkConfig: Record<string, any> = {
100
+ responseModalities: [Modality.AUDIO],
101
+ };
88
102
 
89
- this.ws.onerror = (error: any) => {
90
- logger.error('VoiceService', `WebSocket error: ${error.message || 'Unknown'}`);
91
- this.setStatus('error');
92
- this.callbacks.onError?.(error.message || 'WebSocket connection error');
93
- };
103
+ // Enable transcription for debugging and UX
104
+ sdkConfig.inputAudioTranscription = {};
105
+ sdkConfig.outputAudioTranscription = {};
106
+ logger.info('VoiceService', 'Transcription enabled');
94
107
 
95
- this.ws.onmessage = (event) => {
96
- this.handleMessage(event);
97
- };
108
+ if (this.config.systemPrompt) {
109
+ sdkConfig.systemInstruction = {
110
+ parts: [{ text: this.config.systemPrompt }],
111
+ };
112
+ }
113
+
114
+ if (toolDeclarations.length > 0) {
115
+ sdkConfig.tools = [{ functionDeclarations: toolDeclarations }];
116
+ }
117
+
118
+ // FULL CONFIG DUMP — see exactly what we send to SDK
119
+ const configDump = JSON.stringify({
120
+ ...sdkConfig,
121
+ systemInstruction: sdkConfig.systemInstruction ? '(present)' : '(none)',
122
+ tools: sdkConfig.tools ? `${toolDeclarations.length} declarations` : '(none)',
123
+ });
124
+ logger.info('VoiceService', `📋 SDK config: ${configDump}`);
125
+ logger.info('VoiceService', `📋 Tool names: ${toolDeclarations.map((t: any) => t.name).join(', ')}`);
126
+
127
+ const session = await ai.live.connect({
128
+ model: model,
129
+ config: sdkConfig,
130
+ callbacks: {
131
+ onopen: () => {
132
+ logger.info('VoiceService', '✅ SDK session connected');
133
+ this.setStatus('connected');
134
+ },
135
+ onmessage: (message: any) => {
136
+ this.handleSDKMessage(message);
137
+ },
138
+ onerror: (error: any) => {
139
+ const errDetail = error
140
+ ? JSON.stringify(error, Object.getOwnPropertyNames(error)).substring(0, 500)
141
+ : 'null';
142
+ logger.error('VoiceService', `SDK error: ${errDetail}`);
143
+ this.setStatus('error');
144
+ this.callbacks.onError?.(error?.message || 'SDK connection error');
145
+ },
146
+ onclose: (event: any) => {
147
+ const closeDetail = event
148
+ ? JSON.stringify(event, Object.getOwnPropertyNames(event)).substring(0, 500)
149
+ : 'null';
150
+ if (this.intentionalDisconnect) {
151
+ logger.info('VoiceService', `SDK session closed (intentional)`);
152
+ } else {
153
+ logger.error('VoiceService', `SDK session closed UNEXPECTEDLY — code: ${event?.code}, reason: ${event?.reason}, detail: ${closeDetail}`);
154
+ this.callbacks.onError?.(`Connection lost (code: ${event?.code || 'unknown'})`);
155
+ }
156
+ this.session = null;
157
+ this.setStatus('disconnected');
158
+ },
159
+ },
160
+ });
161
+
162
+ this.session = session;
163
+ logger.info('VoiceService', 'SDK session established');
164
+
165
+ } catch (error: any) {
166
+ logger.error('VoiceService', `Connection failed: ${error.message}`);
167
+ this.setStatus('error');
168
+ this.callbacks.onError?.(error.message || 'Failed to connect');
169
+ }
98
170
  }
99
171
 
100
172
  disconnect(): void {
101
- if (this.ws) {
102
- logger.info('VoiceService', 'Disconnecting...');
103
- this.ws.close();
104
- this.ws = null;
105
- this.setupComplete = false;
173
+ if (this.session) {
174
+ logger.info('VoiceService', 'Disconnecting (intentional)...');
175
+ this.intentionalDisconnect = true;
176
+ this.session.close();
177
+ this.session = null;
106
178
  this.setStatus('disconnected');
107
179
  }
108
180
  }
109
181
 
110
182
  get isConnected(): boolean {
111
- return this.ws?.readyState === WebSocket.OPEN && this.setupComplete;
183
+ return this.session !== null && this._status === 'connected';
112
184
  }
113
185
 
114
186
  get currentStatus(): VoiceStatus {
@@ -117,275 +189,240 @@ export class VoiceService {
117
189
 
118
190
  // ─── Send Audio ────────────────────────────────────────────
119
191
 
120
- /** Send PCM audio chunk (base64 encoded) to Gemini */
192
+ /** Send PCM audio chunk (base64 encoded) via SDK's sendRealtimeInput */
121
193
  private sendCount = 0;
122
194
  sendAudio(base64Audio: string): void {
123
195
  this.sendCount++;
124
- if (!this.isConnected) {
125
- logger.warn('VoiceService', `sendAudio #${this.sendCount} DROPPED not connected (ws=${this.ws?.readyState}, setup=${this.setupComplete})`);
196
+ if (!this.isConnected || !this.session) {
197
+ if (this.sendCount % 20 === 0) {
198
+ logger.warn('VoiceService', `sendAudio #${this.sendCount} DROPPED — not connected`);
199
+ }
126
200
  return;
127
201
  }
128
202
 
129
- const message = {
130
- realtimeInput: {
131
- audio: {
132
- mimeType: `audio/pcm;rate=${this.config.inputSampleRate || DEFAULT_INPUT_SAMPLE_RATE}`,
133
- data: base64Audio,
134
- },
135
- },
136
- };
203
+ const mimeType = `audio/pcm;rate=${this.config.inputSampleRate || DEFAULT_INPUT_SAMPLE_RATE}`;
137
204
 
138
- logger.info('VoiceService', `📤 #${this.sendCount} sending ${base64Audio.length} chars (ws=${this.ws?.readyState})`);
139
- this.ws!.send(JSON.stringify(message));
205
+ // DEBUG: log every send call
206
+ if (this.sendCount <= 5 || this.sendCount % 10 === 0) {
207
+ logger.info('VoiceService', `📡 sendAudio #${this.sendCount}: len=${base64Audio.length}, mime=${mimeType}, preview=${base64Audio.substring(0, 30)}...`);
208
+ }
209
+
210
+ try {
211
+ this.session.sendRealtimeInput({
212
+ audio: { data: base64Audio, mimeType },
213
+ });
214
+ // Log every 50th successful send to confirm data is reaching WebSocket
215
+ if (this.sendCount % 50 === 0) {
216
+ logger.info('VoiceService', `✅ sendAudio #${this.sendCount} OK — session.isOpen=${!!this.session}`);
217
+ }
218
+ } catch (error: any) {
219
+ logger.error('VoiceService', `❌ sendAudio EXCEPTION: ${error.message}\n${error.stack?.substring(0, 300)}`);
220
+ this.session = null;
221
+ this.setStatus('disconnected');
222
+ }
140
223
  }
141
224
 
142
225
  // ─── Send Text ─────────────────────────────────────────────
143
226
 
144
- /** Send text message via realtimeInput (same channel as audio) */
227
+ /** Send text message via SDK's sendClientContent */
145
228
  sendText(text: string): void {
146
- if (!this.isConnected) return;
147
-
148
- const message = {
149
- realtimeInput: { text },
150
- };
229
+ if (!this.isConnected || !this.session) return;
151
230
 
152
- this.ws!.send(JSON.stringify(message));
231
+ logger.info('VoiceService', `🗣️ USER (text): "${text}"`);
232
+ try {
233
+ this.session.sendClientContent({
234
+ turns: [{ role: 'user', parts: [{ text }] }],
235
+ turnComplete: true,
236
+ });
237
+ } catch (error: any) {
238
+ logger.error('VoiceService', `sendText failed: ${error.message}`);
239
+ }
153
240
  }
154
241
 
155
- /** Send DOM tree as passive context during live conversation.
156
- *
157
- * Uses `clientContent` with `turnComplete: false` to inject context
158
- * WITHOUT triggering a model response. This is the "incremental content
159
- * updates" pattern from the Gemini docs for establishing session context.
160
- *
161
- * Called once at connect + after each tool call (not on a timer).
162
- * Screenshots are handled separately via the capture_screenshot tool.
242
+ /**
243
+ * Send DOM tree as passive context during live conversation.
244
+ * Sends with turnComplete: true so the context is delivered as a complete user turn.
163
245
  */
164
246
  sendScreenContext(domText: string): void {
165
- if (!this.isConnected) return;
247
+ if (!this.isConnected || !this.session) return;
166
248
 
167
- const message = {
168
- clientContent: {
249
+ try {
250
+ this.session.sendClientContent({
169
251
  turns: [{ role: 'user', parts: [{ text: domText }] }],
170
- turnComplete: false, // Passive context — don't trigger a response
171
- },
172
- };
173
-
174
- this.ws!.send(JSON.stringify(message));
175
- logger.debug('VoiceService', `📤 Screen context sent (${domText.length} chars)`);
252
+ turnComplete: true,
253
+ });
254
+ logger.info('VoiceService', `📤 Screen context sent (${domText.length} chars)`);
255
+ } catch (error: any) {
256
+ logger.error('VoiceService', `sendScreenContext failed: ${error.message}`);
257
+ }
176
258
  }
177
259
 
178
260
  // ─── Send Function Response ────────────────────────────────
179
261
 
180
- /** Send function call result back to Gemini */
262
+ /** Send function call result back via SDK's sendToolResponse */
181
263
  sendFunctionResponse(name: string, id: string, result: any): void {
182
- if (!this.isConnected) return;
183
-
184
- const message = {
185
- toolResponse: {
186
- functionResponses: [{
187
- name,
188
- id,
189
- response: result,
190
- }],
191
- },
192
- };
264
+ if (!this.isConnected || !this.session) return;
193
265
 
194
266
  logger.info('VoiceService', `📤 Sending tool response for ${name} (id=${id})`);
195
- this.ws!.send(JSON.stringify(message));
267
+
268
+ try {
269
+ this.session.sendToolResponse({
270
+ functionResponses: [{ name, id, response: result }],
271
+ });
272
+ } catch (error: any) {
273
+ logger.error('VoiceService', `sendFunctionResponse failed: ${error.message}`);
274
+ }
196
275
  }
197
276
 
198
- // ─── Internal: Setup ───────────────────────────────────────
277
+ // ─── Internal: Tool Declarations ───────────────────────────
199
278
 
200
- private sendSetup(): void {
201
- if (!this.ws || this.ws.readyState !== WebSocket.OPEN) return;
279
+ /**
280
+ * Builds function declarations from configured tools.
281
+ * Converts BOOLEAN params to STRING (native audio model limitation).
282
+ */
283
+ private buildToolDeclarations(): any[] {
284
+ if (!this.config.tools?.length) return [];
202
285
 
203
- const model = this.config.model || DEFAULT_MODEL;
286
+ const validTools = this.config.tools.filter(t => t.name !== 'capture_screenshot');
287
+ if (validTools.length === 0) return [];
204
288
 
205
- const setup: any = {
206
- model: `models/${model}`,
207
- generationConfig: {
208
- responseModalities: ['AUDIO'],
209
- },
210
- };
211
-
212
- // For native audio models, language is enforced via system instructions.
213
- // Explicitly setting speechConfig.languageCode causes silent API failures
214
- // without a prebuiltVoiceConfig.
215
-
216
- // Add system instruction if provided
217
- if (this.config.systemPrompt) {
218
- setup.systemInstruction = {
219
- parts: [{ text: this.config.systemPrompt }],
289
+ return validTools.map(tool => {
290
+ const hasParams = Object.keys(tool.parameters || {}).length > 0;
291
+ const functionDecl: any = {
292
+ name: tool.name,
293
+ description: tool.description,
220
294
  };
221
- }
222
295
 
223
- // Add tool declarations for function calling
224
- if (this.config.tools?.length) {
225
- setup.tools = [{
226
- functionDeclarations: this.config.tools.map(tool => ({
227
- name: tool.name,
228
- description: tool.description,
229
- parameters: {
230
- type: 'OBJECT',
231
- properties: Object.fromEntries(
232
- Object.entries(tool.parameters).map(([key, param]) => [
233
- key,
234
- {
235
- type: param.type.toUpperCase(),
236
- description: param.description,
237
- },
238
- ])
239
- ),
240
- required: Object.entries(tool.parameters)
241
- .filter(([, param]) => param.required)
242
- .map(([key]) => key),
243
- },
244
- })),
245
- }];
246
- }
247
-
248
- // Enable transcription
249
- setup.inputAudioTranscription = {};
250
- setup.outputAudioTranscription = {};
251
-
252
- const setupMessage = { setup };
253
- logger.info('VoiceService', `Sending setup (model: ${model}, tools: ${this.config.tools?.length || 0})`);
254
- this.ws.send(JSON.stringify(setupMessage));
296
+ if (hasParams) {
297
+ functionDecl.parameters = {
298
+ type: 'OBJECT',
299
+ properties: Object.fromEntries(
300
+ Object.entries(tool.parameters).map(([key, param]) => {
301
+ let paramType = param.type.toUpperCase();
302
+ let desc = param.description;
303
+ if (paramType === 'BOOLEAN') {
304
+ paramType = 'STRING';
305
+ desc = `${desc} (use "true" or "false")`;
306
+ }
307
+ return [key, { type: paramType, description: desc }];
308
+ })
309
+ ),
310
+ required: Object.entries(tool.parameters)
311
+ .filter(([, param]) => param.required)
312
+ .map(([key]) => key),
313
+ };
314
+ }
315
+ return functionDecl;
316
+ });
255
317
  }
256
318
 
257
319
  // ─── Internal: Message Handling ────────────────────────────
258
320
 
259
- private handleMessage(event: WebSocketMessageEvent): void {
321
+ /**
322
+ * Handle messages from the SDK's onmessage callback.
323
+ * The SDK parses binary/JSON automatically — we get clean objects.
324
+ *
325
+ * Per official docs, tool calls come at the top level as
326
+ * `response.toolCall.functionCalls`.
327
+ */
328
+ private handleSDKMessage(message: any): void {
260
329
  try {
261
- const dataType = typeof event.data;
262
- const dataLen = typeof event.data === 'string' ? event.data.length : (event.data?.byteLength || 'unknown');
263
- logger.info('VoiceService', `📥 WS message received: type=${dataType}, length=${dataLen}`);
264
-
265
- // Handle binary data (could be JSON or raw PCM)
266
- if (typeof event.data !== 'string') {
267
- logger.info('VoiceService', '📥 Binary message — processing...');
268
- this.handleBinaryMessage(event.data);
330
+ // RAW MESSAGE DUMP — full session visibility
331
+ const msgKeys = Object.keys(message || {}).join(', ');
332
+ logger.info('VoiceService', `📨 SDK message keys: [${msgKeys}]`);
333
+
334
+ // Full raw dump for non-audio messages (audio is too large)
335
+ if (!message.serverContent?.modelTurn?.parts?.some((p: any) => p.inlineData)) {
336
+ const rawDump = JSON.stringify(message).substring(0, 1000);
337
+ logger.info('VoiceService', `📨 RAW: ${rawDump}`);
338
+ }
339
+
340
+ // Tool calls — top-level (per official docs)
341
+ if (message.toolCall?.functionCalls) {
342
+ this.handleToolCalls(message.toolCall.functionCalls);
269
343
  return;
270
344
  }
271
345
 
272
- // Handle JSON text messages
273
- const message = JSON.parse(event.data);
274
- logger.info('VoiceService', `📥 JSON message keys: ${Object.keys(message).join(', ')}`);
275
- this.processMessage(message);
276
- } catch (error: any) {
277
- logger.error('VoiceService', `Error handling message: ${error.message}`);
278
- }
279
- }
346
+ // Server content (audio, text, transcripts, turn events)
347
+ if (message.serverContent) {
348
+ this.handleServerContent(message.serverContent);
349
+ }
280
350
 
281
- private handleBinaryMessage(data: any): void {
282
- try {
283
- // Try to decode as JSON first
284
- let bytes: Uint8Array;
285
- if (data instanceof ArrayBuffer) {
286
- bytes = new Uint8Array(data);
287
- } else if (data instanceof Blob) {
288
- // Blob handling — read as ArrayBuffer
289
- const reader = new FileReader();
290
- reader.onload = () => {
291
- if (reader.result instanceof ArrayBuffer) {
292
- this.processBinaryBytes(new Uint8Array(reader.result));
293
- }
294
- };
295
- reader.readAsArrayBuffer(data);
296
- return;
297
- } else {
298
- return;
351
+ // Setup complete acknowledgment
352
+ if (message.setupComplete !== undefined) {
353
+ logger.info('VoiceService', '✅ Setup complete ready for audio');
354
+ this.callbacks.onSetupComplete?.();
299
355
  }
300
356
 
301
- this.processBinaryBytes(bytes);
357
+ // Error messages
358
+ if (message.error) {
359
+ logger.error('VoiceService', `Server error: ${JSON.stringify(message.error)}`);
360
+ this.callbacks.onError?.(message.error.message || 'Server error');
361
+ }
302
362
  } catch (error: any) {
303
- logger.error('VoiceService', `Error handling binary message: ${error.message}`);
363
+ logger.error('VoiceService', `Error handling SDK message: ${error.message}`);
304
364
  }
305
365
  }
306
366
 
307
- private processBinaryBytes(bytes: Uint8Array): void {
308
- // Check if it looks like JSON (starts with '{' or '[')
309
- const looksLikeJson = bytes.length > 0 && (bytes[0] === 123 || bytes[0] === 91);
310
-
311
- if (looksLikeJson) {
312
- try {
313
- const text = new TextDecoder('utf-8').decode(bytes);
314
- const message = JSON.parse(text);
315
- this.processMessage(message);
316
- } catch {
317
- // Not JSON — treat as raw PCM audio
318
- this.callbacks.onAudioResponse?.(this.arrayBufferToBase64(bytes.buffer as ArrayBuffer));
319
- }
320
- } else {
321
- // Raw PCM audio data
322
- this.callbacks.onAudioResponse?.(this.arrayBufferToBase64(bytes.buffer as ArrayBuffer));
367
+ /** Process tool calls from the model */
368
+ private handleToolCalls(functionCalls: any[]): void {
369
+ for (const fn of functionCalls) {
370
+ logger.info('VoiceService', `🎯 Tool call: ${fn.name}(${JSON.stringify(fn.args)}) [id=${fn.id}]`);
371
+ this.callbacks.onToolCall?.({
372
+ name: fn.name,
373
+ args: fn.args || {},
374
+ id: fn.id,
375
+ });
323
376
  }
324
377
  }
325
378
 
326
- private processMessage(message: any): void {
327
- // Setup complete acknowledgment
328
- if (message.setupComplete !== undefined) {
329
- logger.info('VoiceService', '✅ Setup complete — ready for audio exchange');
330
- this.setupComplete = true;
331
- this.setStatus('connected');
332
- return;
333
- }
379
+ private audioResponseCount = 0;
334
380
 
335
- // Server content (audio response + transcripts)
336
- if (message.serverContent) {
337
- const content = message.serverContent;
338
- logger.info('VoiceService', `📥 serverContent received turnComplete=${content.turnComplete}, hasParts=${!!content.modelTurn?.parts}, inputTranscription=${!!content.inputTranscription}, outputTranscription=${!!content.outputTranscription}`);
381
+ /** Process server content (audio responses, transcripts, turn events) */
382
+ private handleServerContent(content: any): void {
383
+ // Log all keys for full visibility
384
+ const contentKeys = Object.keys(content || {}).join(', ');
385
+ logger.debug('VoiceService', `📦 serverContent keys: [${contentKeys}]`);
339
386
 
340
- // Check for turn complete
341
- if (content.turnComplete) {
342
- this.callbacks.onTurnComplete?.();
343
- }
344
-
345
- // Process model output parts
346
- if (content.modelTurn?.parts) {
347
- for (const part of content.modelTurn.parts) {
348
- // Audio response
349
- if (part.inlineData?.data) {
350
- logger.info('VoiceService', `🔊 Audio response: ${part.inlineData.data.length} chars`);
351
- this.callbacks.onAudioResponse?.(part.inlineData.data);
352
- }
387
+ // Turn complete
388
+ if (content.turnComplete) {
389
+ logger.info('VoiceService', `🏁 Turn complete (audioChunks sent: ${this.audioResponseCount})`);
390
+ this.audioResponseCount = 0;
391
+ this.callbacks.onTurnComplete?.();
392
+ }
353
393
 
354
- // Text response (transcript)
355
- if (part.text) {
356
- logger.info('VoiceService', `💬 Text response: "${part.text}"`);
357
- this.callbacks.onTranscript?.(part.text, true, 'model');
394
+ // Model output parts (audio + optional thinking text)
395
+ if (content.modelTurn?.parts) {
396
+ for (const part of content.modelTurn.parts) {
397
+ if (part.inlineData?.data) {
398
+ this.audioResponseCount++;
399
+ if (this.audioResponseCount <= 3 || this.audioResponseCount % 20 === 0) {
400
+ logger.info('VoiceService', `🔊 Audio chunk #${this.audioResponseCount}: ${part.inlineData.data.length} b64 chars, mime=${part.inlineData.mimeType || 'unknown'}`);
358
401
  }
402
+ this.callbacks.onAudioResponse?.(part.inlineData.data);
403
+ }
404
+ if (part.text) {
405
+ logger.info('VoiceService', `🤖 MODEL: "${part.text}"`);
406
+ this.callbacks.onTranscript?.(part.text, true, 'model');
359
407
  }
360
408
  }
409
+ }
361
410
 
362
- // Input transcription (user's speech)
363
- if (content.inputTranscription?.text) {
364
- this.callbacks.onTranscript?.(content.inputTranscription.text, true, 'user');
365
- }
366
-
367
- // Output transcription (model's speech-to-text)
368
- if (content.outputTranscription?.text) {
369
- this.callbacks.onTranscript?.(content.outputTranscription.text, true, 'model');
370
- }
411
+ // Input transcription (user's speech-to-text)
412
+ if (content.inputTranscription?.text) {
413
+ logger.info('VoiceService', `🗣️ USER (voice): "${content.inputTranscription.text}"`);
414
+ this.callbacks.onTranscript?.(content.inputTranscription.text, true, 'user');
371
415
  }
372
416
 
373
- // Tool calls from the model
374
- if (message.toolCall?.functionCalls) {
375
- for (const fn of message.toolCall.functionCalls) {
376
- logger.info('VoiceService', `Tool call: ${fn.name}(${JSON.stringify(fn.args)})`);
377
- this.callbacks.onToolCall?.({
378
- name: fn.name,
379
- args: fn.args || {},
380
- id: fn.id,
381
- });
382
- }
417
+ // Output transcription (model's speech-to-text)
418
+ if (content.outputTranscription?.text) {
419
+ logger.info('VoiceService', `🤖 MODEL (voice): "${content.outputTranscription.text}"`);
420
+ this.callbacks.onTranscript?.(content.outputTranscription.text, true, 'model');
383
421
  }
384
422
 
385
- // Error messages
386
- if (message.error) {
387
- logger.error('VoiceService', `Server error: ${JSON.stringify(message.error)}`);
388
- this.callbacks.onError?.(message.error.message || 'Server error');
423
+ // Tool calls inside serverContent (some SDK versions deliver here)
424
+ if (content.toolCall?.functionCalls) {
425
+ this.handleToolCalls(content.toolCall.functionCalls);
389
426
  }
390
427
  }
391
428
 
@@ -395,13 +432,4 @@ export class VoiceService {
395
432
  this._status = newStatus;
396
433
  this.callbacks.onStatusChange?.(newStatus);
397
434
  }
398
-
399
- private arrayBufferToBase64(buffer: ArrayBuffer): string {
400
- const bytes = new Uint8Array(buffer);
401
- let binary = '';
402
- for (let i = 0; i < bytes.byteLength; i++) {
403
- binary += String.fromCharCode(bytes[i]!);
404
- }
405
- return btoa(binary);
406
- }
407
435
  }