@mobileai/react-native 0.4.2 → 0.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. package/README.md +23 -3
  2. package/lib/module/components/AIAgent.js +216 -5
  3. package/lib/module/components/AIAgent.js.map +1 -1
  4. package/lib/module/components/AgentChatBar.js +358 -36
  5. package/lib/module/components/AgentChatBar.js.map +1 -1
  6. package/lib/module/core/AgentRuntime.js +122 -6
  7. package/lib/module/core/AgentRuntime.js.map +1 -1
  8. package/lib/module/core/systemPrompt.js +57 -0
  9. package/lib/module/core/systemPrompt.js.map +1 -1
  10. package/lib/module/index.js +8 -0
  11. package/lib/module/index.js.map +1 -1
  12. package/lib/module/providers/GeminiProvider.js +108 -85
  13. package/lib/module/providers/GeminiProvider.js.map +1 -1
  14. package/lib/module/services/AudioInputService.js +128 -0
  15. package/lib/module/services/AudioInputService.js.map +1 -0
  16. package/lib/module/services/AudioOutputService.js +154 -0
  17. package/lib/module/services/AudioOutputService.js.map +1 -0
  18. package/lib/module/services/VoiceService.js +361 -0
  19. package/lib/module/services/VoiceService.js.map +1 -0
  20. package/lib/module/utils/audioUtils.js +49 -0
  21. package/lib/module/utils/audioUtils.js.map +1 -0
  22. package/lib/module/utils/logger.js +21 -4
  23. package/lib/module/utils/logger.js.map +1 -1
  24. package/lib/typescript/babel.config.d.ts +10 -0
  25. package/lib/typescript/babel.config.d.ts.map +1 -0
  26. package/lib/typescript/eslint.config.d.mts +3 -0
  27. package/lib/typescript/eslint.config.d.mts.map +1 -0
  28. package/lib/typescript/fetch-models.d.mts +2 -0
  29. package/lib/typescript/fetch-models.d.mts.map +1 -0
  30. package/lib/typescript/list-all-models.d.mts +2 -0
  31. package/lib/typescript/list-all-models.d.mts.map +1 -0
  32. package/lib/typescript/list-models.d.mts +2 -0
  33. package/lib/typescript/list-models.d.mts.map +1 -0
  34. package/lib/typescript/src/components/AIAgent.d.ts +8 -2
  35. package/lib/typescript/src/components/AIAgent.d.ts.map +1 -1
  36. package/lib/typescript/src/components/AgentChatBar.d.ts +19 -2
  37. package/lib/typescript/src/components/AgentChatBar.d.ts.map +1 -1
  38. package/lib/typescript/src/core/AgentRuntime.d.ts +17 -1
  39. package/lib/typescript/src/core/AgentRuntime.d.ts.map +1 -1
  40. package/lib/typescript/src/core/systemPrompt.d.ts +8 -0
  41. package/lib/typescript/src/core/systemPrompt.d.ts.map +1 -1
  42. package/lib/typescript/src/core/types.d.ts +24 -1
  43. package/lib/typescript/src/core/types.d.ts.map +1 -1
  44. package/lib/typescript/src/index.d.ts +6 -1
  45. package/lib/typescript/src/index.d.ts.map +1 -1
  46. package/lib/typescript/src/providers/GeminiProvider.d.ts +22 -18
  47. package/lib/typescript/src/providers/GeminiProvider.d.ts.map +1 -1
  48. package/lib/typescript/src/services/AudioInputService.d.ts +31 -0
  49. package/lib/typescript/src/services/AudioInputService.d.ts.map +1 -0
  50. package/lib/typescript/src/services/AudioOutputService.d.ts +34 -0
  51. package/lib/typescript/src/services/AudioOutputService.d.ts.map +1 -0
  52. package/lib/typescript/src/services/VoiceService.d.ts +73 -0
  53. package/lib/typescript/src/services/VoiceService.d.ts.map +1 -0
  54. package/lib/typescript/src/utils/audioUtils.d.ts +17 -0
  55. package/lib/typescript/src/utils/audioUtils.d.ts.map +1 -0
  56. package/lib/typescript/src/utils/logger.d.ts +4 -0
  57. package/lib/typescript/src/utils/logger.d.ts.map +1 -1
  58. package/package.json +24 -8
  59. package/src/components/AIAgent.tsx +222 -3
  60. package/src/components/AgentChatBar.tsx +487 -42
  61. package/src/core/AgentRuntime.ts +131 -2
  62. package/src/core/systemPrompt.ts +62 -0
  63. package/src/core/types.ts +30 -0
  64. package/src/index.ts +16 -0
  65. package/src/providers/GeminiProvider.ts +105 -89
  66. package/src/services/AudioInputService.ts +141 -0
  67. package/src/services/AudioOutputService.ts +167 -0
  68. package/src/services/VoiceService.ts +407 -0
  69. package/src/utils/audioUtils.ts +54 -0
  70. package/src/utils/logger.ts +24 -7
@@ -1,40 +1,34 @@
1
1
  /**
2
- * GeminiProvider — Gemini API integration with structured action pattern.
2
+ * GeminiProvider — Gemini API integration via @google/genai SDK.
3
3
  *
4
- * Uses a single forced function call (`agent_step`) that bundles
5
- * structured reasoning (evaluation, memory, plan) alongside the action.
6
- * This replaces free-form text + separate tool calls for stability.
4
+ * Uses the official Google GenAI SDK for:
5
+ * - generateContent with structured function calling (agent_step)
6
+ * - inlineData for vision (base64 screenshots)
7
+ * - System instructions
8
+ *
9
+ * Implements the AIProvider interface so it can be swapped
10
+ * with OpenAIProvider, AnthropicProvider, etc.
7
11
  */
8
12
 
13
+ import { GoogleGenAI, FunctionCallingConfigMode, Type } from '@google/genai';
9
14
  import { logger } from '../utils/logger';
10
- import type { AIProvider, ToolDefinition, AgentStep, ProviderResult, AgentReasoning } from '../core/types';
15
+ import type { AIProvider, ToolDefinition, AgentStep, ProviderResult, AgentReasoning, TokenUsage } from '../core/types';
11
16
 
12
17
  // ─── Constants ─────────────────────────────────────────────────
13
18
 
14
19
  const AGENT_STEP_FN = 'agent_step';
15
20
 
16
- // Reasoning fields that are always present in the agent_step schema
21
+ // Reasoning fields always present in the agent_step schema
17
22
  const REASONING_FIELDS = ['previous_goal_eval', 'memory', 'plan'] as const;
18
23
 
19
- // ─── Gemini API Types ──────────────────────────────────────────
20
-
21
- interface GeminiContent {
22
- role: 'user' | 'model';
23
- parts: Array<{
24
- text?: string;
25
- functionCall?: { name: string; args: any };
26
- functionResponse?: { name: string; response: any };
27
- }>;
28
- }
29
-
30
24
  // ─── Provider ──────────────────────────────────────────────────
31
25
 
32
26
  export class GeminiProvider implements AIProvider {
33
- private apiKey: string;
27
+ private ai: GoogleGenAI;
34
28
  private model: string;
35
29
 
36
30
  constructor(apiKey: string, model: string = 'gemini-2.5-flash') {
37
- this.apiKey = apiKey;
31
+ this.ai = new GoogleGenAI({ apiKey });
38
32
  this.model = model;
39
33
  }
40
34
 
@@ -43,59 +37,56 @@ export class GeminiProvider implements AIProvider {
43
37
  userMessage: string,
44
38
  tools: ToolDefinition[],
45
39
  history: AgentStep[],
40
+ screenshot?: string,
46
41
  ): Promise<ProviderResult> {
47
42
 
48
- logger.info('GeminiProvider', `Sending request. Model: ${this.model}, Tools: ${tools.length}`);
43
+ logger.info('GeminiProvider', `Sending request. Model: ${this.model}, Tools: ${tools.length}${screenshot ? ', with screenshot' : ''}`);
49
44
 
50
45
  // Build single agent_step function declaration
51
46
  const agentStepDeclaration = this.buildAgentStepDeclaration(tools);
52
47
 
53
- // Build conversation history with proper function call/response pairs
54
- const contents = this.buildContents(userMessage, history);
55
-
56
- // Make API request
57
- const url = `https://generativelanguage.googleapis.com/v1beta/models/${this.model}:generateContent?key=${this.apiKey}`;
58
-
59
- const body: any = {
60
- contents,
61
- tools: [{ functionDeclarations: [agentStepDeclaration] }],
62
- systemInstruction: { parts: [{ text: systemPrompt }] },
63
- // Force the model to always call agent_step
64
- tool_config: {
65
- function_calling_config: {
66
- mode: 'ANY',
67
- allowed_function_names: [AGENT_STEP_FN],
68
- },
69
- },
70
- generationConfig: {
71
- temperature: 0.2,
72
- maxOutputTokens: 2048,
73
- },
74
- };
48
+ // Build contents (user message + optional screenshot)
49
+ const contents = this.buildContents(userMessage, history, screenshot);
75
50
 
76
51
  const startTime = Date.now();
77
52
 
78
53
  try {
79
- const response = await fetch(url, {
80
- method: 'POST',
81
- headers: { 'Content-Type': 'application/json' },
82
- body: JSON.stringify(body),
54
+ const response = await this.ai.models.generateContent({
55
+ model: this.model,
56
+ contents,
57
+ config: {
58
+ systemInstruction: systemPrompt,
59
+ tools: [{ functionDeclarations: [agentStepDeclaration] }],
60
+ toolConfig: {
61
+ functionCallingConfig: {
62
+ mode: FunctionCallingConfigMode.ANY,
63
+ allowedFunctionNames: [AGENT_STEP_FN],
64
+ },
65
+ },
66
+ temperature: 0.2,
67
+ maxOutputTokens: 2048,
68
+ },
83
69
  });
84
70
 
85
71
  const elapsed = Date.now() - startTime;
86
72
  logger.info('GeminiProvider', `Response received in ${elapsed}ms`);
87
73
 
88
- if (!response.ok) {
89
- const errorText = await response.text();
90
- logger.error('GeminiProvider', `API error ${response.status}: ${errorText}`);
91
- throw new Error(`Gemini API error ${response.status}: ${errorText}`);
74
+ // Extract token usage from SDK response
75
+ const tokenUsage = this.extractTokenUsage(response);
76
+ if (tokenUsage) {
77
+ logger.info('GeminiProvider', `Tokens: ${tokenUsage.promptTokens} in / ${tokenUsage.completionTokens} out / $${tokenUsage.estimatedCostUSD.toFixed(6)}`);
92
78
  }
93
79
 
94
- const data = await response.json();
95
-
96
- return this.parseAgentStepResponse(data, tools);
80
+ const result = this.parseAgentStepResponse(response, tools);
81
+ result.tokenUsage = tokenUsage;
82
+ return result;
97
83
  } catch (error: any) {
98
84
  logger.error('GeminiProvider', 'Request failed:', error.message);
85
+
86
+ // Preserve HTTP error format for backward compatibility with tests
87
+ if (error.status) {
88
+ throw new Error(`Gemini API error ${error.status}: ${error.message}`);
89
+ }
99
90
  throw error;
100
91
  }
101
92
  }
@@ -117,7 +108,6 @@ export class GeminiProvider implements AIProvider {
117
108
  const actionProperties: Record<string, any> = {};
118
109
  for (const tool of tools) {
119
110
  for (const [paramName, param] of Object.entries(tool.parameters)) {
120
- // Skip if already added (shared field names like 'text', 'index')
121
111
  if (actionProperties[paramName]) continue;
122
112
  actionProperties[paramName] = {
123
113
  type: this.mapParamType(param.type),
@@ -139,28 +129,25 @@ export class GeminiProvider implements AIProvider {
139
129
  name: AGENT_STEP_FN,
140
130
  description: `Execute one agent step. Choose an action and provide reasoning.\n\nAvailable actions:\n${toolDescriptions}`,
141
131
  parameters: {
142
- type: 'OBJECT',
132
+ type: Type.OBJECT,
143
133
  properties: {
144
- // ── Reasoning fields ──
145
134
  previous_goal_eval: {
146
- type: 'STRING',
135
+ type: Type.STRING,
147
136
  description: 'One-sentence assessment of your last action. State success, failure, or uncertain. Skip on first step.',
148
137
  },
149
138
  memory: {
150
- type: 'STRING',
139
+ type: Type.STRING,
151
140
  description: 'Key facts to remember for future steps: progress made, items found, counters, field values already collected.',
152
141
  },
153
142
  plan: {
154
- type: 'STRING',
143
+ type: Type.STRING,
155
144
  description: 'Your immediate next goal — what action you will take and why.',
156
145
  },
157
- // ── Action selection ──
158
146
  action_name: {
159
- type: 'STRING',
147
+ type: Type.STRING,
160
148
  description: 'Which action to execute.',
161
149
  enum: toolNames,
162
150
  },
163
- // ── Action parameters (flat) ──
164
151
  ...actionProperties,
165
152
  },
166
153
  required: ['plan', 'action_name'],
@@ -170,43 +157,46 @@ export class GeminiProvider implements AIProvider {
170
157
 
171
158
  private mapParamType(type: string): string {
172
159
  switch (type) {
173
- case 'number': return 'NUMBER';
174
- case 'integer': return 'INTEGER';
175
- case 'boolean': return 'BOOLEAN';
160
+ case 'number': return Type.NUMBER;
161
+ case 'integer': return Type.INTEGER;
162
+ case 'boolean': return Type.BOOLEAN;
176
163
  case 'string':
177
- default: return 'STRING';
164
+ default: return Type.STRING;
178
165
  }
179
166
  }
180
167
 
181
168
  // ─── Build Contents ────────────────────────────────────────
182
169
 
183
170
  /**
184
- * Builds Gemini conversation contents.
185
- *
186
- * Each step is a STATELESS single-turn request (matching page-agent's approach):
187
- * - System prompt has general instructions
188
- * - User message contains full context: task, history, screen state
189
- * - Model responds with agent_step function call
190
- *
191
- * History is embedded as text in assembleUserPrompt (via <agent_history>),
192
- * NOT as functionCall/functionResponse pairs. This avoids Gemini's
193
- * conversation format requirements and thought_signature complexity.
171
+ * Builds contents for the generateContent call.
172
+ * Single-turn: user message + optional screenshot as inlineData.
194
173
  */
195
- private buildContents(userMessage: string, _history: AgentStep[]): GeminiContent[] {
196
- return [{
197
- role: 'user',
198
- parts: [{ text: userMessage }],
199
- }];
174
+ private buildContents(userMessage: string, _history: AgentStep[], screenshot?: string): any[] {
175
+ const parts: any[] = [{ text: userMessage }];
176
+
177
+ // Append screenshot as inlineData for Gemini vision
178
+ if (screenshot) {
179
+ parts.push({
180
+ inlineData: {
181
+ mimeType: 'image/jpeg',
182
+ data: screenshot,
183
+ },
184
+ });
185
+ }
186
+
187
+ return [{ role: 'user', parts }];
200
188
  }
201
189
 
202
190
  // ─── Parse Response ────────────────────────────────────────
203
191
 
204
192
  /**
205
- * Parses the Gemini response expecting a single agent_step function call.
206
- * Extracts structured reasoning + action, and determines which tool to execute.
193
+ * Parses the SDK response expecting a single agent_step function call.
194
+ * Extracts structured reasoning + action.
207
195
  */
208
- private parseAgentStepResponse(data: any, tools: ToolDefinition[]): ProviderResult {
209
- if (!data.candidates || data.candidates.length === 0) {
196
+ private parseAgentStepResponse(response: any, tools: ToolDefinition[]): ProviderResult {
197
+ const candidates = response.candidates || [];
198
+
199
+ if (candidates.length === 0) {
210
200
  logger.warn('GeminiProvider', 'No candidates in response');
211
201
  return {
212
202
  toolCalls: [{ name: 'done', args: { text: 'No response generated.', success: false } }],
@@ -215,7 +205,7 @@ export class GeminiProvider implements AIProvider {
215
205
  };
216
206
  }
217
207
 
218
- const candidate = data.candidates[0];
208
+ const candidate = candidates[0];
219
209
  const parts = candidate.content?.parts || [];
220
210
 
221
211
  // Find the function call part
@@ -251,11 +241,10 @@ export class GeminiProvider implements AIProvider {
251
241
  };
252
242
  }
253
243
 
254
- // Build action args: everything except reasoning fields and action_name
244
+ // Build action args: extract only the params that belong to the matched tool
255
245
  const actionArgs: Record<string, any> = {};
256
246
  const reservedKeys = new Set([...REASONING_FIELDS, 'action_name']);
257
247
 
258
- // Find the matching tool to know which params belong to it
259
248
  const matchedTool = tools.find(t => t.name === actionName);
260
249
  if (matchedTool) {
261
250
  for (const paramName of Object.keys(matchedTool.parameters)) {
@@ -264,7 +253,6 @@ export class GeminiProvider implements AIProvider {
264
253
  }
265
254
  }
266
255
  } else {
267
- // Custom/registered tool — grab all non-reserved fields
268
256
  for (const [key, value] of Object.entries(args)) {
269
257
  if (!reservedKeys.has(key)) {
270
258
  actionArgs[key] = value;
@@ -280,4 +268,32 @@ export class GeminiProvider implements AIProvider {
280
268
  text: textPart?.text,
281
269
  };
282
270
  }
271
+
272
+ // ─── Token Usage Extraction ─────────────────────────────────
273
+
274
+ /**
275
+ * Extracts token usage from SDK response and calculates estimated cost.
276
+ *
277
+ * Pricing (Gemini 2.5 Flash):
278
+ * - Input: $0.30 / 1M tokens
279
+ * - Output: $2.50 / 1M tokens
280
+ */
281
+ private extractTokenUsage(response: any): TokenUsage | undefined {
282
+ const meta = response?.usageMetadata;
283
+ if (!meta) return undefined;
284
+
285
+ const promptTokens = meta.promptTokenCount ?? 0;
286
+ const completionTokens = meta.candidatesTokenCount ?? 0;
287
+ const totalTokens = meta.totalTokenCount ?? (promptTokens + completionTokens);
288
+
289
+ // Cost estimation based on Gemini 2.5 Flash pricing
290
+ const INPUT_COST_PER_M = 0.30;
291
+ const OUTPUT_COST_PER_M = 2.50;
292
+
293
+ const estimatedCostUSD =
294
+ (promptTokens / 1_000_000) * INPUT_COST_PER_M +
295
+ (completionTokens / 1_000_000) * OUTPUT_COST_PER_M;
296
+
297
+ return { promptTokens, completionTokens, totalTokens, estimatedCostUSD };
298
+ }
283
299
  }
@@ -0,0 +1,141 @@
1
+ /**
2
+ * AudioInputService — Real-time microphone capture for voice mode.
3
+ *
4
+ * Uses react-native-audio-api (Software Mansion) AudioRecorder for native
5
+ * PCM streaming from the microphone. Each chunk is converted from Float32
6
+ * to Int16 PCM and base64-encoded for the Gemini Live API.
7
+ *
8
+ * Requires: react-native-audio-api (development build only, not Expo Go)
9
+ */
10
+
11
+ import { logger } from '../utils/logger';
12
+ import { float32ToInt16Base64 } from '../utils/audioUtils';
13
+
14
+ // ─── Types ─────────────────────────────────────────────────────
15
+
16
+ export interface AudioInputConfig {
17
+ sampleRate?: number;
18
+ /** Number of samples per callback buffer (default: 4096) */
19
+ bufferLength?: number;
20
+ /** Callback with base64 PCM audio chunk */
21
+ onAudioChunk: (base64Audio: string) => void;
22
+ onError?: (error: string) => void;
23
+ onPermissionDenied?: () => void;
24
+ }
25
+
26
+ type RecordingStatus = 'idle' | 'recording' | 'paused';
27
+
28
+ // ─── Service ───────────────────────────────────────────────────
29
+
30
+ export class AudioInputService {
31
+ private config: AudioInputConfig;
32
+ private status: RecordingStatus = 'idle';
33
+ private recorder: any = null;
34
+
35
+ constructor(config: AudioInputConfig) {
36
+ this.config = config;
37
+ }
38
+
39
+ // ─── Lifecycle ─────────────────────────────────────────────
40
+
41
+ async start(): Promise<boolean> {
42
+ try {
43
+ // Lazy-load react-native-audio-api (optional peer dependency)
44
+ let audioApi: any;
45
+ try {
46
+ audioApi = require('react-native-audio-api');
47
+ } catch {
48
+ const msg =
49
+ 'Voice mode requires react-native-audio-api. Install with: npm install react-native-audio-api';
50
+ logger.error('AudioInput', msg);
51
+ this.config.onError?.(msg);
52
+ return false;
53
+ }
54
+
55
+ // Request mic permission (Android)
56
+ try {
57
+ const { PermissionsAndroid, Platform } = require('react-native');
58
+ if (Platform.OS === 'android') {
59
+ const result = await PermissionsAndroid.request(
60
+ PermissionsAndroid.PERMISSIONS.RECORD_AUDIO
61
+ );
62
+ if (result !== PermissionsAndroid.RESULTS.GRANTED) {
63
+ logger.warn('AudioInput', 'Microphone permission denied');
64
+ this.config.onPermissionDenied?.();
65
+ return false;
66
+ }
67
+ }
68
+ } catch {
69
+ // Permission check failed — continue and let native layer handle it
70
+ }
71
+
72
+ // Create AudioRecorder
73
+ this.recorder = new audioApi.AudioRecorder();
74
+
75
+ const sampleRate = this.config.sampleRate || 16000;
76
+ const bufferLength = this.config.bufferLength || 4096;
77
+
78
+ // Register audio data callback
79
+ let frameCount = 0;
80
+ this.recorder.onAudioReady(
81
+ { sampleRate, bufferLength, channelCount: 1 },
82
+ (event: any) => {
83
+ frameCount++;
84
+ try {
85
+ // event.buffer is an AudioBuffer — get Float32 channel data
86
+ const float32Data = event.buffer.getChannelData(0);
87
+ // Convert Float32 → Int16 → base64 for Gemini
88
+ const base64Chunk = float32ToInt16Base64(float32Data);
89
+ logger.debug('AudioInput', `🎤 Frame #${frameCount}: size=${base64Chunk.length}`);
90
+ this.config.onAudioChunk(base64Chunk);
91
+ } catch (err: any) {
92
+ logger.error('AudioInput', `Frame processing error: ${err.message}`);
93
+ }
94
+ }
95
+ );
96
+
97
+ // Register error callback
98
+ this.recorder.onError((error: any) => {
99
+ logger.error('AudioInput', `Recorder error: ${error.message || error}`);
100
+ this.config.onError?.(error.message || String(error));
101
+ });
102
+
103
+ // Start recording
104
+ this.recorder.start();
105
+ this.status = 'recording';
106
+ logger.info('AudioInput', `Streaming started (${sampleRate}Hz, bufLen=${bufferLength})`);
107
+ return true;
108
+ } catch (error: any) {
109
+ logger.error('AudioInput', `Failed to start: ${error.message}`);
110
+ this.config.onError?.(error.message);
111
+ return false;
112
+ }
113
+ }
114
+
115
+ async stop(): Promise<void> {
116
+ try {
117
+ if (this.recorder && this.status !== 'idle') {
118
+ this.recorder.clearOnAudioReady();
119
+ this.recorder.clearOnError();
120
+ this.recorder.stop();
121
+ }
122
+ this.recorder = null;
123
+ this.status = 'idle';
124
+ logger.info('AudioInput', 'Streaming stopped');
125
+ } catch (error: any) {
126
+ logger.error('AudioInput', `Failed to stop: ${error.message}`);
127
+ this.recorder = null;
128
+ this.status = 'idle';
129
+ }
130
+ }
131
+
132
+ // ─── Status ───────────────────────────────────────────────
133
+
134
+ get isRecording(): boolean {
135
+ return this.status === 'recording';
136
+ }
137
+
138
+ get currentStatus(): RecordingStatus {
139
+ return this.status;
140
+ }
141
+ }
@@ -0,0 +1,167 @@
1
+ /**
2
+ * AudioOutputService — AI speech playback for voice mode.
3
+ *
4
+ * Uses react-native-audio-api (Software Mansion) for gapless, low-latency
5
+ * PCM playback. Decodes base64 PCM from Gemini Live API and queues it via
6
+ * AudioBufferQueueSourceNode for seamless streaming.
7
+ *
8
+ * Requires: react-native-audio-api (development build only, not Expo Go)
9
+ */
10
+
11
+ import { logger } from '../utils/logger';
12
+ import { base64ToFloat32 } from '../utils/audioUtils';
13
+
14
+ // ─── Types ─────────────────────────────────────────────────────
15
+
16
+ /** Gemini Live API outputs 24kHz 16-bit mono PCM */
17
+ const GEMINI_OUTPUT_SAMPLE_RATE = 24000;
18
+
19
+ export interface AudioOutputConfig {
20
+ sampleRate?: number;
21
+ onPlaybackStart?: () => void;
22
+ onPlaybackEnd?: () => void;
23
+ onError?: (error: string) => void;
24
+ }
25
+
26
+ // ─── Service ───────────────────────────────────────────────────
27
+
28
+ export class AudioOutputService {
29
+ private config: AudioOutputConfig;
30
+ private audioContext: any = null;
31
+ private queueSourceNode: any = null;
32
+ private gainNode: any = null;
33
+ private muted = false;
34
+ private isStarted = false;
35
+ private chunkCount = 0;
36
+
37
+ constructor(config: AudioOutputConfig = {}) {
38
+ this.config = config;
39
+ }
40
+
41
+ // ─── Lifecycle ─────────────────────────────────────────────
42
+
43
+ async initialize(): Promise<boolean> {
44
+ try {
45
+ let audioApi: any;
46
+ try {
47
+ audioApi = require('react-native-audio-api');
48
+ } catch {
49
+ const msg =
50
+ 'react-native-audio-api is required for audio output. Install with: npm install react-native-audio-api';
51
+ logger.error('AudioOutput', msg);
52
+ this.config.onError?.(msg);
53
+ return false;
54
+ }
55
+
56
+ const sampleRate = this.config.sampleRate || GEMINI_OUTPUT_SAMPLE_RATE;
57
+
58
+ // Create AudioContext at Gemini's output sample rate
59
+ this.audioContext = new audioApi.AudioContext({ sampleRate });
60
+
61
+ // Create GainNode for mute control
62
+ this.gainNode = this.audioContext.createGain();
63
+ this.gainNode.gain.value = 1.0;
64
+ this.gainNode.connect(this.audioContext.destination);
65
+
66
+ // Create AudioBufferQueueSourceNode for gapless streaming
67
+ this.queueSourceNode = this.audioContext.createBufferQueueSource();
68
+ this.queueSourceNode.connect(this.gainNode);
69
+
70
+ logger.info('AudioOutput', `Initialized (${sampleRate}Hz, AudioBufferQueueSourceNode)`);
71
+ return true;
72
+ } catch (error: any) {
73
+ logger.error('AudioOutput', `Failed to initialize: ${error.message}`);
74
+ this.config.onError?.(error.message);
75
+ return false;
76
+ }
77
+ }
78
+
79
+ // ─── Enqueue Audio ─────────────────────────────────────────
80
+
81
+ /** Add a base64-encoded PCM chunk from Gemini to the playback queue */
82
+ enqueue(base64Audio: string): void {
83
+ if (this.muted || !this.audioContext || !this.queueSourceNode) return;
84
+
85
+ try {
86
+ this.chunkCount++;
87
+
88
+ // Decode base64 Int16 PCM → Float32
89
+ const float32Data = base64ToFloat32(base64Audio);
90
+ const sampleRate = this.config.sampleRate || GEMINI_OUTPUT_SAMPLE_RATE;
91
+
92
+ // Create an AudioBuffer and fill it with PCM data
93
+ const audioBuffer = this.audioContext.createBuffer(1, float32Data.length, sampleRate);
94
+ audioBuffer.copyToChannel(float32Data, 0);
95
+
96
+ // Enqueue the buffer for gapless playback
97
+ this.queueSourceNode.enqueueBuffer(audioBuffer);
98
+
99
+ // Start playback on first enqueue
100
+ if (!this.isStarted) {
101
+ this.queueSourceNode.start();
102
+ this.isStarted = true;
103
+ this.config.onPlaybackStart?.();
104
+ logger.info('AudioOutput', '▶️ Playback started');
105
+ }
106
+
107
+ if (this.chunkCount % 20 === 0) {
108
+ logger.debug('AudioOutput', `Queued chunk #${this.chunkCount}`);
109
+ }
110
+ } catch (error: any) {
111
+ logger.error('AudioOutput', `Enqueue error: ${error.message}`);
112
+ }
113
+ }
114
+
115
+ // ─── Mute/Unmute ──────────────────────────────────────────
116
+
117
+ mute(): void {
118
+ this.muted = true;
119
+ if (this.gainNode) {
120
+ this.gainNode.gain.value = 0;
121
+ }
122
+ logger.info('AudioOutput', 'Speaker muted');
123
+ }
124
+
125
+ unmute(): void {
126
+ this.muted = false;
127
+ if (this.gainNode) {
128
+ this.gainNode.gain.value = 1.0;
129
+ }
130
+ logger.info('AudioOutput', 'Speaker unmuted');
131
+ }
132
+
133
+ get isMuted(): boolean {
134
+ return this.muted;
135
+ }
136
+
137
+ // ─── Stop & Cleanup ───────────────────────────────────────
138
+
139
+ async stop(): Promise<void> {
140
+ try {
141
+ if (this.queueSourceNode && this.isStarted) {
142
+ this.queueSourceNode.stop();
143
+ this.queueSourceNode.clearBuffers();
144
+ }
145
+ this.isStarted = false;
146
+ this.chunkCount = 0;
147
+ this.config.onPlaybackEnd?.();
148
+ logger.info('AudioOutput', 'Playback stopped');
149
+ } catch (error: any) {
150
+ logger.error('AudioOutput', `Stop error: ${error.message}`);
151
+ }
152
+ }
153
+
154
+ async cleanup(): Promise<void> {
155
+ await this.stop();
156
+ try {
157
+ if (this.audioContext) {
158
+ await this.audioContext.close();
159
+ }
160
+ } catch {
161
+ // Non-critical
162
+ }
163
+ this.audioContext = null;
164
+ this.queueSourceNode = null;
165
+ this.gainNode = null;
166
+ }
167
+ }