@mobileai/react-native 0.4.2 → 0.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. package/README.md +23 -3
  2. package/lib/module/components/AIAgent.js +216 -5
  3. package/lib/module/components/AIAgent.js.map +1 -1
  4. package/lib/module/components/AgentChatBar.js +358 -36
  5. package/lib/module/components/AgentChatBar.js.map +1 -1
  6. package/lib/module/core/AgentRuntime.js +122 -6
  7. package/lib/module/core/AgentRuntime.js.map +1 -1
  8. package/lib/module/core/systemPrompt.js +57 -0
  9. package/lib/module/core/systemPrompt.js.map +1 -1
  10. package/lib/module/index.js +8 -0
  11. package/lib/module/index.js.map +1 -1
  12. package/lib/module/providers/GeminiProvider.js +108 -85
  13. package/lib/module/providers/GeminiProvider.js.map +1 -1
  14. package/lib/module/services/AudioInputService.js +128 -0
  15. package/lib/module/services/AudioInputService.js.map +1 -0
  16. package/lib/module/services/AudioOutputService.js +154 -0
  17. package/lib/module/services/AudioOutputService.js.map +1 -0
  18. package/lib/module/services/VoiceService.js +361 -0
  19. package/lib/module/services/VoiceService.js.map +1 -0
  20. package/lib/module/utils/audioUtils.js +49 -0
  21. package/lib/module/utils/audioUtils.js.map +1 -0
  22. package/lib/module/utils/logger.js +21 -4
  23. package/lib/module/utils/logger.js.map +1 -1
  24. package/lib/typescript/babel.config.d.ts +10 -0
  25. package/lib/typescript/babel.config.d.ts.map +1 -0
  26. package/lib/typescript/eslint.config.d.mts +3 -0
  27. package/lib/typescript/eslint.config.d.mts.map +1 -0
  28. package/lib/typescript/fetch-models.d.mts +2 -0
  29. package/lib/typescript/fetch-models.d.mts.map +1 -0
  30. package/lib/typescript/list-all-models.d.mts +2 -0
  31. package/lib/typescript/list-all-models.d.mts.map +1 -0
  32. package/lib/typescript/list-models.d.mts +2 -0
  33. package/lib/typescript/list-models.d.mts.map +1 -0
  34. package/lib/typescript/src/components/AIAgent.d.ts +8 -2
  35. package/lib/typescript/src/components/AIAgent.d.ts.map +1 -1
  36. package/lib/typescript/src/components/AgentChatBar.d.ts +19 -2
  37. package/lib/typescript/src/components/AgentChatBar.d.ts.map +1 -1
  38. package/lib/typescript/src/core/AgentRuntime.d.ts +17 -1
  39. package/lib/typescript/src/core/AgentRuntime.d.ts.map +1 -1
  40. package/lib/typescript/src/core/systemPrompt.d.ts +8 -0
  41. package/lib/typescript/src/core/systemPrompt.d.ts.map +1 -1
  42. package/lib/typescript/src/core/types.d.ts +24 -1
  43. package/lib/typescript/src/core/types.d.ts.map +1 -1
  44. package/lib/typescript/src/index.d.ts +6 -1
  45. package/lib/typescript/src/index.d.ts.map +1 -1
  46. package/lib/typescript/src/providers/GeminiProvider.d.ts +22 -18
  47. package/lib/typescript/src/providers/GeminiProvider.d.ts.map +1 -1
  48. package/lib/typescript/src/services/AudioInputService.d.ts +31 -0
  49. package/lib/typescript/src/services/AudioInputService.d.ts.map +1 -0
  50. package/lib/typescript/src/services/AudioOutputService.d.ts +34 -0
  51. package/lib/typescript/src/services/AudioOutputService.d.ts.map +1 -0
  52. package/lib/typescript/src/services/VoiceService.d.ts +73 -0
  53. package/lib/typescript/src/services/VoiceService.d.ts.map +1 -0
  54. package/lib/typescript/src/utils/audioUtils.d.ts +17 -0
  55. package/lib/typescript/src/utils/audioUtils.d.ts.map +1 -0
  56. package/lib/typescript/src/utils/logger.d.ts +4 -0
  57. package/lib/typescript/src/utils/logger.d.ts.map +1 -1
  58. package/package.json +24 -8
  59. package/src/components/AIAgent.tsx +222 -3
  60. package/src/components/AgentChatBar.tsx +487 -42
  61. package/src/core/AgentRuntime.ts +131 -2
  62. package/src/core/systemPrompt.ts +62 -0
  63. package/src/core/types.ts +30 -0
  64. package/src/index.ts +16 -0
  65. package/src/providers/GeminiProvider.ts +105 -89
  66. package/src/services/AudioInputService.ts +141 -0
  67. package/src/services/AudioOutputService.ts +167 -0
  68. package/src/services/VoiceService.ts +407 -0
  69. package/src/utils/audioUtils.ts +54 -0
  70. package/src/utils/logger.ts +24 -7
@@ -22,11 +22,15 @@ import { AgentContext } from '../hooks/useAction';
22
22
  import { AgentChatBar } from './AgentChatBar';
23
23
  import { AgentOverlay } from './AgentOverlay';
24
24
  import { logger } from '../utils/logger';
25
+ import { buildVoiceSystemPrompt } from '../core/systemPrompt';
25
26
  import { MCPBridge } from '../core/MCPBridge';
26
- import type { AgentConfig, ExecutionResult, ToolDefinition, AgentStep } from '../core/types';
27
+ import { VoiceService } from '../services/VoiceService';
28
+ import { AudioInputService } from '../services/AudioInputService';
29
+ import { AudioOutputService } from '../services/AudioOutputService';
30
+ import type { AgentConfig, AgentMode, ExecutionResult, ToolDefinition, AgentStep, TokenUsage } from '../core/types';
27
31
 
28
32
  // ─── Context ───────────────────────────────────────────────────
29
-
33
+ console.log('🚀 AIAgent.tsx MODULE LOADED');
30
34
 
31
35
  // ─── Props ─────────────────────────────────────────────────────
32
36
 
@@ -83,6 +87,12 @@ interface AIAgentProps {
83
87
  };
84
88
  /** Expo Router pathname (from usePathname()) */
85
89
  pathname?: string;
90
+ /** Enable voice mode (requires expo-av) */
91
+ enableVoice?: boolean;
92
+ /** Called after each step with token usage data */
93
+ onTokenUsage?: (usage: TokenUsage) => void;
94
+ /** Enable SDK debug logging (disabled by default) */
95
+ debug?: boolean;
86
96
  }
87
97
 
88
98
  // ─── Component ─────────────────────────────────────────────────
@@ -110,12 +120,43 @@ export function AIAgent({
110
120
  mcpServerUrl,
111
121
  router,
112
122
  pathname,
123
+ enableVoice = false,
124
+ onTokenUsage,
125
+ debug = false,
113
126
  }: AIAgentProps) {
127
+ // Configure logger based on debug prop
128
+ React.useEffect(() => {
129
+ console.log('[AIAgent] DEBUG PROP =', debug, '— enabling logger');
130
+ logger.setEnabled(debug);
131
+ if (debug) {
132
+ logger.info('AIAgent', '🔧 Debug logging enabled');
133
+ }
134
+ }, [debug]);
135
+
114
136
  const rootViewRef = useRef<any>(null);
115
137
  const [isThinking, setIsThinking] = useState(false);
116
138
  const [statusText, setStatusText] = useState('');
117
139
  const [lastResult, setLastResult] = useState<ExecutionResult | null>(null);
118
140
 
141
+ // ─── Voice/Live Mode State ──────────────────────────────────
142
+ const [mode, setMode] = useState<AgentMode>('text');
143
+ const [isMicActive, setIsMicActive] = useState(false);
144
+ const [isSpeakerMuted, setIsSpeakerMuted] = useState(false);
145
+ const [isAISpeaking, setIsAISpeaking] = useState(false);
146
+ const [isVoiceConnected, setIsVoiceConnected] = useState(false);
147
+
148
+ const voiceServiceRef = useRef<VoiceService | null>(null);
149
+ const audioInputRef = useRef<AudioInputService | null>(null);
150
+ const audioOutputRef = useRef<AudioOutputService | null>(null);
151
+
152
+ // Compute available modes from props
153
+ const availableModes: AgentMode[] = useMemo(() => {
154
+ const modes: AgentMode[] = ['text'];
155
+ if (enableVoice) modes.push('voice');
156
+ logger.info('AIAgent', `Available modes: ${modes.join(', ')}`);
157
+ return modes;
158
+ }, [enableVoice]);
159
+
119
160
  // Ref-based resolver for ask_user — stays alive across renders
120
161
  const askUserResolverRef = useRef<((answer: string) => void) | null>(null);
121
162
 
@@ -140,6 +181,7 @@ export function AIAgent({
140
181
  router,
141
182
  pathname,
142
183
  onStatusUpdate: setStatusText,
184
+ onTokenUsage,
143
185
  // Page-agent pattern: block the agent loop until user responds
144
186
  onAskUser: (question: string) => {
145
187
  return new Promise<string>((resolve) => {
@@ -155,7 +197,7 @@ export function AIAgent({
155
197
  interactiveBlacklist, interactiveWhitelist,
156
198
  onBeforeStep, onAfterStep, onBeforeTask, onAfterTask,
157
199
  transformScreenContent, customTools, instructions, stepDelay,
158
- mcpServerUrl, router, pathname,
200
+ mcpServerUrl, router, pathname, onTokenUsage,
159
201
  ]);
160
202
 
161
203
  const provider = useMemo(() => new GeminiProvider(apiKey, model), [apiKey, model]);
@@ -184,6 +226,144 @@ export function AIAgent({
184
226
  };
185
227
  }, [mcpServerUrl, runtime]);
186
228
 
229
+ // ─── Voice/Live Service Initialization ──────────────────────
230
+
231
+ // Initialize voice services when mode changes to voice or live
232
+ useEffect(() => {
233
+ if (mode === 'text') {
234
+ logger.info('AIAgent', 'Text mode — skipping voice service init');
235
+ return;
236
+ }
237
+
238
+ logger.info('AIAgent', `Mode changed to "${mode}" — initializing voice services...`);
239
+
240
+ // Create VoiceService with runtime's built-in tools (navigate, tap, type, done, etc.)
241
+ if (!voiceServiceRef.current) {
242
+ logger.info('AIAgent', 'Creating VoiceService...');
243
+ const runtimeTools = runtime.getTools();
244
+ logger.info('AIAgent', `Registering ${runtimeTools.length} tools with VoiceService: ${runtimeTools.map(t => t.name).join(', ')}`);
245
+ // Build the full voice system prompt (screen format + tool descriptions + guardrails)
246
+ // This gives voice mode the same screen understanding as text mode
247
+ const voicePrompt = buildVoiceSystemPrompt(language, instructions?.system);
248
+ voiceServiceRef.current = new VoiceService({
249
+ apiKey,
250
+ systemPrompt: voicePrompt,
251
+ tools: runtimeTools,
252
+ language,
253
+ });
254
+ logger.info('AIAgent', 'VoiceService created with full voice system prompt and tools');
255
+ }
256
+
257
+ // Create AudioOutputService if not exists
258
+ if (!audioOutputRef.current) {
259
+ logger.info('AIAgent', 'Creating AudioOutputService...');
260
+ audioOutputRef.current = new AudioOutputService({
261
+ onError: (err) => logger.error('AIAgent', `AudioOutput error: ${err}`),
262
+ });
263
+ audioOutputRef.current.initialize().then((ok) => {
264
+ logger.info('AIAgent', `AudioOutputService initialized: ${ok}`);
265
+ });
266
+ }
267
+
268
+ // Create AudioInputService if not exists
269
+ if (!audioInputRef.current) {
270
+ logger.info('AIAgent', 'Creating AudioInputService...');
271
+ audioInputRef.current = new AudioInputService({
272
+ // Default 16kHz — Gemini Live API input standard
273
+ onAudioChunk: (chunk) => {
274
+ logger.debug('AIAgent', `Mic chunk: ${chunk.length} chars`);
275
+ voiceServiceRef.current?.sendAudio(chunk);
276
+ },
277
+ onError: (err) => logger.error('AIAgent', `AudioInput error: ${err}`),
278
+ onPermissionDenied: () => logger.warn('AIAgent', 'Mic permission denied by user'),
279
+ });
280
+ }
281
+
282
+ // Connect VoiceService
283
+ logger.info('AIAgent', 'Connecting VoiceService...');
284
+ voiceServiceRef.current.connect({
285
+ onAudioResponse: (audio) => {
286
+ logger.info('AIAgent', `Received audio response (${audio.length} chars)`);
287
+ setIsAISpeaking(true);
288
+ audioOutputRef.current?.enqueue(audio);
289
+ },
290
+ onStatusChange: (status) => {
291
+ logger.info('AIAgent', `Voice status: ${status}`);
292
+ const connected = status === 'connected';
293
+ setIsVoiceConnected(connected);
294
+ if (connected) {
295
+ logger.info('AIAgent', '✅ VoiceService connected — auto-starting mic...');
296
+ // Auto-start mic streaming once WebSocket is ready
297
+ audioInputRef.current?.start().then((ok) => {
298
+ if (ok) {
299
+ setIsMicActive(true);
300
+ logger.info('AIAgent', '🎙️ Mic auto-started after connection');
301
+ }
302
+ });
303
+ // Send initial screen context (tree) so the model knows what's on screen
304
+ const initialContext = runtime.getScreenContext();
305
+ voiceServiceRef.current?.sendScreenContext(initialContext);
306
+ logger.info('AIAgent', '📡 Initial screen context sent to voice model');
307
+ }
308
+ },
309
+ onTranscript: (text, isFinal, role) => {
310
+ logger.info('AIAgent', `Transcript [${role}] (final=${isFinal}): "${text}"`);
311
+ },
312
+ onToolCall: async (toolCall) => {
313
+ logger.info('AIAgent', `Voice tool call: ${toolCall.name}(${JSON.stringify(toolCall.args)})`);
314
+ // Execute the tool via AgentRuntime and send result back to Gemini
315
+ const result = await runtime.executeTool(toolCall.name, toolCall.args);
316
+ logger.info('AIAgent', `Voice tool result: ${result}`);
317
+ voiceServiceRef.current?.sendFunctionResponse(toolCall.name, toolCall.id, { result });
318
+
319
+ // After tool execution, push updated screen context
320
+ // (the screen may have changed from tap/type/navigate)
321
+ const updatedContext = runtime.getScreenContext();
322
+ voiceServiceRef.current?.sendScreenContext(updatedContext);
323
+ logger.info('AIAgent', '📡 Updated screen context sent after tool call');
324
+ },
325
+ onError: (err) => {
326
+ logger.error('AIAgent', `VoiceService error: ${err}`);
327
+ },
328
+ onTurnComplete: () => {
329
+ logger.info('AIAgent', 'AI turn complete');
330
+ setIsAISpeaking(false);
331
+ },
332
+ });
333
+
334
+ // Cleanup on mode change back to text
335
+ return () => {
336
+ logger.info('AIAgent', `Cleaning up voice services (leaving "${mode}" mode)`);
337
+ voiceServiceRef.current?.disconnect();
338
+ voiceServiceRef.current = null; // Ensure fresh instance on next connect
339
+ audioInputRef.current?.stop();
340
+ setIsMicActive(false);
341
+ setIsAISpeaking(false);
342
+ setIsVoiceConnected(false);
343
+ };
344
+ // eslint-disable-next-line react-hooks/exhaustive-deps
345
+ }, [mode, apiKey, runtime, language, instructions]);
346
+
347
+ // ─── Stop Voice Session (full cleanup) ─────────────────────
348
+
349
+ const stopVoiceSession = useCallback(() => {
350
+ logger.info('AIAgent', '🛑 Stopping voice session (full cleanup)...');
351
+ // 1. Stop mic input
352
+ audioInputRef.current?.stop();
353
+ // 2. Stop audio output (clear queued chunks)
354
+ audioOutputRef.current?.stop();
355
+ // 3. Disconnect WebSocket
356
+ voiceServiceRef.current?.disconnect();
357
+ voiceServiceRef.current = null;
358
+ // 4. Reset state
359
+ setIsMicActive(false);
360
+ setIsAISpeaking(false);
361
+ setIsVoiceConnected(false);
362
+ // 5. Switch back to text mode (triggers cleanup effect naturally)
363
+ setMode('text');
364
+ logger.info('AIAgent', '🛑 Voice session fully stopped');
365
+ }, [runtime]);
366
+
187
367
  // ─── Execute ──────────────────────────────────────────────────
188
368
 
189
369
  const handleSend = useCallback(async (message: string) => {
@@ -249,12 +429,51 @@ export function AIAgent({
249
429
  lastResult={lastResult}
250
430
  language={language}
251
431
  onDismiss={() => setLastResult(null)}
432
+ availableModes={availableModes}
433
+ mode={mode}
434
+ onModeChange={(newMode) => {
435
+ logger.info('AIAgent', `Mode change: ${mode} → ${newMode}`);
436
+ setMode(newMode);
437
+ }}
438
+ isMicActive={isMicActive}
439
+ isSpeakerMuted={isSpeakerMuted}
440
+ isAISpeaking={isAISpeaking}
441
+ onStopSession={stopVoiceSession}
442
+ isVoiceConnected={isVoiceConnected}
443
+ onMicToggle={(active) => {
444
+ if (active && !isVoiceConnected) {
445
+ logger.warn('AIAgent', 'Cannot toggle mic — VoiceService not connected yet');
446
+ return;
447
+ }
448
+ logger.info('AIAgent', `Mic toggle: ${active ? 'ON' : 'OFF'}`);
449
+ setIsMicActive(active);
450
+ if (active) {
451
+ logger.info('AIAgent', 'Starting AudioInput...');
452
+ audioInputRef.current?.start().then((ok) => {
453
+ logger.info('AIAgent', `AudioInput start result: ${ok}`);
454
+ });
455
+ } else {
456
+ logger.info('AIAgent', 'Stopping AudioInput...');
457
+ audioInputRef.current?.stop();
458
+ }
459
+ }}
460
+ onSpeakerToggle={(muted) => {
461
+ logger.info('AIAgent', `Speaker toggle: ${muted ? 'MUTED' : 'UNMUTED'}`);
462
+ setIsSpeakerMuted(muted);
463
+ if (muted) {
464
+ audioOutputRef.current?.mute();
465
+ } else {
466
+ audioOutputRef.current?.unmute();
467
+ }
468
+ }}
469
+
252
470
  />
253
471
  )}
254
472
  </AgentContext.Provider>
255
473
  );
256
474
  }
257
475
 
476
+
258
477
  const styles = StyleSheet.create({
259
478
  root: {
260
479
  flex: 1,