@mobileai/react-native 0.4.1 → 0.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +25 -34
- package/lib/module/components/AIAgent.js +216 -5
- package/lib/module/components/AIAgent.js.map +1 -1
- package/lib/module/components/AgentChatBar.js +358 -36
- package/lib/module/components/AgentChatBar.js.map +1 -1
- package/lib/module/core/AgentRuntime.js +122 -6
- package/lib/module/core/AgentRuntime.js.map +1 -1
- package/lib/module/core/systemPrompt.js +57 -0
- package/lib/module/core/systemPrompt.js.map +1 -1
- package/lib/module/index.js +8 -0
- package/lib/module/index.js.map +1 -1
- package/lib/module/providers/GeminiProvider.js +108 -85
- package/lib/module/providers/GeminiProvider.js.map +1 -1
- package/lib/module/services/AudioInputService.js +128 -0
- package/lib/module/services/AudioInputService.js.map +1 -0
- package/lib/module/services/AudioOutputService.js +154 -0
- package/lib/module/services/AudioOutputService.js.map +1 -0
- package/lib/module/services/VoiceService.js +362 -0
- package/lib/module/services/VoiceService.js.map +1 -0
- package/lib/module/utils/audioUtils.js +49 -0
- package/lib/module/utils/audioUtils.js.map +1 -0
- package/lib/module/utils/logger.js +21 -4
- package/lib/module/utils/logger.js.map +1 -1
- package/lib/typescript/babel.config.d.ts +10 -0
- package/lib/typescript/babel.config.d.ts.map +1 -0
- package/lib/typescript/eslint.config.d.mts +3 -0
- package/lib/typescript/eslint.config.d.mts.map +1 -0
- package/lib/typescript/fetch-models.d.mts +2 -0
- package/lib/typescript/fetch-models.d.mts.map +1 -0
- package/lib/typescript/list-all-models.d.mts +2 -0
- package/lib/typescript/list-all-models.d.mts.map +1 -0
- package/lib/typescript/list-models.d.mts +2 -0
- package/lib/typescript/list-models.d.mts.map +1 -0
- package/lib/typescript/src/components/AIAgent.d.ts +8 -2
- package/lib/typescript/src/components/AIAgent.d.ts.map +1 -1
- package/lib/typescript/src/components/AgentChatBar.d.ts +19 -2
- package/lib/typescript/src/components/AgentChatBar.d.ts.map +1 -1
- package/lib/typescript/src/core/AgentRuntime.d.ts +17 -1
- package/lib/typescript/src/core/AgentRuntime.d.ts.map +1 -1
- package/lib/typescript/src/core/systemPrompt.d.ts +8 -0
- package/lib/typescript/src/core/systemPrompt.d.ts.map +1 -1
- package/lib/typescript/src/core/types.d.ts +24 -1
- package/lib/typescript/src/core/types.d.ts.map +1 -1
- package/lib/typescript/src/index.d.ts +6 -1
- package/lib/typescript/src/index.d.ts.map +1 -1
- package/lib/typescript/src/providers/GeminiProvider.d.ts +22 -18
- package/lib/typescript/src/providers/GeminiProvider.d.ts.map +1 -1
- package/lib/typescript/src/services/AudioInputService.d.ts +31 -0
- package/lib/typescript/src/services/AudioInputService.d.ts.map +1 -0
- package/lib/typescript/src/services/AudioOutputService.d.ts +34 -0
- package/lib/typescript/src/services/AudioOutputService.d.ts.map +1 -0
- package/lib/typescript/src/services/VoiceService.d.ts +73 -0
- package/lib/typescript/src/services/VoiceService.d.ts.map +1 -0
- package/lib/typescript/src/utils/audioUtils.d.ts +17 -0
- package/lib/typescript/src/utils/audioUtils.d.ts.map +1 -0
- package/lib/typescript/src/utils/logger.d.ts +4 -0
- package/lib/typescript/src/utils/logger.d.ts.map +1 -1
- package/package.json +24 -8
- package/src/components/AIAgent.tsx +222 -3
- package/src/components/AgentChatBar.tsx +487 -42
- package/src/core/AgentRuntime.ts +131 -2
- package/src/core/systemPrompt.ts +62 -0
- package/src/core/types.ts +30 -0
- package/src/index.ts +16 -0
- package/src/providers/GeminiProvider.ts +105 -89
- package/src/services/AudioInputService.ts +141 -0
- package/src/services/AudioOutputService.ts +167 -0
- package/src/services/VoiceService.ts +409 -0
- package/src/utils/audioUtils.ts +54 -0
- package/src/utils/logger.ts +24 -7
|
@@ -22,11 +22,15 @@ import { AgentContext } from '../hooks/useAction';
|
|
|
22
22
|
import { AgentChatBar } from './AgentChatBar';
|
|
23
23
|
import { AgentOverlay } from './AgentOverlay';
|
|
24
24
|
import { logger } from '../utils/logger';
|
|
25
|
+
import { buildVoiceSystemPrompt } from '../core/systemPrompt';
|
|
25
26
|
import { MCPBridge } from '../core/MCPBridge';
|
|
26
|
-
import
|
|
27
|
+
import { VoiceService } from '../services/VoiceService';
|
|
28
|
+
import { AudioInputService } from '../services/AudioInputService';
|
|
29
|
+
import { AudioOutputService } from '../services/AudioOutputService';
|
|
30
|
+
import type { AgentConfig, AgentMode, ExecutionResult, ToolDefinition, AgentStep, TokenUsage } from '../core/types';
|
|
27
31
|
|
|
28
32
|
// ─── Context ───────────────────────────────────────────────────
|
|
29
|
-
|
|
33
|
+
console.log('🚀 AIAgent.tsx MODULE LOADED');
|
|
30
34
|
|
|
31
35
|
// ─── Props ─────────────────────────────────────────────────────
|
|
32
36
|
|
|
@@ -83,6 +87,12 @@ interface AIAgentProps {
|
|
|
83
87
|
};
|
|
84
88
|
/** Expo Router pathname (from usePathname()) */
|
|
85
89
|
pathname?: string;
|
|
90
|
+
/** Enable voice mode (requires expo-av) */
|
|
91
|
+
enableVoice?: boolean;
|
|
92
|
+
/** Called after each step with token usage data */
|
|
93
|
+
onTokenUsage?: (usage: TokenUsage) => void;
|
|
94
|
+
/** Enable SDK debug logging (disabled by default) */
|
|
95
|
+
debug?: boolean;
|
|
86
96
|
}
|
|
87
97
|
|
|
88
98
|
// ─── Component ─────────────────────────────────────────────────
|
|
@@ -110,12 +120,43 @@ export function AIAgent({
|
|
|
110
120
|
mcpServerUrl,
|
|
111
121
|
router,
|
|
112
122
|
pathname,
|
|
123
|
+
enableVoice = false,
|
|
124
|
+
onTokenUsage,
|
|
125
|
+
debug = false,
|
|
113
126
|
}: AIAgentProps) {
|
|
127
|
+
// Configure logger based on debug prop
|
|
128
|
+
React.useEffect(() => {
|
|
129
|
+
console.log('[AIAgent] DEBUG PROP =', debug, '— enabling logger');
|
|
130
|
+
logger.setEnabled(debug);
|
|
131
|
+
if (debug) {
|
|
132
|
+
logger.info('AIAgent', '🔧 Debug logging enabled');
|
|
133
|
+
}
|
|
134
|
+
}, [debug]);
|
|
135
|
+
|
|
114
136
|
const rootViewRef = useRef<any>(null);
|
|
115
137
|
const [isThinking, setIsThinking] = useState(false);
|
|
116
138
|
const [statusText, setStatusText] = useState('');
|
|
117
139
|
const [lastResult, setLastResult] = useState<ExecutionResult | null>(null);
|
|
118
140
|
|
|
141
|
+
// ─── Voice/Live Mode State ──────────────────────────────────
|
|
142
|
+
const [mode, setMode] = useState<AgentMode>('text');
|
|
143
|
+
const [isMicActive, setIsMicActive] = useState(false);
|
|
144
|
+
const [isSpeakerMuted, setIsSpeakerMuted] = useState(false);
|
|
145
|
+
const [isAISpeaking, setIsAISpeaking] = useState(false);
|
|
146
|
+
const [isVoiceConnected, setIsVoiceConnected] = useState(false);
|
|
147
|
+
|
|
148
|
+
const voiceServiceRef = useRef<VoiceService | null>(null);
|
|
149
|
+
const audioInputRef = useRef<AudioInputService | null>(null);
|
|
150
|
+
const audioOutputRef = useRef<AudioOutputService | null>(null);
|
|
151
|
+
|
|
152
|
+
// Compute available modes from props
|
|
153
|
+
const availableModes: AgentMode[] = useMemo(() => {
|
|
154
|
+
const modes: AgentMode[] = ['text'];
|
|
155
|
+
if (enableVoice) modes.push('voice');
|
|
156
|
+
logger.info('AIAgent', `Available modes: ${modes.join(', ')}`);
|
|
157
|
+
return modes;
|
|
158
|
+
}, [enableVoice]);
|
|
159
|
+
|
|
119
160
|
// Ref-based resolver for ask_user — stays alive across renders
|
|
120
161
|
const askUserResolverRef = useRef<((answer: string) => void) | null>(null);
|
|
121
162
|
|
|
@@ -140,6 +181,7 @@ export function AIAgent({
|
|
|
140
181
|
router,
|
|
141
182
|
pathname,
|
|
142
183
|
onStatusUpdate: setStatusText,
|
|
184
|
+
onTokenUsage,
|
|
143
185
|
// Page-agent pattern: block the agent loop until user responds
|
|
144
186
|
onAskUser: (question: string) => {
|
|
145
187
|
return new Promise<string>((resolve) => {
|
|
@@ -155,7 +197,7 @@ export function AIAgent({
|
|
|
155
197
|
interactiveBlacklist, interactiveWhitelist,
|
|
156
198
|
onBeforeStep, onAfterStep, onBeforeTask, onAfterTask,
|
|
157
199
|
transformScreenContent, customTools, instructions, stepDelay,
|
|
158
|
-
mcpServerUrl, router, pathname,
|
|
200
|
+
mcpServerUrl, router, pathname, onTokenUsage,
|
|
159
201
|
]);
|
|
160
202
|
|
|
161
203
|
const provider = useMemo(() => new GeminiProvider(apiKey, model), [apiKey, model]);
|
|
@@ -184,6 +226,144 @@ export function AIAgent({
|
|
|
184
226
|
};
|
|
185
227
|
}, [mcpServerUrl, runtime]);
|
|
186
228
|
|
|
229
|
+
// ─── Voice/Live Service Initialization ──────────────────────
|
|
230
|
+
|
|
231
|
+
// Initialize voice services when mode changes to voice or live
|
|
232
|
+
useEffect(() => {
|
|
233
|
+
if (mode === 'text') {
|
|
234
|
+
logger.info('AIAgent', 'Text mode — skipping voice service init');
|
|
235
|
+
return;
|
|
236
|
+
}
|
|
237
|
+
|
|
238
|
+
logger.info('AIAgent', `Mode changed to "${mode}" — initializing voice services...`);
|
|
239
|
+
|
|
240
|
+
// Create VoiceService with runtime's built-in tools (navigate, tap, type, done, etc.)
|
|
241
|
+
if (!voiceServiceRef.current) {
|
|
242
|
+
logger.info('AIAgent', 'Creating VoiceService...');
|
|
243
|
+
const runtimeTools = runtime.getTools();
|
|
244
|
+
logger.info('AIAgent', `Registering ${runtimeTools.length} tools with VoiceService: ${runtimeTools.map(t => t.name).join(', ')}`);
|
|
245
|
+
// Build the full voice system prompt (screen format + tool descriptions + guardrails)
|
|
246
|
+
// This gives voice mode the same screen understanding as text mode
|
|
247
|
+
const voicePrompt = buildVoiceSystemPrompt(language, instructions?.system);
|
|
248
|
+
voiceServiceRef.current = new VoiceService({
|
|
249
|
+
apiKey,
|
|
250
|
+
systemPrompt: voicePrompt,
|
|
251
|
+
tools: runtimeTools,
|
|
252
|
+
language,
|
|
253
|
+
});
|
|
254
|
+
logger.info('AIAgent', 'VoiceService created with full voice system prompt and tools');
|
|
255
|
+
}
|
|
256
|
+
|
|
257
|
+
// Create AudioOutputService if not exists
|
|
258
|
+
if (!audioOutputRef.current) {
|
|
259
|
+
logger.info('AIAgent', 'Creating AudioOutputService...');
|
|
260
|
+
audioOutputRef.current = new AudioOutputService({
|
|
261
|
+
onError: (err) => logger.error('AIAgent', `AudioOutput error: ${err}`),
|
|
262
|
+
});
|
|
263
|
+
audioOutputRef.current.initialize().then((ok) => {
|
|
264
|
+
logger.info('AIAgent', `AudioOutputService initialized: ${ok}`);
|
|
265
|
+
});
|
|
266
|
+
}
|
|
267
|
+
|
|
268
|
+
// Create AudioInputService if not exists
|
|
269
|
+
if (!audioInputRef.current) {
|
|
270
|
+
logger.info('AIAgent', 'Creating AudioInputService...');
|
|
271
|
+
audioInputRef.current = new AudioInputService({
|
|
272
|
+
// Default 16kHz — Gemini Live API input standard
|
|
273
|
+
onAudioChunk: (chunk) => {
|
|
274
|
+
logger.debug('AIAgent', `Mic chunk: ${chunk.length} chars`);
|
|
275
|
+
voiceServiceRef.current?.sendAudio(chunk);
|
|
276
|
+
},
|
|
277
|
+
onError: (err) => logger.error('AIAgent', `AudioInput error: ${err}`),
|
|
278
|
+
onPermissionDenied: () => logger.warn('AIAgent', 'Mic permission denied by user'),
|
|
279
|
+
});
|
|
280
|
+
}
|
|
281
|
+
|
|
282
|
+
// Connect VoiceService
|
|
283
|
+
logger.info('AIAgent', 'Connecting VoiceService...');
|
|
284
|
+
voiceServiceRef.current.connect({
|
|
285
|
+
onAudioResponse: (audio) => {
|
|
286
|
+
logger.info('AIAgent', `Received audio response (${audio.length} chars)`);
|
|
287
|
+
setIsAISpeaking(true);
|
|
288
|
+
audioOutputRef.current?.enqueue(audio);
|
|
289
|
+
},
|
|
290
|
+
onStatusChange: (status) => {
|
|
291
|
+
logger.info('AIAgent', `Voice status: ${status}`);
|
|
292
|
+
const connected = status === 'connected';
|
|
293
|
+
setIsVoiceConnected(connected);
|
|
294
|
+
if (connected) {
|
|
295
|
+
logger.info('AIAgent', '✅ VoiceService connected — auto-starting mic...');
|
|
296
|
+
// Auto-start mic streaming once WebSocket is ready
|
|
297
|
+
audioInputRef.current?.start().then((ok) => {
|
|
298
|
+
if (ok) {
|
|
299
|
+
setIsMicActive(true);
|
|
300
|
+
logger.info('AIAgent', '🎙️ Mic auto-started after connection');
|
|
301
|
+
}
|
|
302
|
+
});
|
|
303
|
+
// Send initial screen context (tree) so the model knows what's on screen
|
|
304
|
+
const initialContext = runtime.getScreenContext();
|
|
305
|
+
voiceServiceRef.current?.sendScreenContext(initialContext);
|
|
306
|
+
logger.info('AIAgent', '📡 Initial screen context sent to voice model');
|
|
307
|
+
}
|
|
308
|
+
},
|
|
309
|
+
onTranscript: (text, isFinal, role) => {
|
|
310
|
+
logger.info('AIAgent', `Transcript [${role}] (final=${isFinal}): "${text}"`);
|
|
311
|
+
},
|
|
312
|
+
onToolCall: async (toolCall) => {
|
|
313
|
+
logger.info('AIAgent', `Voice tool call: ${toolCall.name}(${JSON.stringify(toolCall.args)})`);
|
|
314
|
+
// Execute the tool via AgentRuntime and send result back to Gemini
|
|
315
|
+
const result = await runtime.executeTool(toolCall.name, toolCall.args);
|
|
316
|
+
logger.info('AIAgent', `Voice tool result: ${result}`);
|
|
317
|
+
voiceServiceRef.current?.sendFunctionResponse(toolCall.name, toolCall.id, { result });
|
|
318
|
+
|
|
319
|
+
// After tool execution, push updated screen context
|
|
320
|
+
// (the screen may have changed from tap/type/navigate)
|
|
321
|
+
const updatedContext = runtime.getScreenContext();
|
|
322
|
+
voiceServiceRef.current?.sendScreenContext(updatedContext);
|
|
323
|
+
logger.info('AIAgent', '📡 Updated screen context sent after tool call');
|
|
324
|
+
},
|
|
325
|
+
onError: (err) => {
|
|
326
|
+
logger.error('AIAgent', `VoiceService error: ${err}`);
|
|
327
|
+
},
|
|
328
|
+
onTurnComplete: () => {
|
|
329
|
+
logger.info('AIAgent', 'AI turn complete');
|
|
330
|
+
setIsAISpeaking(false);
|
|
331
|
+
},
|
|
332
|
+
});
|
|
333
|
+
|
|
334
|
+
// Cleanup on mode change back to text
|
|
335
|
+
return () => {
|
|
336
|
+
logger.info('AIAgent', `Cleaning up voice services (leaving "${mode}" mode)`);
|
|
337
|
+
voiceServiceRef.current?.disconnect();
|
|
338
|
+
voiceServiceRef.current = null; // Ensure fresh instance on next connect
|
|
339
|
+
audioInputRef.current?.stop();
|
|
340
|
+
setIsMicActive(false);
|
|
341
|
+
setIsAISpeaking(false);
|
|
342
|
+
setIsVoiceConnected(false);
|
|
343
|
+
};
|
|
344
|
+
// eslint-disable-next-line react-hooks/exhaustive-deps
|
|
345
|
+
}, [mode, apiKey, runtime, language, instructions]);
|
|
346
|
+
|
|
347
|
+
// ─── Stop Voice Session (full cleanup) ─────────────────────
|
|
348
|
+
|
|
349
|
+
const stopVoiceSession = useCallback(() => {
|
|
350
|
+
logger.info('AIAgent', '🛑 Stopping voice session (full cleanup)...');
|
|
351
|
+
// 1. Stop mic input
|
|
352
|
+
audioInputRef.current?.stop();
|
|
353
|
+
// 2. Stop audio output (clear queued chunks)
|
|
354
|
+
audioOutputRef.current?.stop();
|
|
355
|
+
// 3. Disconnect WebSocket
|
|
356
|
+
voiceServiceRef.current?.disconnect();
|
|
357
|
+
voiceServiceRef.current = null;
|
|
358
|
+
// 4. Reset state
|
|
359
|
+
setIsMicActive(false);
|
|
360
|
+
setIsAISpeaking(false);
|
|
361
|
+
setIsVoiceConnected(false);
|
|
362
|
+
// 5. Switch back to text mode (triggers cleanup effect naturally)
|
|
363
|
+
setMode('text');
|
|
364
|
+
logger.info('AIAgent', '🛑 Voice session fully stopped');
|
|
365
|
+
}, [runtime]);
|
|
366
|
+
|
|
187
367
|
// ─── Execute ──────────────────────────────────────────────────
|
|
188
368
|
|
|
189
369
|
const handleSend = useCallback(async (message: string) => {
|
|
@@ -249,12 +429,51 @@ export function AIAgent({
|
|
|
249
429
|
lastResult={lastResult}
|
|
250
430
|
language={language}
|
|
251
431
|
onDismiss={() => setLastResult(null)}
|
|
432
|
+
availableModes={availableModes}
|
|
433
|
+
mode={mode}
|
|
434
|
+
onModeChange={(newMode) => {
|
|
435
|
+
logger.info('AIAgent', `Mode change: ${mode} → ${newMode}`);
|
|
436
|
+
setMode(newMode);
|
|
437
|
+
}}
|
|
438
|
+
isMicActive={isMicActive}
|
|
439
|
+
isSpeakerMuted={isSpeakerMuted}
|
|
440
|
+
isAISpeaking={isAISpeaking}
|
|
441
|
+
onStopSession={stopVoiceSession}
|
|
442
|
+
isVoiceConnected={isVoiceConnected}
|
|
443
|
+
onMicToggle={(active) => {
|
|
444
|
+
if (active && !isVoiceConnected) {
|
|
445
|
+
logger.warn('AIAgent', 'Cannot toggle mic — VoiceService not connected yet');
|
|
446
|
+
return;
|
|
447
|
+
}
|
|
448
|
+
logger.info('AIAgent', `Mic toggle: ${active ? 'ON' : 'OFF'}`);
|
|
449
|
+
setIsMicActive(active);
|
|
450
|
+
if (active) {
|
|
451
|
+
logger.info('AIAgent', 'Starting AudioInput...');
|
|
452
|
+
audioInputRef.current?.start().then((ok) => {
|
|
453
|
+
logger.info('AIAgent', `AudioInput start result: ${ok}`);
|
|
454
|
+
});
|
|
455
|
+
} else {
|
|
456
|
+
logger.info('AIAgent', 'Stopping AudioInput...');
|
|
457
|
+
audioInputRef.current?.stop();
|
|
458
|
+
}
|
|
459
|
+
}}
|
|
460
|
+
onSpeakerToggle={(muted) => {
|
|
461
|
+
logger.info('AIAgent', `Speaker toggle: ${muted ? 'MUTED' : 'UNMUTED'}`);
|
|
462
|
+
setIsSpeakerMuted(muted);
|
|
463
|
+
if (muted) {
|
|
464
|
+
audioOutputRef.current?.mute();
|
|
465
|
+
} else {
|
|
466
|
+
audioOutputRef.current?.unmute();
|
|
467
|
+
}
|
|
468
|
+
}}
|
|
469
|
+
|
|
252
470
|
/>
|
|
253
471
|
)}
|
|
254
472
|
</AgentContext.Provider>
|
|
255
473
|
);
|
|
256
474
|
}
|
|
257
475
|
|
|
476
|
+
|
|
258
477
|
const styles = StyleSheet.create({
|
|
259
478
|
root: {
|
|
260
479
|
flex: 1,
|