@mobileai/react-native 0.4.2 → 0.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. package/README.md +21 -2
  2. package/lib/module/components/AIAgent.js +216 -5
  3. package/lib/module/components/AIAgent.js.map +1 -1
  4. package/lib/module/components/AgentChatBar.js +358 -36
  5. package/lib/module/components/AgentChatBar.js.map +1 -1
  6. package/lib/module/core/AgentRuntime.js +122 -6
  7. package/lib/module/core/AgentRuntime.js.map +1 -1
  8. package/lib/module/core/systemPrompt.js +57 -0
  9. package/lib/module/core/systemPrompt.js.map +1 -1
  10. package/lib/module/index.js +8 -0
  11. package/lib/module/index.js.map +1 -1
  12. package/lib/module/providers/GeminiProvider.js +108 -85
  13. package/lib/module/providers/GeminiProvider.js.map +1 -1
  14. package/lib/module/services/AudioInputService.js +128 -0
  15. package/lib/module/services/AudioInputService.js.map +1 -0
  16. package/lib/module/services/AudioOutputService.js +154 -0
  17. package/lib/module/services/AudioOutputService.js.map +1 -0
  18. package/lib/module/services/VoiceService.js +362 -0
  19. package/lib/module/services/VoiceService.js.map +1 -0
  20. package/lib/module/utils/audioUtils.js +49 -0
  21. package/lib/module/utils/audioUtils.js.map +1 -0
  22. package/lib/module/utils/logger.js +21 -4
  23. package/lib/module/utils/logger.js.map +1 -1
  24. package/lib/typescript/babel.config.d.ts +10 -0
  25. package/lib/typescript/babel.config.d.ts.map +1 -0
  26. package/lib/typescript/eslint.config.d.mts +3 -0
  27. package/lib/typescript/eslint.config.d.mts.map +1 -0
  28. package/lib/typescript/fetch-models.d.mts +2 -0
  29. package/lib/typescript/fetch-models.d.mts.map +1 -0
  30. package/lib/typescript/list-all-models.d.mts +2 -0
  31. package/lib/typescript/list-all-models.d.mts.map +1 -0
  32. package/lib/typescript/list-models.d.mts +2 -0
  33. package/lib/typescript/list-models.d.mts.map +1 -0
  34. package/lib/typescript/src/components/AIAgent.d.ts +8 -2
  35. package/lib/typescript/src/components/AIAgent.d.ts.map +1 -1
  36. package/lib/typescript/src/components/AgentChatBar.d.ts +19 -2
  37. package/lib/typescript/src/components/AgentChatBar.d.ts.map +1 -1
  38. package/lib/typescript/src/core/AgentRuntime.d.ts +17 -1
  39. package/lib/typescript/src/core/AgentRuntime.d.ts.map +1 -1
  40. package/lib/typescript/src/core/systemPrompt.d.ts +8 -0
  41. package/lib/typescript/src/core/systemPrompt.d.ts.map +1 -1
  42. package/lib/typescript/src/core/types.d.ts +24 -1
  43. package/lib/typescript/src/core/types.d.ts.map +1 -1
  44. package/lib/typescript/src/index.d.ts +6 -1
  45. package/lib/typescript/src/index.d.ts.map +1 -1
  46. package/lib/typescript/src/providers/GeminiProvider.d.ts +22 -18
  47. package/lib/typescript/src/providers/GeminiProvider.d.ts.map +1 -1
  48. package/lib/typescript/src/services/AudioInputService.d.ts +31 -0
  49. package/lib/typescript/src/services/AudioInputService.d.ts.map +1 -0
  50. package/lib/typescript/src/services/AudioOutputService.d.ts +34 -0
  51. package/lib/typescript/src/services/AudioOutputService.d.ts.map +1 -0
  52. package/lib/typescript/src/services/VoiceService.d.ts +73 -0
  53. package/lib/typescript/src/services/VoiceService.d.ts.map +1 -0
  54. package/lib/typescript/src/utils/audioUtils.d.ts +17 -0
  55. package/lib/typescript/src/utils/audioUtils.d.ts.map +1 -0
  56. package/lib/typescript/src/utils/logger.d.ts +4 -0
  57. package/lib/typescript/src/utils/logger.d.ts.map +1 -1
  58. package/package.json +24 -8
  59. package/src/components/AIAgent.tsx +222 -3
  60. package/src/components/AgentChatBar.tsx +487 -42
  61. package/src/core/AgentRuntime.ts +131 -2
  62. package/src/core/systemPrompt.ts +62 -0
  63. package/src/core/types.ts +30 -0
  64. package/src/index.ts +16 -0
  65. package/src/providers/GeminiProvider.ts +105 -89
  66. package/src/services/AudioInputService.ts +141 -0
  67. package/src/services/AudioOutputService.ts +167 -0
  68. package/src/services/VoiceService.ts +409 -0
  69. package/src/utils/audioUtils.ts +54 -0
  70. package/src/utils/logger.ts +24 -7
package/src/core/AgentRuntime.ts CHANGED
@@ -21,6 +21,7 @@ import type {
21
21
  ExecutionResult,
22
22
  ToolDefinition,
23
23
  ActionDefinition,
24
+ TokenUsage,
24
25
  } from './types';
25
26
 
26
27
  const DEFAULT_MAX_STEPS = 10;
@@ -185,9 +186,14 @@ export class AgentRuntime {
185
186
  }
186
187
  try {
187
188
  const params = args.params ? (typeof args.params === 'string' ? JSON.parse(args.params) : args.params) : undefined;
188
- this.navRef.navigate(args.screen, params);
189
+ // Case-insensitive screen name matching
190
+ const availableRoutes = this.getRouteNames();
191
+ const matchedScreen = availableRoutes.find(
192
+ r => r.toLowerCase() === args.screen.toLowerCase()
193
+ ) || args.screen;
194
+ this.navRef.navigate(matchedScreen, params);
189
195
  await new Promise(resolve => setTimeout(resolve, 500));
190
- return `✅ Navigated to "${args.screen}"${params ? ` with params: ${JSON.stringify(params)}` : ''}`;
196
+ return `✅ Navigated to "${matchedScreen}"${params ? ` with params: ${JSON.stringify(params)}` : ''}`;
191
197
  } catch (error: any) {
192
198
  return `❌ Navigation error: ${error.message}. Available screens: ${this.getRouteNames().join(', ')}`;
193
199
  }
@@ -225,6 +231,20 @@ export class AgentRuntime {
225
231
  return `❓ ${args.question}`;
226
232
  },
227
233
  });
234
+
235
+ // capture_screenshot — on-demand visual capture (for image/video content questions). NOTE(review): only the capture size is reported back; the image data itself is neither returned nor stored by this tool — confirm how the model is meant to receive it.
236
+ this.tools.set('capture_screenshot', {
237
+ name: 'capture_screenshot',
238
+ description: 'Capture a screenshot of the current screen. Use when the user asks about visual content (images, videos, colors, layout appearance) that cannot be determined from the element tree alone.',
239
+ parameters: {},
240
+ execute: async () => {
241
+ const screenshot = await this.captureScreenshot();
242
+ if (screenshot) {
243
+ return `✅ Screenshot captured (${Math.round(screenshot.length / 1024)}KB). Visual content is now available for analysis.`;
244
+ }
245
+ return '❌ Screenshot capture failed. react-native-view-shot may not be installed.';
246
+ },
247
+ });
228
248
  }
229
249
 
230
250
  // ─── Action Registration (useAction hook) ──────────────────
@@ -319,6 +339,69 @@ export class AgentRuntime {
319
339
  }
320
340
  }
321
341
 
342
+ // ─── Screenshot Capture (optional react-native-view-shot) ─────
343
+
344
+ /**
345
+ * Captures the current screen as a base64 JPEG for Gemini vision.
346
+ * Uses react-native-view-shot as an optional peer dependency.
347
+ * Returns undefined (not null) if the library or root ref is unavailable, or if the capture throws (graceful fallback).
348
+ */
349
+ private async captureScreenshot(): Promise<string | undefined> {
350
+ try {
351
+ const viewShot = require('react-native-view-shot');
352
+ const captureRef = viewShot.captureRef || viewShot.default?.captureRef;
353
+ if (!captureRef || !this.rootRef) return undefined;
354
+
355
+ const uri = await captureRef(this.rootRef, {
356
+ format: 'jpg',
357
+ quality: 0.4,
358
+ width: 720,
359
+ result: 'base64',
360
+ });
361
+
362
+ logger.info('AgentRuntime', `Screenshot captured (${Math.round((uri?.length || 0) / 1024)}KB base64)`);
363
+ return uri || undefined;
364
+ } catch (error: any) {
365
+ // Detect missing dependency vs runtime failure
366
+ if (error.message?.includes('Cannot find module') || error.code === 'MODULE_NOT_FOUND') {
367
+ logger.warn('AgentRuntime', 'Screenshot requires react-native-view-shot. Install with: npx expo install react-native-view-shot');
368
+ } else {
369
+ logger.debug('AgentRuntime', `Screenshot skipped: ${error.message}`);
370
+ }
371
+ return undefined;
372
+ }
373
+ }
374
+
375
+ // ─── Screen Context for Voice Mode ──────────────────────
376
+
377
+ /**
378
+ * Get current screen context as formatted text wrapped in a <screen_update> tag (an error placeholder tag is returned, and the failure logged, if reading the screen throws).
379
+ * Used by voice mode: sent once at connect + after each tool call.
380
+ * Follows page-agent pattern: tree in user prompt, not system instructions.
381
+ */
382
+ public getScreenContext(): string {
383
+ try {
384
+ const walkResult = walkFiberTree(this.rootRef, this.getWalkConfig());
385
+ const screenName = this.getCurrentScreenName();
386
+ const screen = dehydrateScreen(
387
+ screenName,
388
+ this.getRouteNames(),
389
+ walkResult.elementsText,
390
+ walkResult.interactives,
391
+ );
392
+
393
+ return `<screen_update>
394
+ Current Screen: ${screenName}
395
+ Available Screens: ${this.getRouteNames().join(', ')}
396
+
397
+ ${screen.elementsText}
398
+ </screen_update>`;
399
+ } catch (error: any) {
400
+ logger.error('AgentRuntime', `getScreenContext failed: ${error.message}`);
401
+ return '<screen_update>Error reading screen</screen_update>';
402
+ }
403
+ }
404
+
322
405
  // ─── Build Tools Array for Provider ────────────────────────
323
406
 
324
407
  private buildToolsForProvider(): ToolDefinition[] {
@@ -349,6 +432,28 @@ export class AgentRuntime {
349
432
  return allTools;
350
433
  }
351
434
 
435
+ /** Public accessor for voice mode — returns the tool definitions produced by buildToolsForProvider(). */
436
+ public getTools(): ToolDefinition[] {
437
+ return this.buildToolsForProvider();
438
+ }
439
+
440
+ /** Execute a tool by name (for voice mode tool calls from WebSocket). Looks in the tools map first, then in buildToolsForProvider(); an unknown name or an error thrown by the tool is returned as a ❌-prefixed string instead of being thrown. */
441
+ public async executeTool(name: string, args: Record<string, any>): Promise<string> {
442
+ const tool = this.tools.get(name) ||
443
+ this.buildToolsForProvider().find(t => t.name === name);
444
+ if (!tool) {
445
+ return `❌ Unknown tool: ${name}`;
446
+ }
447
+ try {
448
+ const result = await tool.execute(args);
449
+ logger.info('AgentRuntime', `Voice tool executed: ${name} → ${result}`);
450
+ return result;
451
+ } catch (error: any) {
452
+ logger.error('AgentRuntime', `Voice tool error: ${name} — ${error.message}`);
453
+ return `❌ Tool "${name}" failed: ${error.message}`;
454
+ }
455
+ }
456
+
352
457
  // ─── Walk Config (passes security settings to FiberTreeWalker) ─
353
458
 
354
459
  private getWalkConfig(): WalkConfig {
@@ -477,6 +582,14 @@ export class AgentRuntime {
477
582
  const maxSteps = this.config.maxSteps || DEFAULT_MAX_STEPS;
478
583
  const stepDelay = this.config.stepDelay ?? 300;
479
584
 
585
+ // Token usage accumulator for the entire task
586
+ const sessionUsage: TokenUsage = {
587
+ promptTokens: 0,
588
+ completionTokens: 0,
589
+ totalTokens: 0,
590
+ estimatedCostUSD: 0,
591
+ };
592
+
480
593
  // Inject conversational context if we are answering the AI's question
481
594
  let contextualMessage = userMessage;
482
595
  if (this.lastAskUserQuestion) {
@@ -523,6 +636,9 @@ export class AgentRuntime {
523
636
  step, maxSteps, contextualMessage, screenName, screenContent,
524
637
  );
525
638
 
639
+ // 4.5. Capture screenshot for Gemini vision (optional)
640
+ const screenshot = await this.captureScreenshot();
641
+
526
642
  // 5. Send to AI provider
527
643
  this.config.onStatusUpdate?.('Analyzing screen...');
528
644
  const systemPrompt = buildSystemPrompt(this.config.language || 'en');
@@ -535,8 +651,18 @@ export class AgentRuntime {
535
651
  contextMessage,
536
652
  tools,
537
653
  this.history,
654
+ screenshot,
538
655
  );
539
656
 
657
+ // Accumulate token usage
658
+ if (response.tokenUsage) {
659
+ sessionUsage.promptTokens += response.tokenUsage.promptTokens;
660
+ sessionUsage.completionTokens += response.tokenUsage.completionTokens;
661
+ sessionUsage.totalTokens += response.tokenUsage.totalTokens;
662
+ sessionUsage.estimatedCostUSD += response.tokenUsage.estimatedCostUSD;
663
+ this.config.onTokenUsage?.(response.tokenUsage);
664
+ }
665
+
540
666
  // 6. Process tool calls
541
667
  if (!response.toolCalls || response.toolCalls.length === 0) {
542
668
  logger.warn('AgentRuntime', 'No tool calls in response. Text:', response.text);
@@ -544,6 +670,7 @@ export class AgentRuntime {
544
670
  success: true,
545
671
  message: response.text || 'Task completed.',
546
672
  steps: this.history,
673
+ tokenUsage: sessionUsage,
547
674
  };
548
675
  await this.config.onAfterTask?.(result);
549
676
  return result;
@@ -631,6 +758,7 @@ export class AgentRuntime {
631
758
  success: false,
632
759
  message: `Reached maximum steps (${maxSteps}) without completing the task.`,
633
760
  steps: this.history,
761
+ tokenUsage: sessionUsage,
634
762
  };
635
763
  await this.config.onAfterTask?.(result);
636
764
  return result;
@@ -640,6 +768,7 @@ export class AgentRuntime {
640
768
  success: false,
641
769
  message: `Error: ${error.message}`,
642
770
  steps: this.history,
771
+ tokenUsage: sessionUsage,
643
772
  };
644
773
  await this.config.onAfterTask?.(result);
645
774
  return result;
package/src/core/systemPrompt.ts CHANGED
@@ -160,3 +160,65 @@ memory: "Added 2x Margherita pizza. Cart total visible."
160
160
  plan: "Call done to report the cart contents to the user."
161
161
  </output>`;
162
162
  }
163
+
164
+ /**
165
+ * Voice-optimized system prompt for the Gemini Live API.
166
+ *
167
+ * Includes the same screen format and tool semantics as text mode,
168
+ * but condensed for voice context and with guardrails against
169
+ * unprompted actions. A language of ar selects the Arabic response line (anything else selects English); non-blank userInstructions are trimmed and appended in an <app_instructions> block.
170
+ */
171
+ export function buildVoiceSystemPrompt(
172
+ language: string,
173
+ userInstructions?: string,
174
+ ): string {
175
+ const isArabic = language === 'ar';
176
+
177
+ let prompt = `You are a voice-controlled AI agent operating a React Native mobile app. You can see the screen content and interact with UI elements using tools.
178
+
179
+ <language>
180
+ ${isArabic ? 'Respond in Arabic.' : 'Respond in English.'}
181
+ Use the same language as the user.
182
+ </language>
183
+
184
+ <screen_format>
185
+ You receive periodic screen updates showing the current UI. Interactive elements appear as:
186
+ [index]<type attrs>label</type>
187
+
188
+ - index: numeric ID for interaction (use with tap/type tools)
189
+ - type: element type (pressable, text-input, switch)
190
+ - attrs: state like value="true", checked="false", role="switch"
191
+ - label: visible text content
192
+
193
+ Only elements with [index] are interactive. Text without [] is display-only.
194
+ Example: [5]<switch value="true">Order Updates</switch> means element 5 is a switch currently ON.
195
+ </screen_format>
196
+
197
+ <tools>
198
+ Available tools:
199
+ - tap(index): Tap an element. For switches, this toggles their value.
200
+ - type(index, text): Type text into a text-input.
201
+ - navigate(screen): Navigate to a named screen.
202
+ - done(text, success): Complete the task with a spoken response.
203
+ - ask_user(question): Ask the user for clarification.
204
+ </tools>
205
+
206
+ <rules>
207
+ CRITICAL ACTION RULES:
208
+ - ONLY perform actions (tap, type, navigate) when the user explicitly asks you to do something.
209
+ - NEVER tap or navigate on your own initiative — wait for the user's voice command.
210
+ - When the user asks a question about what's on screen, answer verbally via done(). Do NOT tap anything.
211
+ - When the user asks to toggle/enable/disable something, find the matching element by its label and use tap(index).
212
+ - When a screen update arrives, do NOT interact with elements unless the user asked you to.
213
+ - Use element indexes from the most recent screen update — they refresh every few seconds.
214
+ - For switches: tap(index) toggles the value. You do NOT need to find a separate button.
215
+ - Keep spoken responses concise — the user is listening, not reading.
216
+ </rules>`;
217
+
218
+ // Append user-provided instructions if any
219
+ if (userInstructions?.trim()) {
220
+ prompt += `\n\n<app_instructions>\n${userInstructions.trim()}\n</app_instructions>`;
221
+ }
222
+
223
+ return prompt;
224
+ }
package/src/core/types.ts CHANGED
@@ -2,6 +2,10 @@
2
2
  * Core types for the page-agent-style React Native AI SDK.
3
3
  */
4
4
 
5
+ // ─── Agent Modes ──────────────────────────────────────────────
6
+
7
+ export type AgentMode = 'text' | 'voice';
8
+
5
9
  // ─── Interactive Element (discovered from Fiber tree) ─────────
6
10
 
7
11
  export type ElementType = 'pressable' | 'text-input' | 'switch' | 'scrollable';
@@ -134,6 +138,12 @@ export interface AgentConfig {
134
138
  */
135
139
  onStatusUpdate?: (status: string) => void;
136
140
 
141
+ /**
142
+ * Called after each step with token usage data.
143
+ * Use to track cost, enforce budgets, or display usage to the user.
144
+ */
145
+ onTokenUsage?: (usage: TokenUsage) => void;
146
+
137
147
  /**
138
148
  * Callback for when agent needs user input (ask_user tool).
139
149
  * Mirrors page-agent: the agent loop blocks until the user responds.
@@ -174,6 +184,8 @@ export interface ExecutionResult {
174
184
  success: boolean;
175
185
  message: string;
176
186
  steps: AgentStep[];
187
+ /** Accumulated token usage for the entire task */
188
+ tokenUsage?: TokenUsage;
177
189
  }
178
190
 
179
191
  // ─── Tool Types ───────────────────────────────────────────────
@@ -213,6 +225,20 @@ export interface AgentReasoning {
213
225
  plan: string;
214
226
  }
215
227
 
228
+ // ─── Token Usage ──────────────────────────────────────────────
229
+
230
+ /** Token usage metrics for cost tracking; carried per provider call on ProviderResult.tokenUsage and as a task-wide total on ExecutionResult.tokenUsage. */
231
+ export interface TokenUsage {
232
+ /** Tokens in the input prompt */
233
+ promptTokens: number;
234
+ /** Tokens generated by the model */
235
+ completionTokens: number;
236
+ /** Total tokens (prompt + completion) */
237
+ totalTokens: number;
238
+ /** Estimated cost in USD (based on model pricing) */
239
+ estimatedCostUSD: number;
240
+ }
241
+
216
242
  /** Result from the AI provider's generateContent call. */
217
243
  export interface ProviderResult {
218
244
  /** Extracted action tool call (action_name + params). */
@@ -221,6 +247,8 @@ export interface ProviderResult {
221
247
  reasoning: AgentReasoning;
222
248
  /** Raw text response (if any). */
223
249
  text?: string;
250
+ /** Token usage for this specific call */
251
+ tokenUsage?: TokenUsage;
224
252
  }
225
253
 
226
254
  export interface AIProvider {
@@ -229,5 +257,7 @@ export interface AIProvider {
229
257
  userMessage: string,
230
258
  tools: ToolDefinition[],
231
259
  history: AgentStep[],
260
+ /** Optional base64-encoded JPEG screenshot for vision */
261
+ screenshot?: string,
232
262
  ): Promise<ProviderResult>;
233
263
  }
package/src/index.ts CHANGED
@@ -11,12 +11,28 @@ export { AIAgent } from './components/AIAgent';
11
11
  // ─── Hooks ───────────────────────────────────────────────────
12
12
  export { useAction } from './hooks/useAction';
13
13
 
14
+ // ─── Services ────────────────────────────────────────────────
15
+ export { VoiceService } from './services/VoiceService';
16
+ export { AudioInputService } from './services/AudioInputService';
17
+ export { AudioOutputService } from './services/AudioOutputService';
18
+
19
+ // ─── Utilities ───────────────────────────────────────────────
20
+ export { logger } from './utils/logger';
21
+
14
22
  // ─── Types ───────────────────────────────────────────────────
15
23
  export type {
16
24
  AgentConfig,
25
+ AgentMode,
17
26
  ExecutionResult,
18
27
  InteractiveElement,
19
28
  DehydratedScreen,
20
29
  ToolDefinition,
21
30
  ActionDefinition,
31
+ TokenUsage,
22
32
  } from './core/types';
33
+
34
+ export type {
35
+ VoiceServiceConfig,
36
+ VoiceServiceCallbacks,
37
+ VoiceStatus,
38
+ } from './services/VoiceService';