@mobileai/react-native 0.1.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. package/README.md +78 -7
  2. package/lib/module/components/AIAgent.js +40 -4
  3. package/lib/module/components/AIAgent.js.map +1 -1
  4. package/lib/module/components/AgentChatBar.js +177 -29
  5. package/lib/module/components/AgentChatBar.js.map +1 -1
  6. package/lib/module/core/AgentRuntime.js +268 -126
  7. package/lib/module/core/AgentRuntime.js.map +1 -1
  8. package/lib/module/core/FiberTreeWalker.js +74 -20
  9. package/lib/module/core/FiberTreeWalker.js.map +1 -1
  10. package/lib/module/core/systemPrompt.js +164 -0
  11. package/lib/module/core/systemPrompt.js.map +1 -0
  12. package/lib/module/providers/GeminiProvider.js +189 -73
  13. package/lib/module/providers/GeminiProvider.js.map +1 -1
  14. package/lib/typescript/src/components/AIAgent.d.ts +9 -1
  15. package/lib/typescript/src/components/AIAgent.d.ts.map +1 -1
  16. package/lib/typescript/src/components/AgentChatBar.d.ts +4 -3
  17. package/lib/typescript/src/components/AgentChatBar.d.ts.map +1 -1
  18. package/lib/typescript/src/core/AgentRuntime.d.ts +16 -0
  19. package/lib/typescript/src/core/AgentRuntime.d.ts.map +1 -1
  20. package/lib/typescript/src/core/FiberTreeWalker.d.ts +5 -0
  21. package/lib/typescript/src/core/FiberTreeWalker.d.ts.map +1 -1
  22. package/lib/typescript/src/core/systemPrompt.d.ts +9 -0
  23. package/lib/typescript/src/core/systemPrompt.d.ts.map +1 -0
  24. package/lib/typescript/src/core/types.d.ts +51 -13
  25. package/lib/typescript/src/core/types.d.ts.map +1 -1
  26. package/lib/typescript/src/providers/GeminiProvider.d.ts +33 -13
  27. package/lib/typescript/src/providers/GeminiProvider.d.ts.map +1 -1
  28. package/package.json +16 -14
  29. package/src/components/AIAgent.tsx +41 -1
  30. package/src/components/AgentChatBar.tsx +150 -28
  31. package/src/core/AgentRuntime.ts +287 -131
  32. package/src/core/FiberTreeWalker.ts +74 -19
  33. package/src/core/systemPrompt.ts +162 -0
  34. package/src/core/types.ts +58 -10
  35. package/src/providers/GeminiProvider.ts +174 -101
@@ -1,30 +1,30 @@
1
1
  /**
2
- * GeminiProvider — Simplified Gemini API integration.
3
- * Sends dehydrated screen state + tools to Gemini and parses tool call responses.
2
+ * GeminiProvider — Gemini API integration with structured action pattern.
3
+ *
4
+ * Uses a single forced function call (`agent_step`) that bundles
5
+ * structured reasoning (evaluation, memory, plan) alongside the action.
6
+ * This replaces free-form text + separate tool calls for stability.
4
7
  */
5
8
 
6
9
  import { logger } from '../utils/logger';
7
- import type { AIProvider, ToolDefinition, AgentStep } from '../core/types';
10
+ import type { AIProvider, ToolDefinition, AgentStep, ProviderResult, AgentReasoning } from '../core/types';
8
11
 
9
- // ─── Gemini API Types ──────────────────────────────────────────
12
+ // ─── Constants ─────────────────────────────────────────────────
10
13
 
11
- interface GeminiTool {
12
- functionDeclarations: GeminiFunctionDeclaration[];
13
- }
14
+ const AGENT_STEP_FN = 'agent_step';
14
15
 
15
- interface GeminiFunctionDeclaration {
16
- name: string;
17
- description: string;
18
- parameters: {
19
- type: string;
20
- properties: Record<string, { type: string; description: string; enum?: string[] }>;
21
- required: string[];
22
- };
23
- }
16
+ // Reasoning fields that are always present in the agent_step schema
17
+ const REASONING_FIELDS = ['previous_goal_eval', 'memory', 'plan'] as const;
18
+
19
+ // ─── Gemini API Types ──────────────────────────────────────────
24
20
 
25
21
  interface GeminiContent {
26
22
  role: 'user' | 'model';
27
- parts: Array<{ text?: string; functionCall?: { name: string; args: any }; functionResponse?: { name: string; response: any } }>;
23
+ parts: Array<{
24
+ text?: string;
25
+ functionCall?: { name: string; args: any };
26
+ functionResponse?: { name: string; response: any };
27
+ }>;
28
28
  }
29
29
 
30
30
  // ─── Provider ──────────────────────────────────────────────────
@@ -33,7 +33,6 @@ export class GeminiProvider implements AIProvider {
33
33
  private apiKey: string;
34
34
  private model: string;
35
35
 
36
-
37
36
  constructor(apiKey: string, model: string = 'gemini-2.5-flash') {
38
37
  this.apiKey = apiKey;
39
38
  this.model = model;
@@ -44,14 +43,14 @@ export class GeminiProvider implements AIProvider {
44
43
  userMessage: string,
45
44
  tools: ToolDefinition[],
46
45
  history: AgentStep[],
47
- ): Promise<{ toolCalls: Array<{ name: string; args: Record<string, any> }>; text?: string }> {
46
+ ): Promise<ProviderResult> {
48
47
 
49
48
  logger.info('GeminiProvider', `Sending request. Model: ${this.model}, Tools: ${tools.length}`);
50
49
 
51
- // Build Gemini tools
52
- const geminiTools = this.buildGeminiTools(tools);
50
+ // Build single agent_step function declaration
51
+ const agentStepDeclaration = this.buildAgentStepDeclaration(tools);
53
52
 
54
- // Build conversation history
53
+ // Build request contents: a single stateless user turn (history is embedded as text in userMessage, not sent as function call/response pairs)
55
54
  const contents = this.buildContents(userMessage, history);
56
55
 
57
56
  // Make API request
@@ -59,8 +58,15 @@ export class GeminiProvider implements AIProvider {
59
58
 
60
59
  const body: any = {
61
60
  contents,
62
- tools: geminiTools.length > 0 ? geminiTools : undefined,
61
+ tools: [{ functionDeclarations: [agentStepDeclaration] }],
63
62
  systemInstruction: { parts: [{ text: systemPrompt }] },
63
+ // Force the model to always call agent_step
64
+ tool_config: {
65
+ function_calling_config: {
66
+ mode: 'ANY',
67
+ allowed_function_names: [AGENT_STEP_FN],
68
+ },
69
+ },
64
70
  generationConfig: {
65
71
  temperature: 0.2,
66
72
  maxOutputTokens: 2048,
@@ -87,44 +93,85 @@ export class GeminiProvider implements AIProvider {
87
93
 
88
94
  const data = await response.json();
89
95
 
90
- // Parse response
91
- return this.parseResponse(data);
96
+ return this.parseAgentStepResponse(data, tools);
92
97
  } catch (error: any) {
93
98
  logger.error('GeminiProvider', 'Request failed:', error.message);
94
99
  throw error;
95
100
  }
96
101
  }
97
102
 
98
- // ─── Build Gemini Tools ────────────────────────────────────
103
+ // ─── Build agent_step Declaration ──────────────────────────
104
+
105
+ /**
106
+ * Builds a single `agent_step` function declaration that combines:
107
+ * - Structured reasoning fields (previous_goal_eval, memory, plan)
108
+ * - action_name (enum of all available tool names)
109
+ * - All tool parameter fields as flat top-level properties
110
+ *
111
+ * Flat schema avoids Gemini's "deeply nested schema" rejection in ANY mode.
112
+ */
113
+ private buildAgentStepDeclaration(tools: ToolDefinition[]): any {
114
+ const toolNames = tools.map(t => t.name);
115
+
116
+ // Collect all unique parameter fields across all tools
117
+ const actionProperties: Record<string, any> = {};
118
+ for (const tool of tools) {
119
+ for (const [paramName, param] of Object.entries(tool.parameters)) {
120
+ // Skip if already added (shared field names like 'text', 'index')
121
+ if (actionProperties[paramName]) continue;
122
+ actionProperties[paramName] = {
123
+ type: this.mapParamType(param.type),
124
+ description: param.description,
125
+ ...(param.enum ? { enum: param.enum } : {}),
126
+ };
127
+ }
128
+ }
99
129
 
100
- private buildGeminiTools(tools: ToolDefinition[]): GeminiTool[] {
101
- const declarations: GeminiFunctionDeclaration[] = tools.map(tool => ({
102
- name: tool.name,
103
- description: tool.description,
130
+ // Build tool descriptions for the action_name enum
131
+ const toolDescriptions = tools
132
+ .map(t => {
133
+ const params = Object.keys(t.parameters).join(', ');
134
+ return `- ${t.name}(${params}): ${t.description}`;
135
+ })
136
+ .join('\n');
137
+
138
+ return {
139
+ name: AGENT_STEP_FN,
140
+ description: `Execute one agent step. Choose an action and provide reasoning.\n\nAvailable actions:\n${toolDescriptions}`,
104
141
  parameters: {
105
142
  type: 'OBJECT',
106
- properties: Object.fromEntries(
107
- Object.entries(tool.parameters).map(([key, param]) => [
108
- key,
109
- {
110
- type: this.mapParamType(param.type),
111
- description: param.description,
112
- ...(param.enum ? { enum: param.enum } : {}),
113
- },
114
- ]),
115
- ),
116
- required: Object.entries(tool.parameters)
117
- .filter(([, param]) => param.required !== false)
118
- .map(([key]) => key),
143
+ properties: {
144
+ // ── Reasoning fields ──
145
+ previous_goal_eval: {
146
+ type: 'STRING',
147
+ description: 'One-sentence assessment of your last action. State success, failure, or uncertain. Skip on first step.',
148
+ },
149
+ memory: {
150
+ type: 'STRING',
151
+ description: 'Key facts to remember for future steps: progress made, items found, counters, field values already collected.',
152
+ },
153
+ plan: {
154
+ type: 'STRING',
155
+ description: 'Your immediate next goal — what action you will take and why.',
156
+ },
157
+ // ── Action selection ──
158
+ action_name: {
159
+ type: 'STRING',
160
+ description: 'Which action to execute.',
161
+ enum: toolNames,
162
+ },
163
+ // ── Action parameters (flat) ──
164
+ ...actionProperties,
165
+ },
166
+ required: ['plan', 'action_name'],
119
167
  },
120
- }));
121
-
122
- return [{ functionDeclarations: declarations }];
168
+ };
123
169
  }
124
170
 
125
171
  private mapParamType(type: string): string {
126
172
  switch (type) {
127
173
  case 'number': return 'NUMBER';
174
+ case 'integer': return 'INTEGER';
128
175
  case 'boolean': return 'BOOLEAN';
129
176
  case 'string':
130
177
  default: return 'STRING';
@@ -133,78 +180,104 @@ export class GeminiProvider implements AIProvider {
133
180
 
134
181
  // ─── Build Contents ────────────────────────────────────────
135
182
 
136
- private buildContents(userMessage: string, history: AgentStep[]): GeminiContent[] {
137
- const contents: GeminiContent[] = [];
138
-
139
- // Add history as conversation turns
140
- for (const step of history) {
141
- // User turn (screen state was sent)
142
- contents.push({
143
- role: 'user',
144
- parts: [{ text: `Step ${step.stepIndex + 1} result: ${step.action.output}` }],
145
- });
146
- }
147
-
148
- // Current user message
149
- contents.push({
183
+ /**
184
+ * Builds Gemini conversation contents.
185
+ *
186
+ * Each step is a STATELESS single-turn request (matching page-agent's approach):
187
+ * - System prompt has general instructions
188
+ * - User message contains full context: task, history, screen state
189
+ * - Model responds with agent_step function call
190
+ *
191
+ * History is embedded as text in assembleUserPrompt (via <agent_history>),
192
+ * NOT as functionCall/functionResponse pairs. This avoids Gemini's
193
+ * conversation format requirements and thought_signature complexity.
194
+ */
195
+ private buildContents(userMessage: string, _history: AgentStep[]): GeminiContent[] {
196
+ return [{
150
197
  role: 'user',
151
198
  parts: [{ text: userMessage }],
152
- });
153
-
154
- // Ensure alternating roles (Gemini requirement)
155
- return this.ensureAlternatingRoles(contents);
156
- }
157
-
158
- private ensureAlternatingRoles(contents: GeminiContent[]): GeminiContent[] {
159
- if (contents.length <= 1) return contents;
160
-
161
- const merged: GeminiContent[] = [contents[0]!];
162
-
163
- for (let i = 1; i < contents.length; i++) {
164
- const prev = merged[merged.length - 1]!;
165
- const curr = contents[i]!;
166
-
167
- if (prev.role === curr.role) {
168
- // Merge same-role messages
169
- prev.parts.push(...curr.parts);
170
- } else {
171
- merged.push(curr);
172
- }
173
- }
174
-
175
- return merged;
199
+ }];
176
200
  }
177
201
 
178
202
  // ─── Parse Response ────────────────────────────────────────
179
203
 
180
- private parseResponse(data: any): { toolCalls: Array<{ name: string; args: Record<string, any> }>; text?: string } {
181
- const toolCalls: Array<{ name: string; args: Record<string, any> }> = [];
182
- let text: string | undefined;
183
-
204
+ /**
205
+ * Parses the Gemini response expecting a single agent_step function call.
206
+ * Extracts structured reasoning + action, and determines which tool to execute.
207
+ */
208
+ private parseAgentStepResponse(data: any, tools: ToolDefinition[]): ProviderResult {
184
209
  if (!data.candidates || data.candidates.length === 0) {
185
210
  logger.warn('GeminiProvider', 'No candidates in response');
186
- return { toolCalls, text: 'No response generated.' };
211
+ return {
212
+ toolCalls: [{ name: 'done', args: { text: 'No response generated.', success: false } }],
213
+ reasoning: { previousGoalEval: '', memory: '', plan: '' },
214
+ text: 'No response generated.',
215
+ };
187
216
  }
188
217
 
189
218
  const candidate = data.candidates[0];
190
219
  const parts = candidate.content?.parts || [];
191
220
 
192
- for (const part of parts) {
193
- if (part.functionCall) {
194
- toolCalls.push({
195
- name: part.functionCall.name,
196
- args: part.functionCall.args || {},
197
- });
221
+ // Find the function call part
222
+ const fnCallPart = parts.find((p: any) => p.functionCall);
223
+ const textPart = parts.find((p: any) => p.text);
224
+
225
+ if (!fnCallPart?.functionCall) {
226
+ logger.warn('GeminiProvider', 'No function call in response. Text:', textPart?.text);
227
+ return {
228
+ toolCalls: [{ name: 'done', args: { text: textPart?.text || 'No action taken.', success: false } }],
229
+ reasoning: { previousGoalEval: '', memory: '', plan: '' },
230
+ text: textPart?.text,
231
+ };
232
+ }
233
+
234
+ const args = fnCallPart.functionCall.args || {};
235
+
236
+ // Extract reasoning fields
237
+ const reasoning: AgentReasoning = {
238
+ previousGoalEval: args.previous_goal_eval || '',
239
+ memory: args.memory || '',
240
+ plan: args.plan || '',
241
+ };
242
+
243
+ // Extract action
244
+ const actionName = args.action_name;
245
+ if (!actionName) {
246
+ logger.warn('GeminiProvider', 'No action_name in agent_step. Falling back to done.');
247
+ return {
248
+ toolCalls: [{ name: 'done', args: { text: 'Agent did not choose an action.', success: false } }],
249
+ reasoning,
250
+ text: textPart?.text,
251
+ };
252
+ }
253
+
254
+ // Build action args: everything except reasoning fields and action_name
255
+ const actionArgs: Record<string, any> = {};
256
+ const reservedKeys = new Set([...REASONING_FIELDS, 'action_name']);
257
+
258
+ // Find the matching tool to know which params belong to it
259
+ const matchedTool = tools.find(t => t.name === actionName);
260
+ if (matchedTool) {
261
+ for (const paramName of Object.keys(matchedTool.parameters)) {
262
+ if (args[paramName] !== undefined) {
263
+ actionArgs[paramName] = args[paramName];
264
+ }
198
265
  }
199
- if (part.text) {
200
- text = (text || '') + part.text;
266
+ } else {
267
+ // No matching tool definition (custom/registered tool): grab all non-reserved fields
268
+ for (const [key, value] of Object.entries(args)) {
269
+ if (!reservedKeys.has(key)) {
270
+ actionArgs[key] = value;
271
+ }
201
272
  }
202
273
  }
203
274
 
204
- logger.info('GeminiProvider', `Parsed: ${toolCalls.length} tool calls, text: ${text ? 'yes' : 'no'}`);
275
+ logger.info('GeminiProvider', `Parsed: action=${actionName}, plan="${reasoning.plan}"`);
205
276
 
206
- return { toolCalls, text };
277
+ return {
278
+ toolCalls: [{ name: actionName, args: actionArgs }],
279
+ reasoning,
280
+ text: textPart?.text,
281
+ };
207
282
  }
208
-
209
-
210
283
  }