minitap-mobile-use 2.5.3__py3-none-any.whl → 2.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (43)
  1. minitap/mobile_use/agents/contextor/contextor.py +0 -8
  2. minitap/mobile_use/agents/cortex/cortex.md +122 -36
  3. minitap/mobile_use/agents/cortex/cortex.py +32 -17
  4. minitap/mobile_use/agents/cortex/types.py +18 -4
  5. minitap/mobile_use/agents/executor/executor.md +3 -3
  6. minitap/mobile_use/agents/executor/executor.py +10 -3
  7. minitap/mobile_use/agents/hopper/hopper.md +30 -2
  8. minitap/mobile_use/agents/hopper/hopper.py +19 -15
  9. minitap/mobile_use/agents/orchestrator/orchestrator.py +14 -5
  10. minitap/mobile_use/agents/outputter/outputter.py +13 -3
  11. minitap/mobile_use/agents/planner/planner.md +20 -9
  12. minitap/mobile_use/agents/planner/planner.py +12 -5
  13. minitap/mobile_use/agents/screen_analyzer/human.md +16 -0
  14. minitap/mobile_use/agents/screen_analyzer/screen_analyzer.py +111 -0
  15. minitap/mobile_use/clients/ios_client.py +7 -3
  16. minitap/mobile_use/config.py +87 -24
  17. minitap/mobile_use/controllers/mobile_command_controller.py +354 -88
  18. minitap/mobile_use/controllers/platform_specific_commands_controller.py +41 -27
  19. minitap/mobile_use/controllers/types.py +95 -0
  20. minitap/mobile_use/graph/graph.py +55 -11
  21. minitap/mobile_use/graph/state.py +10 -3
  22. minitap/mobile_use/main.py +12 -4
  23. minitap/mobile_use/sdk/agent.py +113 -72
  24. minitap/mobile_use/sdk/examples/smart_notification_assistant.py +59 -10
  25. minitap/mobile_use/sdk/services/platform.py +15 -1
  26. minitap/mobile_use/sdk/types/platform.py +1 -0
  27. minitap/mobile_use/sdk/types/task.py +10 -1
  28. minitap/mobile_use/servers/device_hardware_bridge.py +13 -6
  29. minitap/mobile_use/services/llm.py +5 -2
  30. minitap/mobile_use/tools/index.py +7 -9
  31. minitap/mobile_use/tools/mobile/{clear_text.py → focus_and_clear_text.py} +7 -7
  32. minitap/mobile_use/tools/mobile/{input_text.py → focus_and_input_text.py} +8 -8
  33. minitap/mobile_use/tools/mobile/long_press_on.py +130 -15
  34. minitap/mobile_use/tools/mobile/swipe.py +3 -26
  35. minitap/mobile_use/tools/mobile/tap.py +41 -28
  36. minitap/mobile_use/tools/mobile/wait_for_delay.py +84 -0
  37. minitap/mobile_use/utils/cli_helpers.py +10 -6
  38. {minitap_mobile_use-2.5.3.dist-info → minitap_mobile_use-2.7.0.dist-info}/METADATA +1 -1
  39. {minitap_mobile_use-2.5.3.dist-info → minitap_mobile_use-2.7.0.dist-info}/RECORD +41 -39
  40. minitap/mobile_use/tools/mobile/glimpse_screen.py +0 -74
  41. minitap/mobile_use/tools/mobile/wait_for_animation_to_end.py +0 -64
  42. {minitap_mobile_use-2.5.3.dist-info → minitap_mobile_use-2.7.0.dist-info}/WHEEL +0 -0
  43. {minitap_mobile_use-2.5.3.dist-info → minitap_mobile_use-2.7.0.dist-info}/entry_points.txt +0 -0
@@ -1,4 +1,3 @@
- from minitap.mobile_use.agents.executor.utils import is_last_tool_message_take_screenshot
  from minitap.mobile_use.context import MobileUseContext
  from minitap.mobile_use.controllers.mobile_command_controller import get_screen_data
  from minitap.mobile_use.controllers.platform_specific_commands_controller import (
@@ -26,16 +25,9 @@ class ContextorNode:
  focused_app_info = get_focused_app_info(self.ctx)
  device_date = get_device_date(self.ctx)

- should_add_screenshot_context = is_last_tool_message_take_screenshot(
- list(state.executor_messages)
- )
-
  return await state.asanitize_update(
  ctx=self.ctx,
  update={
- "latest_screenshot_base64": device_data.base64
- if should_add_screenshot_context
- else None,
  "latest_ui_hierarchy": device_data.elements,
  "focused_app_info": focused_app_info,
  "screen_size": (device_data.width, device_data.height),
@@ -21,16 +21,22 @@ If you detect a cycle, you are **FORBIDDEN** from repeating it. You must pivot y
  To understand the device state, you have two senses, each with its purpose:

  1. **UI Hierarchy (Your sense of "Touch"):**
- * **What it is:** A structured list of all elements on the screen.
- * **Use it for:** Finding elements by `resource-id`, checking for specific text, and understanding the layout structure.
- * **Limitation:** It does NOT tell you what the screen *looks* like. It can be incomplete, and it contains no information about images, colors, or whether an element is visually obscured.

- 2. **`glimpse_screen` (Your sense of "Sight"):**
- * **What it is:** A tool that provides a real, up-to-date image of the screen.
- * **Use it for:** Confirming what is actually visible. This is your source of TRUTH for all visual information (icons, images, element positions, colors).
- * **Golden Rule:** When the UI hierarchy is ambiguous, seems incomplete, or when you need to verify a visual detail before acting, **`glimpse_screen` is always the most effective and reliable action.** Never guess what the screen looks like; use your sight to be sure.
+ - **What it is:** A structured list of all elements on the screen.
+ - **Use it for:** Finding elements by `resource-id`, checking for specific text, and understanding the layout structure.
+ - **Limitation:** It does NOT tell you what the screen _looks_ like. It can be incomplete, and it contains no information about images, colors, or whether an element is visually obscured.

- **CRITICAL NOTE ON SIGHT:** The visual information from `glimpse_screen` is **ephemeral**. It is available for **THIS decision turn ONLY**. You MUST extract all necessary information from it IMMEDIATELY, as it will be cleared before the next step.
+ 2. **`screen_analyzer` (Your sense of "Sight"):**
+ - **What it is:** A specialized agent that captures the screen and uses a vision model to answer specific questions about what is visible.
+ - **When to use it:** ONLY when the UI hierarchy is insufficient to make a decision. Use it sparingly for:
+ - Verifying visual elements that are not in the UI hierarchy (images, icons, colors)
+ - Confirming element visibility when hierarchy seems incomplete or ambiguous
+ - Identifying visual content that cannot be determined from text alone
+ - **When NOT to use it:** If the UI hierarchy contains the information you need (resource-ids, text, bounds), use that instead. Screen analysis is slower and should be a last resort.
+ - **How to use it:** Set the `screen_analysis_prompt` field in your output with a specific, focused question (e.g., "Is there a red notification badge on the Messages icon?", "What color is the submit button?").
+ - **Golden Rule:** Prefer the UI hierarchy first. Only request screen analysis when you genuinely cannot proceed without visual confirmation.
+
+ **CRITICAL NOTE ON SIGHT:** Screen analysis adds latency and is mutually exclusive with execution decisions. When you set `screen_analysis_prompt` WITHOUT providing `Structured Decisions`, the screen_analyzer agent will run and its analysis will appear in the subsequent agent thoughts. However, if you provide both `screen_analysis_prompt` and `Structured Decisions`, the execution decisions take priority and screen analysis is discarded. Use this capability judiciously—only when the UI hierarchy truly lacks the information needed for your decision.

  ### CRITICAL ACTION DIRECTIVES
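
For reference, the `screen_analysis_prompt` field described in this prompt corresponds to the new `CortexOutput` schema in `agents/cortex/types.py` further down this diff. A minimal analysis-only turn against that schema might look roughly like this (values are illustrative; the badge question is taken from the prompt text above):

```json
{
  "decisions": null,
  "decisions_reason": null,
  "goals_completion_reason": null,
  "complete_subgoals_by_ids": [],
  "screen_analysis_prompt": "Is there a red notification badge on the Messages icon?"
}
```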
@@ -40,8 +46,9 @@ To understand the device state, you have two senses, each with its purpose:
  ### Context You Receive:

  - 📱 **Device state**:
- - Latest **UI hierarchy** and (if available) a **screenshot**.
- - **CRITICAL NOTE ON SIGHT:** The visual information from `glimpse_screen` is **ephemeral**. It is available for **THIS decision turn ONLY**. You MUST extract all necessary information from it IMMEDIATELY, as it will be cleared before the next step.
+
+ - Latest **UI hierarchy**
+ - Results from the **screen_analyzer** agent (if you previously requested analysis via `screen_analysis_prompt`, you'll see the result in agent thoughts)

  - 🧭 **Task context**:
  - The user's **initial goal**
@@ -55,6 +62,7 @@ To understand the device state, you have two senses, each with its purpose:
  Focus on the **current PENDING subgoal and the next subgoals not yet started**.

  **CRITICAL: Before making any decision, you MUST thoroughly analyze the agent thoughts history to:**
+
  - **Detect patterns of failure or repeated attempts** that suggest the current approach isn't working
  - **Identify contradictions** between what was planned and what actually happened
  - **Spot errors in previous reasoning** that need to be corrected
@@ -62,6 +70,7 @@ Focus on the **current PENDING subgoal and the next subgoals not yet started**.
  - **Avoid repeating failed approaches** by recognizing when to change strategy

  1. **Analyze the agent thoughts first** - Review all previous agent thoughts to understand:
+
  - What strategies have been tried and their outcomes
  - Any errors or misconceptions in previous reasoning
  - Patterns that indicate success or failure
@@ -76,18 +85,17 @@ Focus on the **current PENDING subgoal and the next subgoals not yet started**.
  - Recent tool effects and whether they matched expectations from agent thoughts
  - **Any corrections needed to previous reasoning or strategy**

-
  ### The Rule of Element Interaction

  **You MUST follow it for every element interaction.**

- When you target a UI element (for a `tap`, `input_text`, `clear_text`, etc.), you **MUST** provide a comprehensive `target` object containing every piece of information you can find about **that single element**.
+ When you target a UI element (for a `tap`, `focus_and_input_text`, `focus_and_clear_text`, etc.), you **MUST** provide a comprehensive `target` object containing every piece of information you can find about **that single element**.

- * **1. `resource_id`**: Include this if it is present in the UI hierarchy.
- * **2. `resource_id_index`**: If there are multiple elements with the same `resource_id`, provide the zero-based index of the specific one you are targeting.
- * **3. `coordinates`**: Include the full bounds (`x`, `y`, `width`, `height`) if they are available.
- * **4. `text`**: Include the *current text* content of the element (e.g., placeholder text for an input).
- * **5. `text_index`**: If there are multiple elements with the same `text`, provide the zero-based index of the specific one you are targeting.
+ - **1. `resource_id`**: Include this if it is present in the UI hierarchy.
+ - **2. `resource_id_index`**: If there are multiple elements with the same `resource_id`, provide the zero-based index of the specific one you are targeting.
+ - **3. `coordinates`**: Include the full bounds (`x`, `y`, `width`, `height`) if they are available.
+ - **4. `text`**: Include the _current text_ content of the element (e.g., placeholder text for an input).
+ - **5. `text_index`**: If there are multiple elements with the same `text`, provide the zero-based index of the specific one you are targeting.

  **CRITICAL: The index must correspond to its identifier.** `resource_id_index` is only used when targeting by `resource_id`. `text_index` is only used when targeting by `text`. This ensures the fallback logic targets the correct element.
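
For reference, a fully populated `target` object following the five fields above might look like this (the element and its values are hypothetical, not taken from the package):

```json
{
  "resource_id": "com.whatsapp:id/entry",
  "resource_id_index": 0,
  "coordinates": { "x": 80, "y": 1720, "width": 880, "height": 120 },
  "text": "Type a message",
  "text_index": 0
}
```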
@@ -96,18 +104,17 @@ When you target a UI element (for a `tap`, `input_text`, `clear_text`, etc.), yo
  ### The Rule of Unpredictable Actions

  Certain actions have outcomes that can significantly and sometimes unpredictably change the UI. These include:
+
  - `back`
  - `launch_app`
  - `stop_app`
  - `open_link`
  - `tap` on an element that is clearly for navigation (e.g., a "Back" button, a menu item, a link to another screen).

- **CRITICAL RULE: If your decision includes one of these unpredictable actions, it MUST be the only action in your `Structured Decisions` for this turn. Else, use flows to group actions together.**
+ **CRITICAL RULE: If your decision includes one of these unpredictable actions, it MUST be the only action in your `Structured Decisions` for this turn. Else, provide multiple decisions in your `Structured Decisions`, in the right order, to group actions together.**

  This is not optional. Failing to isolate these actions will cause the system to act on an outdated understanding of the screen, leading to catastrophic errors. For example, after a `back` command, you MUST wait to see the new screen before deciding what to tap next.

- You may only group simple, predictable actions together, such as tapping a text field and then immediately typing into it (`tap` followed by `input_text`).
-
  ### Outputting Your Decisions

  If you decide to act, output a **valid JSON stringified structured set of instructions** for the Executor.
@@ -117,9 +124,9 @@ If you decide to act, output a **valid JSON stringified structured set of instru
  - Your goal is to achieve subgoals **fast** - so you must put as much actions as possible in your instructions to complete all achievable subgoals (based on your observations) in one go.
  - If you refer to a UI element or coordinates, specify it clearly (e.g., `resource-id: com.whatsapp:id/search`, `resource-id-index: 0`, `text: "Alice"`, `resource-id-index: 0`, `x: 100, y: 200, width: 100, height: 100`).
  - **The structure is up to you**, but it must be valid **JSON stringified output**. You will accompany this output with a **natural-language summary** of your reasoning and approach in your agent thought.
- - **Always use a single `input_text` action** to type in a field. This tool handles focusing the element and placing the cursor correctly. If the tool feedback indicates verification is needed or shows None/empty content, perform verification before proceeding.
+ - **Always use a single `focus_and_input_text` action** to type in a field. This tool handles focusing the element, placing the cursor correctly and typing the text. If the tool feedback indicates verification is needed or shows None/empty content, perform verification before proceeding.
  - **Only reference UI element IDs or visible texts that are explicitly present in the provided UI hierarchy or screenshot. Do not invent, infer, or guess any IDs or texts that are not directly observed**.
- - **For text clearing**: When you need to completely clear text from an input field, always call the `clear_text` tool with the correct resource_id. This tool automatically focuses the element, and ensures the field is emptied. If you notice this tool fails to clear the text, try to long press the input, select all, and call `erase_one_char`.
+ - **For text clearing**: When you need to completely clear text from an input field, always call the `focus_and_clear_text` tool with the correct resource_id. This tool automatically focuses the element, and ensures the field is emptied. If you notice this tool fails to clear the text, try to long press the input, select all, and call `erase_one_char`.

  ### Output

@@ -129,15 +136,27 @@ If you decide to act, output a **valid JSON stringified structured set of instru
  - **Structured Decisions** _(optional)_:
  A **valid stringified JSON** describing what should be executed **right now** to advance through the subgoals as much as possible.

- - **Agent Thought** _(2-4 sentences)_:
- **MANDATORY: Start by analyzing previous agent thoughts** - Did previous reasoning contain errors? Are we repeating failed approaches? What worked before in similar situations?
-
- Then explain your current decision based on this analysis. If there is any information you need to remember for later steps, you must include it here, because only the agent thoughts will be used to produce the final structured output.
+ - **Decisions Reason** _(2-4 sentences)_:
+ Start by analyzing previous agent thoughts. Then explain your current decision. Explicitly mention if correcting errors or changing strategy. Include checkpoints for indefinite actions (e.g., "Swiping up - last seen recipe was X").

- This also helps other agents understand your decision and learn from future failures. **Explicitly mention if you're correcting a previous error or changing strategy based on agent thoughts analysis.**
- You must also use this field to mention checkpoints when you perform actions without definite ending: for instance "Swiping up to reveal more recipes - last seen recipe was <ID or NAME>, stop when no more".
+ - **Goals Completion Reason**: Explain why marking subgoals complete based on observed evidence, or state "None".

- **Important:** `complete_subgoals_by_ids` and the structured decisions are mutually exclusive: if you provide both, the structured decisions will be ignored. Therefore, you must always prioritize completing subgoals over providing structured decisions.
+ - **Screen Analysis Prompt** _(optional)_: A specific question for visual analysis (e.g., "Is there a search icon visible?"). Leave empty if not needed.
+
+ **Important Decision Rules:**
+
+ 1. **Goal Completion + Execution Decisions**: You CAN provide both `complete_subgoals_by_ids` AND `Structured Decisions` in the same turn. This is the PREFERRED approach when:
+
+ - Agent thoughts show a previous action has ALREADY succeeded → Complete that subgoal
+ - The current screen requires new actions → Provide structured decisions
+ - **CRITICAL**: Only complete goals based on OBSERVED evidence from agent thoughts. NEVER complete goals "in advance" assuming an action will succeed.
+
+ 2. **Screen Analysis + Execution Decisions ARE MUTUALLY EXCLUSIVE**: If you provide both `screen_analysis_prompt` AND `Structured Decisions`, the execution decisions will take priority and screen analysis will be ignored. This should NEVER happen. Use screen analysis only when you need visual insights for the NEXT turn, not the current one.
+
+ 3. **Maximum Decisions Per Turn**: You can make up to 2 types of decisions simultaneously (never all 3):
+ - Complete examined subgoals (based on agent thoughts showing completion) + Execute actions on the current screen
+ - OR Complete examined subgoals + Request screen analysis (only when no execution decisions are needed)
+ - **Note:** Screen analysis and execution decisions cannot coexist—execution always takes priority if both are provided.

  ---
@@ -153,25 +172,92 @@ If you decide to act, output a **valid JSON stringified structured set of instru
  "{\"action\": \"launch_app\", \"app_name\": \"WhatsApp\"}"
  ```

- #### Agent Thought:
+ #### Decisions Reason:
+
+ > I need to launch the WhatsApp app to achieve the current subgoal. The `launch_app` tool is the most reliable method for opening applications.

- > I need to launch the WhatsApp app. I will use the `launch_app` tool to open it.
+ #### Goals Completion Reason:

- ### Exemple 2
+ > None
+
+ ### Example 2: Execution Decisions + Goal Completion

  #### Current Subgoal:

- > "Search for Alice in WhatsApp"
+ > "Send 'Hello!' to Alice on WhatsApp"
+
+ #### Context:
+
+ - **Agent thoughts history shows**: Previous turn executed `input_text` to type "Hello!" in the message field. Executor feedback confirms the text was successfully entered.
+ - **Current UI state**: The UI hierarchy shows the message "Hello!" is in the input field, and a send button with resource_id `com.whatsapp:id/send` is present.
+
+ #### Complete Subgoals By IDs:
+
+ ```text
+ ["subgoal-4-type-message"]
+ ```

  #### Structured Decisions:

  ```text
- "[{\"action\": \"tap\", \"target\": {\"resource_id\": \"com.whatsapp:id/menuitem_search\", \"resource_id_index\": 1, \"text\": \"Search\", \"text_index\": 0, \"coordinates\": {\"x\": 880, \"y\": 150, \"width\": 120, \"height\": 120}}}]"
+ "[{\"action\": \"tap\", \"target\": {\"resource_id\": \"com.whatsapp:id/send\", \"resource_id_index\": 0, \"coordinates\": {\"x\": 950, \"y\": 1800, \"width\": 100, \"height\": 100}}}]"
  ```

- #### Agent Thought:
+ #### Decisions Reason:
+
+ > Analysis: Agent thoughts confirm the text "Hello!" was successfully entered in the previous turn (executor feedback showed successful input). The current UI shows the message in the field and the send button is visible. I am completing the typing subgoal based on OBSERVED evidence, and tapping send to proceed. Providing full target information following the element rule.
+
+ #### Goals Completion Reason:
+
+ > Completing "type-message" subgoal because agent thoughts show the Executor successfully entered "Hello!" in the previous turn, and the current UI hierarchy confirms the text is present in the message field.
+
+ #### Screen Analysis Prompt:
+
+ ```text
+ None
+ ```
+
+ **Why this makes sense:** We're completing a goal that ALREADY happened (typing) based on observed evidence from agent thoughts, while simultaneously executing the next action (sending). We're not anticipating the send will succeed—we're only completing what has been confirmed.
+
+ ### Example 3: Screen Analysis + Goal Completion
+
+ #### Current Subgoal:
+
+ > "Verify the message was delivered to Alice"
+
+ #### Context:
+
+ - **Agent thoughts history shows**: Previous turn executed `tap` on the send button. Executor feedback confirms the tap was successful.
+ - **Current UI state**: The UI hierarchy shows we're still in the WhatsApp chat with Alice. The hierarchy contains text elements but doesn't clearly indicate delivery status.
+ - **Next step consideration**: We need visual confirmation of delivery checkmarks, which are not reliably exposed in the UI hierarchy.
+
+ #### Complete Subgoals By IDs:
+
+ ```text
+ ["subgoal-5-send-message"]
+ ```
+
+ #### Structured Decisions:
+
+ ```text
+ None
+ ```
+
+ #### Decisions Reason:
+
+ > None
+
+ #### Goals Completion Reason:
+
+ > Completing "send-message" subgoal because agent thoughts show the send button tap was executed successfully in the previous turn, and we remain in the chat screen (not an error state).
+
+ #### Screen Analysis Prompt:
+
+ ```text
+ Are there delivery checkmarks (single or double) visible next to the message "Hello!" in the chat? Describe their appearance.
+ ```

- > Analysis: No previous attempts, this is a fresh approach. I will tap the search icon to begin searching. I am providing its resource_id, coordinates, and text content to ensure the Executor can find it reliably, following the element rule.
+ **Why this makes sense:** We're completing the goal that ALREADY happened (sending the message) based on observed evidence from agent thoughts. We need screen analysis to verify delivery status for the next subgoal, but we have no execution decisions to make on the current screen. This respects the mutual exclusivity between execution decisions and screen analysis.

  ### Input
@@ -18,7 +18,6 @@ from minitap.mobile_use.context import MobileUseContext
  from minitap.mobile_use.graph.state import State
  from minitap.mobile_use.services.llm import get_llm, invoke_llm_with_timeout_message, with_fallback
  from minitap.mobile_use.tools.index import EXECUTOR_WRAPPERS_TOOLS, format_tools_list
- from minitap.mobile_use.utils.conversations import get_screenshot_message_for_llm
  from minitap.mobile_use.utils.decorators import wrap_with_callbacks
  from minitap.mobile_use.utils.logger import get_logger

@@ -62,10 +61,6 @@ class CortexNode:
  for thought in state.agents_thoughts:
  messages.append(AIMessage(content=thought))

- if state.latest_screenshot_base64:
- messages.append(get_screenshot_message_for_llm(state.latest_screenshot_base64))
- logger.info("Added screenshot to context")
-
  if state.latest_ui_hierarchy:
  ui_hierarchy_dict: list[dict] = state.latest_ui_hierarchy
  ui_hierarchy_str = json.dumps(ui_hierarchy_dict, indent=2, ensure_ascii=False)
@@ -86,27 +81,47 @@ class CortexNode:
  ),
  ) # type: ignore

- is_subgoal_completed = (
- response.complete_subgoals_by_ids is not None
- and len(response.complete_subgoals_by_ids) > 0
- and (len(response.decisions) == 0 or response.decisions in ["{}", "[]", "null", ""])
- )
- if not is_subgoal_completed:
- response.complete_subgoals_by_ids = []
+ EMPTY_STRING_TOKENS = ["{}", "[]", "null", "", "None"]
+
+ if response.decisions in EMPTY_STRING_TOKENS:
+ response.decisions = None
+ if response.goals_completion_reason in EMPTY_STRING_TOKENS:
+ response.goals_completion_reason = None
+ if response.screen_analysis_prompt in EMPTY_STRING_TOKENS:
+ response.screen_analysis_prompt = None
+
+ # Enforce mutual exclusivity: screen_analysis_prompt and decisions cannot coexist
+ # If both are provided, prioritize decisions and discard screen_analysis_prompt
+ if response.decisions is not None and response.screen_analysis_prompt is not None:
+ logger.warning(
+ "Both 'decisions' and 'screen_analysis_prompt' were provided. "
+ "Prioritizing execution decisions and discarding screen analysis request."
+ )
+ response.screen_analysis_prompt = None
+
+ thought_parts = []
+ if response.decisions_reason:
+ thought_parts.append(f"Decisions reason: {response.decisions_reason}")
+ if response.goals_completion_reason:
+ thought_parts.append(f"Goals completion reason: {response.goals_completion_reason}")
+ if response.screen_analysis_prompt:
+ thought_parts.append(f"Screen analysis query: {response.screen_analysis_prompt}")
+
+ agent_thought = "\n\n".join(thought_parts)

  return await state.asanitize_update(
  ctx=self.ctx,
  update={
- "agents_thoughts": [response.agent_thought],
- "structured_decisions": response.decisions if not is_subgoal_completed else None,
- "complete_subgoals_by_ids": response.complete_subgoals_by_ids or [],
- "latest_screenshot_base64": None,
+ "agents_thoughts": [agent_thought],
+ "structured_decisions": response.decisions,
+ "complete_subgoals_by_ids": response.complete_subgoals_by_ids,
+ "screen_analysis_prompt": response.screen_analysis_prompt,
  "latest_ui_hierarchy": None,
  "focused_app_info": None,
  "device_date": None,
  # Executor related fields
  EXECUTOR_MESSAGES_KEY: [RemoveMessage(id=REMOVE_ALL_MESSAGES)],
- "cortex_last_thought": response.agent_thought,
+ "cortex_last_thought": agent_thought,
  },
  agent="cortex",
  )
@@ -2,8 +2,22 @@ from pydantic import BaseModel, Field


  class CortexOutput(BaseModel):
- decisions: str = Field(..., description="The decisions to be made. A stringified JSON object")
- agent_thought: str = Field(..., description="The agent's thought")
- complete_subgoals_by_ids: list[str] | None = Field(
- [], description="List of subgoal IDs to complete"
+ decisions: str | None = Field(
+ default=None, description="The decisions to be made. A stringified JSON object"
+ )
+ decisions_reason: str | None = Field(default=None, description="The reason for the decisions")
+ goals_completion_reason: str | None = Field(
+ default=None,
+ description="The reason for the goals completion, if there are any goals to be completed.",
+ )
+ complete_subgoals_by_ids: list[str] = Field(
+ default_factory=list, description="List of subgoal IDs to complete"
+ )
+ screen_analysis_prompt: str | None = Field(
+ default=None,
+ description=(
+ "Optional prompt for the screen_analyzer agent. "
+ "Set this if you need visual analysis of the current screen. "
+ "The screen_analyzer will take a screenshot and answer your specific question."
+ ),
  )
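
With this change every field on `CortexOutput` is optional or defaulted, so completion-only and analysis-only turns validate without a `decisions` payload. A quick sketch using the field names from the diff above (the values themselves are illustrative):

```python
from minitap.mobile_use.agents.cortex.types import CortexOutput

# Analysis-only turn: no decisions, no completed subgoals.
analysis_turn = CortexOutput(
    screen_analysis_prompt="Is there a search icon visible in the top bar?"
)
assert analysis_turn.decisions is None
assert analysis_turn.complete_subgoals_by_ids == []

# Completion + execution turn, as allowed by the updated cortex prompt.
action_turn = CortexOutput(
    decisions='[{"action": "tap", "target": {"resource_id": "com.whatsapp:id/send"}}]',
    decisions_reason="Send button is visible; tapping it to submit the typed message.",
    goals_completion_reason="Typing was confirmed by previous executor feedback.",
    complete_subgoals_by_ids=["subgoal-4-type-message"],
)
```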
@@ -50,7 +50,7 @@ Call the `tap_on_element` tool with:

  #### 📝 Text Input Best Practice

- When using the `input_text` tool:
+ When using the `focus_and_input_text` tool:

  - **Provide all available information** in the target object to identify text input element
  - `resource_id`: The resource ID of the text input element (when available)
@@ -69,11 +69,11 @@ When using the `input_text` tool:

  #### 🔄 Text Clearing Best Practice

- When you need to completely clear text from an input field, always use the clear_text tool with the correct resource_id.
+ When you need to completely clear text from an input field, always use the focus_and_clear_text tool with the correct resource_id.

  This tool automatically takes care of focusing the element (if needed), and ensuring the field is fully emptied.

- Only and if only the clear_text tool fails to clear the text, try to long press the input, select all, and call erase_one_char.
+ Only and if only the focus_and_clear_text tool fails to clear the text, try to long press the input, select all, and call erase_one_char.

  #### 🔁 Final Notes

@@ -8,7 +8,7 @@ from langchain_google_vertexai.chat_models import ChatVertexAI
  from minitap.mobile_use.constants import EXECUTOR_MESSAGES_KEY
  from minitap.mobile_use.context import MobileUseContext
  from minitap.mobile_use.graph.state import State
- from minitap.mobile_use.services.llm import get_llm, invoke_llm_with_timeout_message
+ from minitap.mobile_use.services.llm import get_llm, invoke_llm_with_timeout_message, with_fallback
  from minitap.mobile_use.tools.index import EXECUTOR_WRAPPERS_TOOLS, get_tools_from_wrappers
  from minitap.mobile_use.utils.decorators import wrap_with_callbacks
  from minitap.mobile_use.utils.logger import get_logger
@@ -53,6 +53,7 @@ class ExecutorNode:
  ]

  llm = get_llm(ctx=self.ctx, name="executor")
+ llm_fallback = get_llm(ctx=self.ctx, name="executor", use_fallback=True)
  llm_bind_tools_kwargs: dict = {
  "tools": get_tools_from_wrappers(self.ctx, EXECUTOR_WRAPPERS_TOOLS),
  }
@@ -62,8 +63,14 @@ class ExecutorNode:
  llm_bind_tools_kwargs["parallel_tool_calls"] = True

  llm = llm.bind_tools(**llm_bind_tools_kwargs)
- response = await invoke_llm_with_timeout_message(
- llm.ainvoke(messages), agent_name="Executor"
+ llm_fallback = llm_fallback.bind_tools(**llm_bind_tools_kwargs)
+ response = await with_fallback(
+ main_call=lambda: invoke_llm_with_timeout_message(
+ llm.ainvoke(messages), agent_name="Executor"
+ ),
+ fallback_call=lambda: invoke_llm_with_timeout_message(
+ llm_fallback.ainvoke(messages), agent_name="Executor (Fallback)"
+ ),
  )
  return await state.asanitize_update(
  ctx=self.ctx,
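
The `with_fallback` helper now used here (and by the Hopper, Orchestrator, and Outputter below) comes from `minitap/mobile_use/services/llm.py`, whose change (+5 -2) is not expanded in this diff. Judging from the call sites, it is roughly a try-the-primary-model-then-retry-with-the-fallback wrapper; a hedged sketch of such a helper, not the package's actual implementation:

```python
import logging
from collections.abc import Awaitable, Callable
from typing import TypeVar

logger = logging.getLogger(__name__)
T = TypeVar("T")


async def with_fallback(
    main_call: Callable[[], Awaitable[T]],
    fallback_call: Callable[[], Awaitable[T]],
) -> T:
    """Run the primary LLM call; if it raises, retry once with the fallback model."""
    try:
        return await main_call()
    except Exception as exc:  # any provider or timeout error triggers the fallback
        logger.warning("Primary LLM call failed (%s); retrying with fallback.", exc)
        return await fallback_call()
```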
@@ -1,5 +1,33 @@
  ## Hopper

- The user will send you a batch of data you must dig in order to extract the most relevant information to reach the user's goal. Keep the information as is, do not modify it since the user will trigger actions based on it.
+ The user will send you a **batch of data**. Your role is to **dig through it** and extract the most relevant information needed to reach the user's goal.

- You'll need to output the extracted information in the `output` field, and you will describe what you did in the `step` field.
+ - **Keep the extracted information exactly as it appears** in the input. Do not reformat, paraphrase, or alter it.
+ - The user may rely on this raw data for triggering actions, so fidelity matters.
+
+ ---
+
+ ### Output Fields
+
+ - **output**: the extracted information.
+ - **reason**: a short explanation of what you looked for and how you decided what to extract.
+
+ ---
+
+ ### Rules
+
+ 1. **Search thoroughly**: The data may contain hundreds of entries. Scan the entire input carefully before concluding.
+
+ 2. **Match app names to package names**: When looking for an app package, look for package names where the app name (or a close variation) appears in the package identifier. Common patterns:
+ - App name in lowercase as part of the package
+ - Company/developer name followed by app name
+ - Brand name or abbreviated form of the app name
+ - Sometimes a codename or internal name related to the app
+
+ 3. **Prefer the most direct match**: If multiple packages contain similar terms, prefer the one where the app name appears most directly in the package identifier.
+
+ 4. **Consider variations**: App names may appear in different forms (abbreviated, translated, or with slight modifications) in package names.
+
+ 5. If the relevant information is **not found**, return `None`.
+
+ 6. If multiple plausible matches exist and you cannot determine which is correct, return `None` instead of guessing.
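
Rule 2 essentially asks for a fuzzy substring match between the app name and installed package identifiers. A rough illustration of that heuristic (the package names are only examples, and in the agent the matching is performed by the LLM, not by code like this):

```python
def candidate_packages(app_name: str, packages: list[str]) -> list[str]:
    """Return package identifiers that contain the app name (or a close variation)."""
    needle = app_name.lower().replace(" ", "")
    return [pkg for pkg in packages if needle in pkg.lower().replace(".", "")]


installed = ["com.whatsapp", "com.android.settings", "org.mozilla.firefox"]
print(candidate_packages("WhatsApp", installed))  # ['com.whatsapp']
# Per rules 5-6, an empty or ambiguous result should lead the Hopper to answer None.
```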
@@ -5,17 +5,15 @@ from langchain_core.messages import HumanMessage, SystemMessage
  from pydantic import BaseModel, Field

  from minitap.mobile_use.context import MobileUseContext
- from minitap.mobile_use.services.llm import get_llm, invoke_llm_with_timeout_message
+ from minitap.mobile_use.services.llm import get_llm, invoke_llm_with_timeout_message, with_fallback


  class HopperOutput(BaseModel):
- step: str = Field(
- description=(
- "The step that has been done, must be a valid one following the "
- "current steps and the current goal to achieve."
- )
- )
  output: str = Field(description="The interesting data extracted from the input data.")
+ reason: str = Field(
+ description="A short explanation of what you looked for"
+ + " and how you decided what to extract."
+ )


  async def hopper(
@@ -32,12 +30,18 @@ async def hopper(
  HumanMessage(content=f"{request}\nHere is the data you must dig:\n{data}"),
  ]

- llm = get_llm(ctx=ctx, name="hopper", is_utils=True, temperature=0)
- structured_llm = llm.with_structured_output(HopperOutput)
- response: HopperOutput = await invoke_llm_with_timeout_message(
- structured_llm.ainvoke(messages), agent_name="Hopper"
- ) # type: ignore
- return HopperOutput(
- step=response.step,
- output=response.output,
+ llm = get_llm(ctx=ctx, name="hopper", is_utils=True, temperature=0).with_structured_output(
+ HopperOutput
  )
+ llm_fallback = get_llm(
+ ctx=ctx, name="hopper", is_utils=True, use_fallback=True, temperature=0
+ ).with_structured_output(HopperOutput)
+ response: HopperOutput = await with_fallback(
+ main_call=lambda: invoke_llm_with_timeout_message(
+ llm.ainvoke(messages), agent_name="Hopper"
+ ),
+ fallback_call=lambda: invoke_llm_with_timeout_message(
+ llm_fallback.ainvoke(messages), agent_name="Hopper (Fallback)"
+ ),
+ ) # type: ignore
+ return response
@@ -15,7 +15,7 @@ from minitap.mobile_use.agents.planner.utils import (
  )
  from minitap.mobile_use.context import MobileUseContext
  from minitap.mobile_use.graph.state import State
- from minitap.mobile_use.services.llm import get_llm, invoke_llm_with_timeout_message
+ from minitap.mobile_use.services.llm import get_llm, invoke_llm_with_timeout_message, with_fallback
  from minitap.mobile_use.utils.decorators import wrap_with_callbacks
  from minitap.mobile_use.utils.logger import get_logger

@@ -74,10 +74,19 @@ class OrchestratorNode:
  HumanMessage(content=human_message),
  ]

- llm = get_llm(ctx=self.ctx, name="orchestrator", temperature=1)
- llm = llm.with_structured_output(OrchestratorOutput)
- response: OrchestratorOutput = await invoke_llm_with_timeout_message(
- llm.ainvoke(messages), agent_name="Orchestrator"
+ llm = get_llm(ctx=self.ctx, name="orchestrator", temperature=1).with_structured_output(
+ OrchestratorOutput
+ )
+ llm_fallback = get_llm(
+ ctx=self.ctx, name="orchestrator", use_fallback=True, temperature=1
+ ).with_structured_output(OrchestratorOutput)
+ response: OrchestratorOutput = await with_fallback(
+ main_call=lambda: invoke_llm_with_timeout_message(
+ llm.ainvoke(messages), agent_name="Orchestrator"
+ ),
+ fallback_call=lambda: invoke_llm_with_timeout_message(
+ llm_fallback.ainvoke(messages), agent_name="Orchestrator (Fallback)"
+ ),
  ) # type: ignore
  if response.needs_replaning:
  thoughts = [response.reason]
@@ -8,7 +8,7 @@ from pydantic import BaseModel
  from minitap.mobile_use.config import OutputConfig
  from minitap.mobile_use.context import MobileUseContext
  from minitap.mobile_use.graph.state import State
- from minitap.mobile_use.services.llm import get_llm, invoke_llm_with_timeout_message
+ from minitap.mobile_use.services.llm import get_llm, invoke_llm_with_timeout_message, with_fallback
  from minitap.mobile_use.utils.conversations import is_ai_message
  from minitap.mobile_use.utils.logger import get_logger

@@ -46,7 +46,11 @@ async def outputter(
  messages.append(HumanMessage(content=output_config.output_description))

  llm = get_llm(ctx=ctx, name="outputter", is_utils=True, temperature=1)
+ llm_fallback = get_llm(
+ ctx=ctx, name="outputter", is_utils=True, use_fallback=True, temperature=1
+ )
  structured_llm = llm
+ structured_llm_fallback = llm_fallback

  if output_config.structured_output:
  schema: dict | type[BaseModel] | None = None
@@ -61,9 +65,15 @@ async def outputter(

  if schema is not None:
  structured_llm = llm.with_structured_output(schema)
+ structured_llm_fallback = llm_fallback.with_structured_output(schema)

- response = await invoke_llm_with_timeout_message(
- structured_llm.ainvoke(messages), agent_name="Outputter"
+ response = await with_fallback(
+ main_call=lambda: invoke_llm_with_timeout_message(
+ structured_llm.ainvoke(messages), agent_name="Outputter"
+ ),
+ fallback_call=lambda: invoke_llm_with_timeout_message(
+ structured_llm_fallback.ainvoke(messages), agent_name="Outputter (Fallback)"
+ ),
  ) # type: ignore
  if isinstance(response, BaseModel):
  if output_config.output_description and hasattr(response, "content"):