minitap-mobile-use 2.2.0__py3-none-any.whl → 2.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of minitap-mobile-use might be problematic.

Files changed (59)
  1. minitap/mobile_use/agents/contextor/contextor.py +6 -4
  2. minitap/mobile_use/agents/cortex/cortex.md +114 -27
  3. minitap/mobile_use/agents/cortex/cortex.py +8 -5
  4. minitap/mobile_use/agents/executor/executor.md +15 -10
  5. minitap/mobile_use/agents/executor/executor.py +6 -5
  6. minitap/mobile_use/agents/executor/utils.py +2 -1
  7. minitap/mobile_use/agents/hopper/hopper.py +6 -3
  8. minitap/mobile_use/agents/orchestrator/orchestrator.py +26 -11
  9. minitap/mobile_use/agents/outputter/outputter.py +6 -3
  10. minitap/mobile_use/agents/outputter/test_outputter.py +104 -42
  11. minitap/mobile_use/agents/planner/planner.md +20 -22
  12. minitap/mobile_use/agents/planner/planner.py +10 -7
  13. minitap/mobile_use/agents/planner/types.py +4 -2
  14. minitap/mobile_use/agents/planner/utils.py +14 -0
  15. minitap/mobile_use/agents/summarizer/summarizer.py +2 -2
  16. minitap/mobile_use/config.py +6 -1
  17. minitap/mobile_use/context.py +13 -3
  18. minitap/mobile_use/controllers/mobile_command_controller.py +1 -14
  19. minitap/mobile_use/graph/state.py +7 -3
  20. minitap/mobile_use/sdk/agent.py +204 -29
  21. minitap/mobile_use/sdk/examples/README.md +19 -1
  22. minitap/mobile_use/sdk/examples/platform_minimal_example.py +46 -0
  23. minitap/mobile_use/sdk/services/platform.py +244 -0
  24. minitap/mobile_use/sdk/types/__init__.py +14 -14
  25. minitap/mobile_use/sdk/types/exceptions.py +57 -0
  26. minitap/mobile_use/sdk/types/platform.py +125 -0
  27. minitap/mobile_use/sdk/types/task.py +60 -17
  28. minitap/mobile_use/servers/device_hardware_bridge.py +3 -2
  29. minitap/mobile_use/servers/stop_servers.py +11 -12
  30. minitap/mobile_use/servers/utils.py +6 -9
  31. minitap/mobile_use/services/llm.py +89 -5
  32. minitap/mobile_use/tools/index.py +2 -8
  33. minitap/mobile_use/tools/mobile/back.py +3 -3
  34. minitap/mobile_use/tools/mobile/clear_text.py +67 -38
  35. minitap/mobile_use/tools/mobile/erase_one_char.py +5 -4
  36. minitap/mobile_use/tools/mobile/{take_screenshot.py → glimpse_screen.py} +23 -15
  37. minitap/mobile_use/tools/mobile/input_text.py +67 -16
  38. minitap/mobile_use/tools/mobile/launch_app.py +54 -22
  39. minitap/mobile_use/tools/mobile/long_press_on.py +15 -8
  40. minitap/mobile_use/tools/mobile/open_link.py +15 -8
  41. minitap/mobile_use/tools/mobile/press_key.py +15 -8
  42. minitap/mobile_use/tools/mobile/stop_app.py +14 -8
  43. minitap/mobile_use/tools/mobile/swipe.py +11 -5
  44. minitap/mobile_use/tools/mobile/tap.py +103 -21
  45. minitap/mobile_use/tools/mobile/wait_for_animation_to_end.py +3 -3
  46. minitap/mobile_use/tools/test_utils.py +377 -0
  47. minitap/mobile_use/tools/types.py +35 -0
  48. minitap/mobile_use/tools/utils.py +149 -39
  49. minitap/mobile_use/utils/recorder.py +1 -1
  50. minitap/mobile_use/utils/test_ui_hierarchy.py +178 -0
  51. minitap/mobile_use/utils/ui_hierarchy.py +11 -4
  52. {minitap_mobile_use-2.2.0.dist-info → minitap_mobile_use-2.4.0.dist-info}/METADATA +6 -4
  53. minitap_mobile_use-2.4.0.dist-info/RECORD +99 -0
  54. minitap/mobile_use/tools/mobile/copy_text_from.py +0 -73
  55. minitap/mobile_use/tools/mobile/find_packages.py +0 -69
  56. minitap/mobile_use/tools/mobile/paste_text.py +0 -62
  57. minitap_mobile_use-2.2.0.dist-info/RECORD +0 -96
  58. {minitap_mobile_use-2.2.0.dist-info → minitap_mobile_use-2.4.0.dist-info}/WHEEL +0 -0
  59. {minitap_mobile_use-2.2.0.dist-info → minitap_mobile_use-2.4.0.dist-info}/entry_points.txt +0 -0
minitap/mobile_use/agents/contextor/contextor.py

@@ -1,4 +1,5 @@
  from minitap.mobile_use.agents.executor.utils import is_last_tool_message_take_screenshot
+ from minitap.mobile_use.context import MobileUseContext
  from minitap.mobile_use.controllers.mobile_command_controller import get_screen_data
  from minitap.mobile_use.controllers.platform_specific_commands_controller import (
      get_device_date,
@@ -7,7 +8,6 @@ from minitap.mobile_use.controllers.platform_specific_commands_controller import
  from minitap.mobile_use.graph.state import State
  from minitap.mobile_use.utils.decorators import wrap_with_callbacks
  from minitap.mobile_use.utils.logger import get_logger
- from minitap.mobile_use.context import MobileUseContext

  logger = get_logger(__name__)

@@ -21,14 +21,16 @@ class ContextorNode:
      on_success=lambda _: logger.success("Contextor Agent"),
      on_failure=lambda _: logger.error("Contextor Agent"),
  )
- def __call__(self, state: State):
+ async def __call__(self, state: State):
      device_data = get_screen_data(self.ctx.screen_api_client)
      focused_app_info = get_focused_app_info(self.ctx)
      device_date = get_device_date(self.ctx)

-     should_add_screenshot_context = is_last_tool_message_take_screenshot(list(state.messages))
+     should_add_screenshot_context = is_last_tool_message_take_screenshot(
+         list(state.executor_messages)
+     )

-     return state.sanitize_update(
+     return await state.asanitize_update(
          ctx=self.ctx,
          update={
              "latest_screenshot_base64": device_data.base64
minitap/mobile_use/agents/cortex/cortex.md

@@ -4,19 +4,46 @@ Your job is to **analyze the current {{ platform }} mobile device state** and pr

  You must act like a human brain, responsible for giving instructions to your hands (the **Executor** agent). Therefore, you must act with the same imprecision and uncertainty as a human when performing swipe actions: humans don't know where exactly they are swiping (always prefer percentages of width and height instead of absolute coordinates), they just know they are swiping up or down, left or right, and with how much force (usually amplified compared to what's truly needed - go overboard on sliders, for instance).

- ### Context You Receive:
+ ### Core Principle: Break Unproductive Cycles

- You are provided with:
+ Your highest priority is to recognize when you are not making progress. You are in an unproductive cycle if a **sequence of actions brings you back to a previous state without achieving the subgoal.**

- - 📱 **Device state**:
+ If you detect a cycle, you are **FORBIDDEN** from repeating it. You must pivot your strategy.

-   - Latest **UI hierarchy**
-   - (Optional) Latest **screenshot (base64)**. You can query one if you need it by calling the take_screenshot tool. Often, the UI hierarchy is enough to understand what is happening on the screen.
-   - Current **focused app info**
-   - **Screen size** and **device date**
+ 1. **Announce the Pivot:** In your `agent_thought`, you must briefly state which workflow is failing and what your new approach is.

- - 🧭 **Task context**:
+ 2. **Find a Simpler Path:** Abandon the current workflow. Ask yourself: **"How would a human do this if this feature didn't exist?"** This usually means relying on fundamental actions like scrolling, swiping, or navigating through menus manually.
+
+ 3. **Retreat as a Last Resort:** If no simpler path exists, declare the subgoal a failure to trigger a replan.
+
+ ### How to Perceive the Screen: A Two-Sense Approach
+
+ To understand the device state, you have two senses, each with its purpose:
+
+ 1. **UI Hierarchy (Your sense of "Touch"):**
+    * **What it is:** A structured list of all elements on the screen.
+    * **Use it for:** Finding elements by `resource-id`, checking for specific text, and understanding the layout structure.
+    * **Limitation:** It does NOT tell you what the screen *looks* like. It can be incomplete, and it contains no information about images, colors, or whether an element is visually obscured.
+
+ 2. **`glimpse_screen` (Your sense of "Sight"):**
+    * **What it is:** A tool that provides a real, up-to-date image of the screen.
+    * **Use it for:** Confirming what is actually visible. This is your source of TRUTH for all visual information (icons, images, element positions, colors).
+    * **Golden Rule:** When the UI hierarchy is ambiguous, seems incomplete, or when you need to verify a visual detail before acting, **`glimpse_screen` is always the most effective and reliable action.** Never guess what the screen looks like; use your sight to be sure.
+
+ **CRITICAL NOTE ON SIGHT:** The visual information from `glimpse_screen` is **ephemeral**. It is available for **THIS decision turn ONLY**. You MUST extract all necessary information from it IMMEDIATELY, as it will be cleared before the next step.
+
+ ### CRITICAL ACTION DIRECTIVES
+
+ - **To open an application, you MUST use the `launch_app` tool.** Provide the natural language name of the app (e.g., "Uber Eats"). Do NOT attempt to open apps manually by swiping to the app drawer and searching. The `launch_app` tool is the fastest and most reliable method.
+ - **To open URLs/links, you MUST use the `open_link` tool.** This handles all links, including deep links, correctly.

+ ### Context You Receive:
+
+ - 📱 **Device state**:
+   - Latest **UI hierarchy** and (if available) a **screenshot**.
+   - **CRITICAL NOTE ON SIGHT:** The visual information from `glimpse_screen` is **ephemeral**. It is available for **THIS decision turn ONLY**. You MUST extract all necessary information from it IMMEDIATELY, as it will be cleared before the next step.
+
+ - 🧭 **Task context**:
    - The user's **initial goal**
    - The **subgoal plan** with their statuses
    - The **current subgoal** (the one in `PENDING` in the plan)
@@ -27,25 +54,70 @@ You are provided with:

  Focus on the **current PENDING subgoal and the next subgoals not yet started**.

- 1. **Analyze the UI** and environment to understand what action is required.
+ **CRITICAL: Before making any decision, you MUST thoroughly analyze the agent thoughts history to:**
+ - **Detect patterns of failure or repeated attempts** that suggest the current approach isn't working
+ - **Identify contradictions** between what was planned and what actually happened
+ - **Spot errors in previous reasoning** that need to be corrected
+ - **Learn from successful strategies** used in similar situations
+ - **Avoid repeating failed approaches** by recognizing when to change strategy
+
+ 1. **Analyze the agent thoughts first** - Review all previous agent thoughts to understand:
+    - What strategies have been tried and their outcomes
+    - Any errors or misconceptions in previous reasoning
+    - Patterns that indicate success or failure
+    - Whether the current approach should be continued or modified

- 2.1. If some of the subgoals must be **completed** based on your observations, add them to `complete_subgoals_by_ids`. To justify your conclusion, you will fill in the `agent_thought` field based on:
+ 2. **Then analyze the UI** and environment to understand what action is required, but always in the context of what the agent thoughts reveal about the situation.
+
+ 3. If some of the subgoals must be **completed** based on your observations, add them to `complete_subgoals_by_ids`. To justify your conclusion, you will fill in the `agent_thought` field based on:

    - The current UI state
-    - Past agent thoughts
-    - Recent tool effects
+    - **Critical analysis of past agent thoughts and their accuracy**
+    - Recent tool effects and whether they matched expectations from agent thoughts
+    - **Any corrections needed to previous reasoning or strategy**
+
+ ### The Rule of Element Interaction
+
+ **You MUST follow it for every element interaction.**
+
+ When you target a UI element (for a `tap`, `input_text`, `clear_text`, etc.), you **MUST** provide a comprehensive `target` object containing every piece of information you can find about **that single element**.

- 2.2. Otherwise, output a **stringified structured set of instructions** that an **Executor agent** can perform on a real mobile device:
+ * **1. `resource_id`**: Include this if it is present in the UI hierarchy.
+ * **2. `resource_id_index`**: If there are multiple elements with the same `resource_id`, provide the zero-based index of the specific one you are targeting.
+ * **3. `coordinates`**: Include the full bounds (`x`, `y`, `width`, `height`) if they are available.
+ * **4. `text`**: Include the *current text* content of the element (e.g., placeholder text for an input).
+ * **5. `text_index`**: If there are multiple elements with the same `text`, provide the zero-based index of the specific one you are targeting.
+
+ **CRITICAL: The index must correspond to its identifier.** `resource_id_index` is only used when targeting by `resource_id`. `text_index` is only used when targeting by `text`. This ensures the fallback logic targets the correct element.
+
+ **This is NOT optional.** Providing every locator you have is the foundation of the system's reliability. It allows later steps to use a fallback mechanism: if the ID fails, it tries the coordinates, etc. Failing to provide this complete context will lead to action failures.
+
+ ### The Rule of Unpredictable Actions
+
+ Certain actions have outcomes that can significantly and sometimes unpredictably change the UI. These include:
+ - `back`
+ - `launch_app`
+ - `stop_app`
+ - `open_link`
+ - `tap` on an element that is clearly for navigation (e.g., a "Back" button, a menu item, a link to another screen).
+
+ **CRITICAL RULE: If your decision includes one of these unpredictable actions, it MUST be the only action in your `Structured Decisions` for this turn. Otherwise, use flows to group actions together.**
+
+ This is not optional. Failing to isolate these actions will cause the system to act on an outdated understanding of the screen, leading to catastrophic errors. For example, after a `back` command, you MUST wait to see the new screen before deciding what to tap next.
+
+ You may only group simple, predictable actions together, such as tapping a text field and then immediately typing into it (`tap` followed by `input_text`).
+
+ ### Outputting Your Decisions
+
+ If you decide to act, output a **valid JSON stringified structured set of instructions** for the Executor.

    - These must be **concrete low-level actions**.
    - The executor has the following available tools: {{ executor_tools_list }}.
    - Your goal is to achieve subgoals **fast** - so you must put as many actions as possible in your instructions to complete all achievable subgoals (based on your observations) in one go.
-    - To open URLs/links directly, use the `open_link` tool - it will automatically handle opening in the appropriate browser. It also handles deep links.
-    - When you need to open an app, use the `find_packages` low-level action to try and get its name. Then, simply use the `launch_app` low-level action to launch it.
-    - If you refer to a UI element or coordinates, specify it clearly (e.g., `resource-id: com.whatsapp:id/search`, `text: "Alice"`, `x: 100, y: 200`).
+    - If you refer to a UI element or coordinates, specify it clearly (e.g., `resource-id: com.whatsapp:id/search`, `resource-id-index: 0`, `text: "Alice"`, `text-index: 0`, `x: 100, y: 200, width: 100, height: 100`).
    - **The structure is up to you**, but it must be valid **JSON stringified output**. You will accompany this output with a **natural-language summary** of your reasoning and approach in your agent thought.
-    - **Never use a sequence of `tap` + `input_text` to type into a field. Always use a single `input_text` action** with the correct `resource_id` (this already ensures the element is focused and the cursor is moved to the end).
-    - When you want to launch/stop an app, prefer using its package name.
+    - **Always use a single `input_text` action** to type into a field. This tool handles focusing the element and placing the cursor correctly. If the tool feedback indicates verification is needed or shows None/empty content, perform verification before proceeding.
    - **Only reference UI element IDs or visible texts that are explicitly present in the provided UI hierarchy or screenshot. Do not invent, infer, or guess any IDs or texts that are not directly observed**.
    - **For text clearing**: When you need to completely clear text from an input field, always call the `clear_text` tool with the correct resource_id. This tool automatically focuses the element, and ensures the field is emptied. If you notice this tool fails to clear the text, try to long press the input, select all, and call `erase_one_char`.
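The "Rule of Element Interaction" above feeds a locator fallback chain on the execution side; the reworked `minitap/mobile_use/tools/utils.py` (+149 -39 in the file list) is the likely home of that logic, though its code is not shown in this diff. The following is only a hypothetical sketch of such a chain, with every name assumed:

```python
# Hypothetical illustration of the locator fallback chain the cortex prompt
# describes. All names (Bounds, Target, resolve_tap_point) are assumptions,
# not the actual minitap/mobile_use API.
from dataclasses import dataclass


@dataclass
class Bounds:
    x: int
    y: int
    width: int
    height: int

    @property
    def center(self) -> tuple[int, int]:
        return (self.x + self.width // 2, self.y + self.height // 2)


@dataclass
class Target:
    resource_id: str | None = None
    resource_id_index: int = 0
    text: str | None = None
    text_index: int = 0
    coordinates: Bounds | None = None


def resolve_tap_point(target: Target, hierarchy: list[dict]) -> tuple[int, int] | None:
    """Try each locator in order of reliability: resource_id, then text, then raw bounds."""
    if target.resource_id is not None:
        matches = [n for n in hierarchy if n.get("resource-id") == target.resource_id]
        if len(matches) > target.resource_id_index:
            return Bounds(**matches[target.resource_id_index]["bounds"]).center
    if target.text is not None:
        matches = [n for n in hierarchy if n.get("text") == target.text]
        if len(matches) > target.text_index:
            return Bounds(**matches[target.text_index]["bounds"]).center
    if target.coordinates is not None:
        return target.coordinates.center  # last resort: the bounds the Cortex saw
    return None
```

Ordering the locators by stability (ID, then text, then raw coordinates) is what makes the "provide everything" rule pay off: each extra field is another chance to recover when the preferred locator misses.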
minitap/mobile_use/agents/cortex/cortex.md

@@ -57,17 +129,35 @@ Focus on the **current PENDING subgoal and the next subgoals not yet started**.
  - **Structured Decisions** _(optional)_:
    A **valid stringified JSON** describing what should be executed **right now** to advance through the subgoals as much as possible.

- - **Agent Thought** _(1-2 sentences)_:
-   If there is any information you need to remember for later steps, you must include it here, because only the agent thoughts will be used to produce the final structured output.
+ - **Agent Thought** _(2-4 sentences)_:
+   **MANDATORY: Start by analyzing previous agent thoughts** - Did previous reasoning contain errors? Are we repeating failed approaches? What worked before in similar situations?
+
+   Then explain your current decision based on this analysis. If there is any information you need to remember for later steps, you must include it here, because only the agent thoughts will be used to produce the final structured output.

-   This also helps other agents understand your decision and learn from future failures.
+   This also helps other agents understand your decision and learn from future failures. **Explicitly mention if you're correcting a previous error or changing strategy based on agent thoughts analysis.**
    You must also use this field to mention checkpoints when you perform actions without a definite ending: for instance "Swiping up to reveal more recipes - last seen recipe was <ID or NAME>, stop when no more".

  **Important:** `complete_subgoals_by_ids` and the structured decisions are mutually exclusive: if you provide both, the structured decisions will be ignored. Therefore, you must always prioritize completing subgoals over providing structured decisions.

  ---

- ### Example
+ ### Example 1
+
+ #### Current Subgoal:
+
+ > "Open WhatsApp"
+
+ #### Structured Decisions:
+
+ ```text
+ "{\"action\": \"launch_app\", \"app_name\": \"WhatsApp\"}"
+ ```
+
+ #### Agent Thought:
+
+ > I need to launch the WhatsApp app. I will use the `launch_app` tool to open it.
+
+ ### Example 2

  #### Current Subgoal:

@@ -76,12 +166,12 @@ Focus on the **current PENDING subgoal and the next subgoals not yet started**.
  #### Structured Decisions:

  ```text
- "{\"action\": \"tap\", \"target\": {\"resource_id\": \"com.whatsapp:id/menuitem_search\", \"text\": \"Search\"}}"
+ "[{\"action\": \"tap\", \"target\": {\"resource_id\": \"com.whatsapp:id/menuitem_search\", \"resource_id_index\": 1, \"text\": \"Search\", \"text_index\": 0, \"coordinates\": {\"x\": 880, \"y\": 150, \"width\": 120, \"height\": 120}}}]"
  ```

  #### Agent Thought:

- > I will tap the search icon at the top of the WhatsApp interface to begin searching for Alice.
+ > Analysis: No previous attempts, this is a fresh approach. I will tap the search icon to begin searching. I am providing its resource_id, coordinates, and text content to ensure the Executor can find it reliably, following the element rule.

  ### Input

@@ -94,9 +184,6 @@ Focus on the **current PENDING subgoal and the next subgoals not yet started**.
  **Current Subgoal (what needs to be done right now):**
  {{ current_subgoal }}

- **Agent thoughts (previous reasoning, observations about the environment):**
- {{ agents_thoughts }}
-
  **Executor agent feedback on latest UI decisions:**

  {{ executor_feedback }}
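Note that the `Structured Decisions` payloads in both examples are stringified JSON: a JSON array serialized into a single string field. Purely as an illustration, the Example 2 payload round-trips in Python like this:

```python
import json

# The escaped payload from Example 2 above, exactly as the Cortex would emit it.
decision_str = (
    '[{"action": "tap", "target": {"resource_id": '
    '"com.whatsapp:id/menuitem_search", "resource_id_index": 1, '
    '"text": "Search", "text_index": 0, '
    '"coordinates": {"x": 880, "y": 150, "width": 120, "height": 120}}}]'
)
decisions = json.loads(decision_str)
assert decisions[0]["action"] == "tap"
assert decisions[0]["target"]["coordinates"]["x"] == 880
```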
minitap/mobile_use/agents/cortex/cortex.py

@@ -16,7 +16,7 @@ from minitap.mobile_use.agents.planner.utils import get_current_subgoal
  from minitap.mobile_use.constants import EXECUTOR_MESSAGES_KEY
  from minitap.mobile_use.context import MobileUseContext
  from minitap.mobile_use.graph.state import State
- from minitap.mobile_use.services.llm import get_llm, with_fallback
+ from minitap.mobile_use.services.llm import get_llm, invoke_llm_with_timeout_message, with_fallback
  from minitap.mobile_use.tools.index import EXECUTOR_WRAPPERS_TOOLS, format_tools_list
  from minitap.mobile_use.utils.conversations import get_screenshot_message_for_llm
  from minitap.mobile_use.utils.decorators import wrap_with_callbacks
@@ -44,7 +44,6 @@ class CortexNode:
      initial_goal=state.initial_goal,
      subgoal_plan=state.subgoal_plan,
      current_subgoal=get_current_subgoal(state.subgoal_plan),
-     agents_thoughts=state.agents_thoughts,
      executor_feedback=executor_feedback,
      executor_tools_list=format_tools_list(ctx=self.ctx, wrappers=EXECUTOR_WRAPPERS_TOOLS),
  )
@@ -79,8 +78,12 @@ class CortexNode:
      ctx=self.ctx, name="cortex", use_fallback=True, temperature=1
  ).with_structured_output(CortexOutput)
  response: CortexOutput = await with_fallback(
-     main_call=lambda: llm.ainvoke(messages),
-     fallback_call=lambda: llm_fallback.ainvoke(messages),
+     main_call=lambda: invoke_llm_with_timeout_message(
+         llm.ainvoke(messages), agent_name="Cortex"
+     ),
+     fallback_call=lambda: invoke_llm_with_timeout_message(
+         llm_fallback.ainvoke(messages), agent_name="Cortex (Fallback)"
+     ),
  )  # type: ignore

  is_subgoal_completed = (
@@ -91,7 +94,7 @@ class CortexNode:
  if not is_subgoal_completed:
      response.complete_subgoals_by_ids = []

- return state.sanitize_update(
+ return await state.asanitize_update(
      ctx=self.ctx,
      update={
          "agents_thoughts": [response.agent_thought],
minitap/mobile_use/agents/executor/executor.md

@@ -25,13 +25,7 @@ and your previous actions, you must:
  "I'm tapping on the chat item labeled 'Alice' to open the conversation."

  ```json
- {
-   "action": "tap",
-   "target": {
-     "text": "Alice",
-     "resource_id": "com.whatsapp:id/conversation_item"
-   }
- }
+ "[{\"tool_name\": \"tap\", \"arguments\": {\"target\": {\"resource_id\": \"com.whatsapp:id/conversation_item\", \"resource_id_index\": 0, \"text\": \"Alice\", \"text_index\": 0, \"coordinates\": {\"x\": 0, \"y\": 350, \"width\": 1080, \"height\": 80}}}}]"
  ```

  **→ Executor Action**:
@@ -39,14 +33,17 @@ and your previous actions, you must:

  Call the `tap_on_element` tool with:

  - `resource_id = "com.whatsapp:id/conversation_item"`
+ - `resource_id_index = 0`
  - `text = "Alice"`
+ - `text_index = 0`
+ - `coordinates = {"x": 0, "y": 350, "width": 1080, "height": 80}`
  - `agent_thought = "I'm tapping on the chat item labeled 'Alice' to open the conversation."`

  ---

  ### ⚙️ Tools

- - Tools may include actions like: `tap`, `swipe`, `start_app`, `stop_app`, `find_packages`, `get_current_focus`, etc.
+ - Tools may include actions like: `tap`, `swipe`, `launch_app`, `stop_app`, etc.
  - You **must not hardcode tool definitions** here.
  - Just use the right tool based on what the `structured_decisions` requires.
  - The tools are provided dynamically via LangGraph's tool binding mechanism.
@@ -55,13 +52,21 @@ Call the `tap_on_element` tool with:

  When using the `input_text` tool:

- - **Always provide the `resource_id` of the element** you want to type into.
+ - **Provide all available information** in the target object to identify the text input element:
+   - `resource_id`: The resource ID of the text input element (when available)
+   - `resource_id_index`: The zero-based index of the specific resource ID you are targeting (when available)
+   - `text`: The current text content of the text input element (when available)
+   - `text_index`: The zero-based index of the specific text you are targeting (when available)
+   - `coordinates`: The bounds (ElementBounds) of the text input element (when available)
+
  - The tool will automatically:

-   1. **Focus the element first**
+   1. **Focus the element** using the provided identification parameters
    2. **Move the cursor to the end** of the existing text
    3. **Then type the new text**

+ - **Important**: Special characters and markdown-like escape sequences (e.g., \n, \t, *, _) are not interpreted. For example, typing \n will insert the literal characters \ and n, not a line break.
+
  #### 🔄 Text Clearing Best Practice

  When you need to completely clear text from an input field, always use the clear_text tool with the correct resource_id.
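The target object these instructions describe lines up with the new `minitap/mobile_use/tools/types.py` (+35 lines in the file list), whose contents are not shown in this diff. A plausible Pydantic sketch, inferred purely from the fields the prompts name (model and field names are assumptions):

```python
# Plausible models for the executor's target object, inferred from the fields
# documented above. The real tools/types.py may differ.
from pydantic import BaseModel


class ElementBounds(BaseModel):
    x: int
    y: int
    width: int
    height: int


class Target(BaseModel):
    resource_id: str | None = None
    resource_id_index: int = 0
    text: str | None = None
    text_index: int = 0
    coordinates: ElementBounds | None = None


# Example: the tap target from the executor.md sample above.
target = Target(
    resource_id="com.whatsapp:id/conversation_item",
    resource_id_index=0,
    text="Alice",
    text_index=0,
    coordinates=ElementBounds(x=0, y=350, width=1080, height=80),
)
```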
minitap/mobile_use/agents/executor/executor.py

@@ -8,7 +8,7 @@ from langchain_google_vertexai.chat_models import ChatVertexAI
  from minitap.mobile_use.constants import EXECUTOR_MESSAGES_KEY
  from minitap.mobile_use.context import MobileUseContext
  from minitap.mobile_use.graph.state import State
- from minitap.mobile_use.services.llm import get_llm
+ from minitap.mobile_use.services.llm import get_llm, invoke_llm_with_timeout_message
  from minitap.mobile_use.tools.index import EXECUTOR_WRAPPERS_TOOLS, get_tools_from_wrappers
  from minitap.mobile_use.utils.decorators import wrap_with_callbacks
  from minitap.mobile_use.utils.logger import get_logger
@@ -29,7 +29,7 @@ class ExecutorNode:
  structured_decisions = state.structured_decisions
  if not structured_decisions:
      logger.warning("No structured decisions found.")
-     return state.sanitize_update(
+     return await state.asanitize_update(
          ctx=self.ctx,
          update={
              "agents_thoughts": [
@@ -62,9 +62,10 @@ class ExecutorNode:
      llm_bind_tools_kwargs["parallel_tool_calls"] = True

  llm = llm.bind_tools(**llm_bind_tools_kwargs)
- response = await llm.ainvoke(messages)
-
- return state.sanitize_update(
+ response = await invoke_llm_with_timeout_message(
+     llm.ainvoke(messages), agent_name="Executor"
+ )
+ return await state.asanitize_update(
      ctx=self.ctx,
      update={
          "cortex_last_thought": cortex_last_thought,
minitap/mobile_use/agents/executor/utils.py

@@ -1,4 +1,5 @@
  from langchain_core.messages import BaseMessage
+
  from minitap.mobile_use.utils.conversations import is_tool_message


@@ -7,5 +8,5 @@ def is_last_tool_message_take_screenshot(messages: list[BaseMessage]) -> bool:
      return False
  for msg in messages[::-1]:
      if is_tool_message(msg):
-         return msg.name == "take_screenshot"
+         return msg.name == "glimpse_screen"
  return False
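The helper keeps its old `take_screenshot` name even though it now matches the renamed `glimpse_screen` tool (file 36 in the list). A quick illustration of its behavior with langchain-core message types; the message contents below are invented for the example:

```python
# Illustration only: the message payloads here are made up.
from langchain_core.messages import AIMessage, ToolMessage

from minitap.mobile_use.agents.executor.utils import (
    is_last_tool_message_take_screenshot,
)

messages = [
    AIMessage(content="Let me look at the screen."),
    ToolMessage(content="<base64 png>", name="glimpse_screen", tool_call_id="call_1"),
]
assert is_last_tool_message_take_screenshot(messages) is True

# Any later tool message with a different name flips the check back to False.
messages.append(ToolMessage(content="ok", name="tap", tool_call_id="call_2"))
assert is_last_tool_message_take_screenshot(messages) is False
```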
minitap/mobile_use/agents/hopper/hopper.py

@@ -2,10 +2,11 @@ from pathlib import Path

  from jinja2 import Template
  from langchain_core.messages import HumanMessage, SystemMessage
- from minitap.mobile_use.context import MobileUseContext
- from minitap.mobile_use.services.llm import get_llm
  from pydantic import BaseModel, Field

+ from minitap.mobile_use.context import MobileUseContext
+ from minitap.mobile_use.services.llm import get_llm, invoke_llm_with_timeout_message
+

  class HopperOutput(BaseModel):
      step: str = Field(
@@ -33,7 +34,9 @@ async def hopper(

      llm = get_llm(ctx=ctx, name="hopper", is_utils=True, temperature=0)
      structured_llm = llm.with_structured_output(HopperOutput)
-     response: HopperOutput = await structured_llm.ainvoke(messages)  # type: ignore
+     response: HopperOutput = await invoke_llm_with_timeout_message(
+         structured_llm.ainvoke(messages), agent_name="Hopper"
+     )  # type: ignore
      return HopperOutput(
          step=response.step,
          output=response.output,
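Every agent now funnels its `ainvoke` calls through `invoke_llm_with_timeout_message` from `minitap/mobile_use/services/llm.py` (+89 -5), whose implementation is not part of this diff. Judging only from its call sites (an awaitable plus an `agent_name` keyword), one plausible shape is a wrapper that logs which agent is still waiting on a slow LLM call:

```python
# Plausible sketch of invoke_llm_with_timeout_message based only on its call
# sites in this diff. The timeout value and logging behavior are assumptions.
import asyncio
import logging
from collections.abc import Coroutine
from typing import Any, TypeVar

logger = logging.getLogger(__name__)
T = TypeVar("T")


async def invoke_llm_with_timeout_message(
    coro: Coroutine[Any, Any, T],
    agent_name: str,
    warn_after_s: float = 30.0,
) -> T:
    """Await an LLM call, logging a message if it is still pending after a delay."""
    task = asyncio.ensure_future(coro)
    try:
        return await asyncio.wait_for(asyncio.shield(task), timeout=warn_after_s)
    except asyncio.TimeoutError:
        logger.warning("%s: LLM call still running after %.0fs", agent_name, warn_after_s)
        return await task  # keep waiting; the message is informational only
```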
minitap/mobile_use/agents/orchestrator/orchestrator.py

@@ -15,7 +15,7 @@ from minitap.mobile_use.agents.planner.utils import (
  )
  from minitap.mobile_use.context import MobileUseContext
  from minitap.mobile_use.graph.state import State
- from minitap.mobile_use.services.llm import get_llm
+ from minitap.mobile_use.services.llm import get_llm, invoke_llm_with_timeout_message
  from minitap.mobile_use.utils.decorators import wrap_with_callbacks
  from minitap.mobile_use.utils.logger import get_logger
@@ -45,14 +45,18 @@ class OrchestratorNode:
              else f"Starting the next subgoal: {new_subgoal}"
          )
      ]
-     return _get_state_update(ctx=self.ctx, state=state, thoughts=thoughts, update_plan=True)
+     return await _get_state_update(
+         ctx=self.ctx, state=state, thoughts=thoughts, update_plan=True
+     )

  subgoals_to_examine = get_subgoals_by_ids(
      subgoals=state.subgoal_plan,
      ids=state.complete_subgoals_by_ids,
  )
  if len(subgoals_to_examine) <= 0:
-     return _get_state_update(ctx=self.ctx, state=state, thoughts=["No subgoal to examine."])
+     return await _get_state_update(
+         ctx=self.ctx, state=state, thoughts=["No subgoal to examine."]
+     )

  system_message = Template(
      Path(__file__).parent.joinpath("orchestrator.md").read_text(encoding="utf-8")
@@ -72,13 +76,16 @@ class OrchestratorNode:

  llm = get_llm(ctx=self.ctx, name="orchestrator", temperature=1)
  llm = llm.with_structured_output(OrchestratorOutput)
- response: OrchestratorOutput = await llm.ainvoke(messages)  # type: ignore
-
+ response: OrchestratorOutput = await invoke_llm_with_timeout_message(
+     llm.ainvoke(messages), agent_name="Orchestrator"
+ )  # type: ignore
  if response.needs_replaning:
      thoughts = [response.reason]
      state.subgoal_plan = fail_current_subgoal(state.subgoal_plan)
      thoughts.append("==== END OF PLAN, REPLANNING ====")
-     return _get_state_update(ctx=self.ctx, state=state, thoughts=thoughts, update_plan=True)
+     return await _get_state_update(
+         ctx=self.ctx, state=state, thoughts=thoughts, update_plan=True
+     )

  state.subgoal_plan = complete_subgoals_by_ids(
      subgoals=state.subgoal_plan,
@@ -87,19 +94,25 @@ class OrchestratorNode:
  thoughts = [response.reason]
  if all_completed(state.subgoal_plan):
      logger.success("All the subgoals have been completed successfully.")
-     return _get_state_update(ctx=self.ctx, state=state, thoughts=thoughts, update_plan=True)
+     return await _get_state_update(
+         ctx=self.ctx, state=state, thoughts=thoughts, update_plan=True
+     )

  if current_subgoal.id not in response.completed_subgoal_ids:
      # The current subgoal is not yet complete.
-     return _get_state_update(ctx=self.ctx, state=state, thoughts=thoughts, update_plan=True)
+     return await _get_state_update(
+         ctx=self.ctx, state=state, thoughts=thoughts, update_plan=True
+     )

  state.subgoal_plan = start_next_subgoal(state.subgoal_plan)
  new_subgoal = get_current_subgoal(state.subgoal_plan)
  thoughts.append(f"==== NEXT SUBGOAL: {new_subgoal} ====")
- return _get_state_update(ctx=self.ctx, state=state, thoughts=thoughts, update_plan=True)
+ return await _get_state_update(
+     ctx=self.ctx, state=state, thoughts=thoughts, update_plan=True
+ )


- def _get_state_update(
+ async def _get_state_update(
      ctx: MobileUseContext,
      state: State,
      thoughts: list[str],
@@ -111,4 +124,6 @@ def _get_state_update(
      }
      if update_plan:
          update["subgoal_plan"] = state.subgoal_plan
-     return state.sanitize_update(ctx=ctx, update=update, agent="orchestrator")
+     if ctx.on_plan_changes:
+         await ctx.on_plan_changes(state.subgoal_plan, False)
+     return await state.asanitize_update(ctx=ctx, update=update, agent="orchestrator")
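The new `ctx.on_plan_changes` hook (see also `minitap/mobile_use/context.py`, +13 -3, in the file list) lets SDK consumers observe plan updates as the orchestrator makes them. The callback signature here is inferred from the call site above (the subgoal list plus a boolean flag); how it gets registered on the context is an assumption:

```python
# Usage sketch inferred from `await ctx.on_plan_changes(state.subgoal_plan, False)`
# above. The registration path on MobileUseContext is assumed, not shown in the diff.
from typing import Any


async def log_plan_changes(subgoal_plan: list[Any], is_final: bool) -> None:
    """Print the plan every time the orchestrator updates it."""
    marker = "FINAL" if is_final else "UPDATE"
    for subgoal in subgoal_plan:
        print(f"[{marker}] {subgoal}")


# Hypothetical wiring; the real SDK may expose this via Agent or TaskRequest:
# ctx = MobileUseContext(..., on_plan_changes=log_plan_changes)
```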
minitap/mobile_use/agents/outputter/outputter.py

@@ -3,13 +3,14 @@ from pathlib import Path

  from jinja2 import Template
  from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage
+ from pydantic import BaseModel
+
  from minitap.mobile_use.config import OutputConfig
  from minitap.mobile_use.context import MobileUseContext
  from minitap.mobile_use.graph.state import State
- from minitap.mobile_use.services.llm import get_llm
+ from minitap.mobile_use.services.llm import get_llm, invoke_llm_with_timeout_message
  from minitap.mobile_use.utils.conversations import is_ai_message
  from minitap.mobile_use.utils.logger import get_logger
- from pydantic import BaseModel

  logger = get_logger(__name__)

@@ -61,7 +62,9 @@ async def outputter(
  if schema is not None:
      structured_llm = llm.with_structured_output(schema)

- response = await structured_llm.ainvoke(messages)  # type: ignore
+ response = await invoke_llm_with_timeout_message(
+     structured_llm.ainvoke(messages), agent_name="Outputter"
+ )  # type: ignore
  if isinstance(response, BaseModel):
      if output_config.output_description and hasattr(response, "content"):
          response = json.loads(response.content)  # type: ignore