minitap-mobile-use 2.0.0__py3-none-any.whl → 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of minitap-mobile-use might be problematic. Click here for more details.

Files changed (74) hide show
  1. minitap/mobile_use/agents/cortex/cortex.md +19 -10
  2. minitap/mobile_use/agents/cortex/cortex.py +15 -2
  3. minitap/mobile_use/agents/cortex/types.py +2 -4
  4. minitap/mobile_use/agents/executor/executor.md +20 -15
  5. minitap/mobile_use/agents/executor/executor.py +6 -18
  6. minitap/mobile_use/agents/executor/tool_node.py +105 -0
  7. minitap/mobile_use/agents/hopper/hopper.md +2 -10
  8. minitap/mobile_use/agents/hopper/hopper.py +4 -9
  9. minitap/mobile_use/agents/orchestrator/human.md +3 -4
  10. minitap/mobile_use/agents/orchestrator/orchestrator.md +25 -7
  11. minitap/mobile_use/agents/orchestrator/orchestrator.py +56 -56
  12. minitap/mobile_use/agents/orchestrator/types.py +5 -8
  13. minitap/mobile_use/agents/outputter/outputter.py +1 -2
  14. minitap/mobile_use/agents/planner/planner.md +25 -15
  15. minitap/mobile_use/agents/planner/planner.py +7 -1
  16. minitap/mobile_use/agents/planner/types.py +10 -5
  17. minitap/mobile_use/agents/planner/utils.py +11 -0
  18. minitap/mobile_use/agents/summarizer/summarizer.py +2 -1
  19. minitap/mobile_use/clients/device_hardware_client.py +3 -0
  20. minitap/mobile_use/config.py +16 -14
  21. minitap/mobile_use/constants.py +1 -0
  22. minitap/mobile_use/context.py +3 -4
  23. minitap/mobile_use/controllers/mobile_command_controller.py +37 -26
  24. minitap/mobile_use/controllers/platform_specific_commands_controller.py +3 -4
  25. minitap/mobile_use/graph/graph.py +10 -31
  26. minitap/mobile_use/graph/state.py +34 -14
  27. minitap/mobile_use/main.py +11 -8
  28. minitap/mobile_use/sdk/agent.py +78 -63
  29. minitap/mobile_use/sdk/builders/agent_config_builder.py +23 -11
  30. minitap/mobile_use/sdk/builders/task_request_builder.py +9 -9
  31. minitap/mobile_use/sdk/examples/smart_notification_assistant.py +1 -2
  32. minitap/mobile_use/sdk/types/agent.py +10 -5
  33. minitap/mobile_use/sdk/types/task.py +19 -18
  34. minitap/mobile_use/sdk/utils.py +1 -1
  35. minitap/mobile_use/servers/config.py +1 -2
  36. minitap/mobile_use/servers/device_hardware_bridge.py +3 -4
  37. minitap/mobile_use/servers/start_servers.py +4 -4
  38. minitap/mobile_use/servers/stop_servers.py +12 -18
  39. minitap/mobile_use/services/llm.py +4 -2
  40. minitap/mobile_use/tools/index.py +11 -7
  41. minitap/mobile_use/tools/mobile/back.py +8 -12
  42. minitap/mobile_use/tools/mobile/clear_text.py +277 -0
  43. minitap/mobile_use/tools/mobile/copy_text_from.py +8 -12
  44. minitap/mobile_use/tools/mobile/erase_one_char.py +56 -0
  45. minitap/mobile_use/tools/mobile/find_packages.py +69 -0
  46. minitap/mobile_use/tools/mobile/input_text.py +55 -32
  47. minitap/mobile_use/tools/mobile/launch_app.py +8 -12
  48. minitap/mobile_use/tools/mobile/long_press_on.py +9 -13
  49. minitap/mobile_use/tools/mobile/open_link.py +8 -12
  50. minitap/mobile_use/tools/mobile/paste_text.py +8 -12
  51. minitap/mobile_use/tools/mobile/press_key.py +8 -12
  52. minitap/mobile_use/tools/mobile/stop_app.py +9 -13
  53. minitap/mobile_use/tools/mobile/swipe.py +8 -12
  54. minitap/mobile_use/tools/mobile/take_screenshot.py +8 -12
  55. minitap/mobile_use/tools/mobile/tap.py +9 -13
  56. minitap/mobile_use/tools/mobile/wait_for_animation_to_end.py +9 -13
  57. minitap/mobile_use/tools/tool_wrapper.py +1 -23
  58. minitap/mobile_use/tools/utils.py +86 -0
  59. minitap/mobile_use/utils/cli_helpers.py +1 -2
  60. minitap/mobile_use/utils/cli_selection.py +5 -6
  61. minitap/mobile_use/utils/decorators.py +21 -20
  62. minitap/mobile_use/utils/logger.py +3 -4
  63. minitap/mobile_use/utils/media.py +1 -1
  64. minitap/mobile_use/utils/recorder.py +11 -10
  65. minitap/mobile_use/utils/ui_hierarchy.py +98 -3
  66. {minitap_mobile_use-2.0.0.dist-info → minitap_mobile_use-2.1.0.dist-info}/METADATA +12 -2
  67. minitap_mobile_use-2.1.0.dist-info/RECORD +96 -0
  68. minitap/mobile_use/agents/executor/executor_context_cleaner.py +0 -27
  69. minitap/mobile_use/tools/mobile/erase_text.py +0 -124
  70. minitap/mobile_use/tools/mobile/list_packages.py +0 -78
  71. minitap/mobile_use/tools/mobile/run_flow.py +0 -57
  72. minitap_mobile_use-2.0.0.dist-info/RECORD +0 -95
  73. {minitap_mobile_use-2.0.0.dist-info → minitap_mobile_use-2.1.0.dist-info}/WHEEL +0 -0
  74. {minitap_mobile_use-2.0.0.dist-info → minitap_mobile_use-2.1.0.dist-info}/entry_points.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  ## You are the **Cortex**
2
2
 
3
- Your job is to **analyze the current {{ platform }} mobile device state** and produce **structured decisions** to achieve the current subgoal.
3
+ Your job is to **analyze the current {{ platform }} mobile device state** and produce **structured decisions** to achieve the current subgoal and more consecutive subgoals if possible.
4
4
 
5
5
  You must act like a human brain, responsible for giving instructions to your hands (the **Executor** agent). Therefore, you must act with the same imprecision and uncertainty as a human when performing swipe actions: humans don't know where exactly they are swiping (always prefer percentages of width and height instead of absolute coordinates), they just know they are swiping up or down, left or right, and with how much force (usually amplified compared to what's truly needed - go overboard of sliders for instance).
6
6
 
@@ -19,33 +19,43 @@ You are provided with:
19
19
 
20
20
  - The user's **initial goal**
21
21
  - The **subgoal plan** with their statuses
22
- - The **current subgoal** to act on (the one in `PENDING` in the plan)
22
+ - The **current subgoal** (the one in `PENDING` in the plan)
23
23
  - A list of **agent thoughts** (previous reasoning, observations about the environment)
24
24
  - **Executor agent feedback** on the latest UI decisions
25
25
 
26
26
  ### Your Mission:
27
27
 
28
- Focus on the **current subgoal**.
28
+ Focus on the **current PENDING subgoal and the next subgoals not yet started**.
29
29
 
30
30
  1. **Analyze the UI** and environment to understand what action is required.
31
- 2.1. If the **subgoal is completed**, set the `complete_subgoal` field to `True`. To justify your conclusion, you will fill in the `agent_thought` field based on:
31
+
32
+ 2.1. If some of the subgoals must be **completed** based on your observations, add them to `complete_subgoals_by_ids`. To justify your conclusion, you will fill in the `agent_thought` field based on:
32
33
 
33
34
  - The current UI state
34
35
  - Past agent thoughts
35
36
  - Recent tool effects
37
+
36
38
  2.2. Otherwise, output a **stringified structured set of instructions** that an **Executor agent** can perform on a real mobile device:
37
39
 
38
- - These must be **concrete low-level actions**: back,tap, swipe, launch app, list packages, close app, input text, paste, erase, text, copy, etc.
40
+ - These must be **concrete low-level actions**.
41
+ - The executor has the following available tools: **{{ executor_tools_list }}**.
42
+ - Your goal is to achieve subgoals **fast** - so you must put as many actions as possible in your instructions to complete all achievable subgoals (based on your observations) in one go.
43
+ - To open URLs/links directly, use the `open_link` tool - it will automatically handle opening in the appropriate browser. It also handles deep links.
44
+ - When you need to open an app, use the `find_packages` low-level action to try and get its name. Then, simply use the `launch_app` low-level action to launch it.
39
45
  - If you refer to a UI element or coordinates, specify it clearly (e.g., `resource-id: com.whatsapp:id/search`, `text: "Alice"`, `x: 100, y: 200`).
40
46
  - **The structure is up to you**, but it must be valid **JSON stringified output**. You will accompany this output with a **natural-language summary** of your reasoning and approach in your agent thought.
47
+ - **Never use a sequence of `tap` + `input_text` to type into a field. Always use a single `input_text` action** with the correct `resource_id` (this already ensures the element is focused and the cursor is moved to the end).
41
48
  - When you want to launch/stop an app, prefer using its package name.
42
49
  - **Only reference UI element IDs or visible texts that are explicitly present in the provided UI hierarchy or screenshot. Do not invent, infer, or guess any IDs or texts that are not directly observed**.
43
- - **For text clearing**: When you need to completely clear text from an input field, always use **LONG PRESS** first to select the text field, then erase. Do NOT use tap + erase as this only clears from cursor position.
50
+ - **For text clearing**: When you need to completely clear text from an input field, always call the `clear_text` tool with the correct resource_id. This tool automatically focuses the element, and ensures the field is emptied. If you notice this tool fails to clear the text, try to long press the input, select all, and call `erase_one_char`.
44
51
 
45
52
  ### Output
46
53
 
47
- - **Structured Decisions**:
48
- A **valid stringified JSON** describing what should be executed **right now** to advance the current subgoal **IF THE SUBGOAL IS NOT COMPLETED**.
54
+ - **complete_subgoals_by_ids** _(optional)_:
55
+ A list of subgoal IDs that should be marked as completed.
56
+
57
+ - **Structured Decisions** _(optional)_:
58
+ A **valid stringified JSON** describing what should be executed **right now** to advance through the subgoals as much as possible.
49
59
 
50
60
  - **Agent Thought** _(1-2 sentences)_:
51
61
  If there is any information you need to remember for later steps, you must include it here, because only the agent thoughts will be used to produce the final structured output.
@@ -53,8 +63,7 @@ Focus on the **current subgoal**.
53
63
  This also helps other agents understand your decision and learn from future failures.
54
64
  You must also use this field to mention checkpoints when you perform actions without definite ending: for instance "Swiping up to reveal more recipes - last seen recipe was <ID or NAME>, stop when no more".
55
65
 
56
- - **Subgoal Completion** _(boolean)_:
57
- Set to true if the current subgoal has been successfully completed - you **cannot set it to true and provide structured decisions at the same time**. You must base your decision ONLY on what you have as input (device state, agent thoughts, executor feedback, etc) - NEVER based on the decisions you have produced.
66
+ **Important:** `complete_subgoals_by_ids` and the structured decisions are mutually exclusive: if you provide both, the structured decisions will be ignored. Therefore, you must always prioritize completing subgoals over providing structured decisions.
58
67
 
59
68
  ---
60
69
 
@@ -10,11 +10,14 @@ from langchain_core.messages import (
10
10
  ToolMessage,
11
11
  )
12
12
  from langgraph.graph.message import REMOVE_ALL_MESSAGES
13
+
13
14
  from minitap.mobile_use.agents.cortex.types import CortexOutput
14
15
  from minitap.mobile_use.agents.planner.utils import get_current_subgoal
16
+ from minitap.mobile_use.constants import EXECUTOR_MESSAGES_KEY
15
17
  from minitap.mobile_use.context import MobileUseContext
16
18
  from minitap.mobile_use.graph.state import State
17
19
  from minitap.mobile_use.services.llm import get_llm, with_fallback
20
+ from minitap.mobile_use.tools.index import EXECUTOR_WRAPPERS_TOOLS, format_tools_list
18
21
  from minitap.mobile_use.utils.conversations import get_screenshot_message_for_llm
19
22
  from minitap.mobile_use.utils.decorators import wrap_with_callbacks
20
23
  from minitap.mobile_use.utils.logger import get_logger
@@ -43,6 +46,7 @@ class CortexNode:
43
46
  current_subgoal=get_current_subgoal(state.subgoal_plan),
44
47
  agents_thoughts=state.agents_thoughts,
45
48
  executor_feedback=executor_feedback,
49
+ executor_tools_list=format_tools_list(self.ctx, EXECUTOR_WRAPPERS_TOOLS),
46
50
  )
47
51
  messages = [
48
52
  SystemMessage(content=system_message),
@@ -79,20 +83,29 @@ class CortexNode:
79
83
  fallback_call=lambda: llm_fallback.ainvoke(messages),
80
84
  ) # type: ignore
81
85
 
82
- is_subgoal_completed = response.complete_current_subgoal
86
+ is_subgoal_completed = (
87
+ response.complete_subgoals_by_ids is not None
88
+ and len(response.complete_subgoals_by_ids) > 0
89
+ and (len(response.decisions) == 0 or response.decisions in ["{}", "[]", "null", ""])
90
+ )
91
+ if not is_subgoal_completed:
92
+ response.complete_subgoals_by_ids = []
93
+
83
94
  return state.sanitize_update(
84
95
  ctx=self.ctx,
85
96
  update={
86
97
  "agents_thoughts": [response.agent_thought],
87
98
  "structured_decisions": response.decisions if not is_subgoal_completed else None,
99
+ "complete_subgoals_by_ids": response.complete_subgoals_by_ids or [],
88
100
  "latest_screenshot_base64": None,
89
101
  "latest_ui_hierarchy": None,
90
102
  "focused_app_info": None,
91
103
  "device_date": None,
92
104
  # Executor related fields
93
- "executor_messages": [RemoveMessage(id=REMOVE_ALL_MESSAGES)],
105
+ EXECUTOR_MESSAGES_KEY: [RemoveMessage(id=REMOVE_ALL_MESSAGES)],
94
106
  "cortex_last_thought": response.agent_thought,
95
107
  },
108
+ agent="cortex",
96
109
  )
97
110
 
98
111
 
@@ -1,11 +1,9 @@
1
- from typing import Optional
2
-
3
1
  from pydantic import BaseModel, Field
4
2
 
5
3
 
6
4
  class CortexOutput(BaseModel):
7
5
  decisions: str = Field(..., description="The decisions to be made. A stringified JSON object")
8
6
  agent_thought: str = Field(..., description="The agent's thought")
9
- complete_current_subgoal: Optional[bool] = Field(
10
- False, description="Whether the current subgoal is complete"
7
+ complete_subgoals_by_ids: list[str] | None = Field(
8
+ [], description="List of subgoal IDs to complete"
11
9
  )
@@ -5,21 +5,17 @@ Your job is to **interpret the structured decisions** provided by the **Cortex**
5
5
  ### 🎯 Your Objective:
6
6
 
7
7
  Given the `structured_decisions` (a stringified object) from the **Cortex** agent
8
- and the previous tool calls, you must:
8
+ and your previous actions, you must:
9
9
 
10
10
  1. **Parse the structured decisions** into usable Python objects.
11
- 2. **Determine the most appropriate tool** to execute the intended action - **you can ONLY USE ONE**
12
- 3. **Invoke tool accurately**, passing the required parameters.
13
- 4. For **the tool you invoke**, always provide a clear `agent_thought` argument:
11
+ 2. **Determine the appropriate tools** to execute the intended action - **the order of the tools you return is the order in which they will be executed**
12
+ 3. **Invoke tools accurately**, passing the required parameters.
13
+ 4. For **each tool you invoke**, always provide a clear `agent_thought` argument:
14
14
 
15
15
  - This is a natural-language sentence (or two) **explaining why** this tool is being invoked.
16
16
  - Keep it short but informative.
17
17
  - This is essential for debugging, traceability, and adaptation by other agents.
18
18
 
19
- 5. For **the tool you invoke**, always provide the `executor_metadata` argument:
20
-
21
- - If you know you won't be able to achieve all Cortex decisions using the tool call you've chosen, set `retrigger` to `true` - otherwise set it to `false`
22
-
23
19
  ---
24
20
 
25
21
  ### 🧠 Example
@@ -45,27 +41,36 @@ Call the `tap_on_element` tool with:
45
41
  - `resource_id = "com.whatsapp:id/conversation_item"`
46
42
  - `text = "Alice"`
47
43
  - `agent_thought = "I'm tapping on the chat item labeled 'Alice' to open the conversation."`
48
- - `executor_metadata = {"retrigger": false}`
49
44
 
50
45
  ---
51
46
 
52
47
  ### ⚙️ Tools
53
48
 
54
- - Tools may include actions like: `tap`, `swipe`, `start_app`, `stop_app`, `list_packages`, `get_current_focus`, etc.
49
+ - Tools may include actions like: `tap`, `swipe`, `start_app`, `stop_app`, `find_packages`, `get_current_focus`, etc.
55
50
  - You **must not hardcode tool definitions** here.
56
51
  - Just use the right tool based on what the `structured_decisions` requires.
57
52
  - The tools are provided dynamically via LangGraph's tool binding mechanism.
58
53
 
54
+ #### 📝 Text Input Best Practice
55
+
56
+ When using the `input_text` tool:
57
+
58
+ - **Always provide the `resource_id` of the element** you want to type into.
59
+ - The tool will automatically:
60
+
61
+ 1. **Focus the element first**
62
+ 2. **Move the cursor to the end** of the existing text
63
+ 3. **Then type the new text**
64
+
59
65
  #### 🔄 Text Clearing Best Practice
60
66
 
61
- When you need to completely clear text from an input field, **DO NOT** simply use `erase_text` alone, as it only erases from the cursor position, backward. Instead:
67
+ When you need to completely clear text from an input field, always use the clear_text tool with the correct resource_id.
62
68
 
63
- 1. **Use `long_press_on` first** to select the text field and bring up selection options
64
- 2. **Then use `erase_text`** to clear the selected content
69
+ This tool automatically takes care of focusing the element (if needed), and ensuring the field is fully emptied.
65
70
 
66
- This approach ensures the **entire text content** is removed, not just the portion before the cursor position. The long press will typically select all text in the field, making the subsequent erase operation more effective.
71
+ If and only if the clear_text tool fails to clear the text, try to long press the input, select all, and call erase_one_char.
67
72
 
68
- ### 🔁 Final Notes
73
+ #### 🔁 Final Notes
69
74
 
70
75
  - **You do not need to reason or decide strategy** — that's the Cortex's job.
71
76
  - You simply interpret and execute — like hands following the brain.
@@ -2,8 +2,8 @@ from pathlib import Path
2
2
 
3
3
  from jinja2 import Template
4
4
  from langchain_core.messages import HumanMessage, SystemMessage
5
- from langchain_core.messages.ai import AIMessage
6
5
  from langchain_google_genai import ChatGoogleGenerativeAI
6
+ from minitap.mobile_use.constants import EXECUTOR_MESSAGES_KEY
7
7
  from minitap.mobile_use.context import MobileUseContext
8
8
  from minitap.mobile_use.graph.state import State
9
9
  from minitap.mobile_use.services.llm import get_llm
@@ -34,20 +34,9 @@ class ExecutorNode:
34
34
  "No structured decisions found, I cannot execute anything."
35
35
  ],
36
36
  },
37
+ agent="executor",
37
38
  )
38
39
 
39
- if len(state.executor_messages) > 0 and isinstance(state.executor_messages[-1], AIMessage):
40
- if len(state.executor_messages[-1].tool_calls) > 0: # type: ignore
41
- # A previous tool call raised an uncaught exception while retrigerring the executor
42
- return state.sanitize_update(
43
- ctx=self.ctx,
44
- update={
45
- "executor_retrigger": False,
46
- "executor_failed": True,
47
- "executor_messages": [state.messages[-1]],
48
- },
49
- )
50
-
51
40
  system_message = Template(
52
41
  Path(__file__).parent.joinpath("executor.md").read_text(encoding="utf-8")
53
42
  ).render(platform=self.ctx.device.mobile_platform.value)
@@ -62,14 +51,13 @@ class ExecutorNode:
62
51
  ]
63
52
 
64
53
  llm = get_llm(ctx=self.ctx, name="executor")
65
- llm_bind_tools_kwargs = {
54
+ llm_bind_tools_kwargs: dict = {
66
55
  "tools": get_tools_from_wrappers(self.ctx, EXECUTOR_WRAPPERS_TOOLS),
67
- "tool_choice": "auto", # automatically select a tool call or none
68
56
  }
69
57
 
70
58
  # ChatGoogleGenerativeAI does not support the "parallel_tool_calls" keyword
71
59
  if not isinstance(llm, ChatGoogleGenerativeAI):
72
- llm_bind_tools_kwargs["parallel_tool_calls"] = False
60
+ llm_bind_tools_kwargs["parallel_tool_calls"] = True
73
61
 
74
62
  llm = llm.bind_tools(**llm_bind_tools_kwargs)
75
63
  response = await llm.ainvoke(messages)
@@ -78,7 +66,7 @@ class ExecutorNode:
78
66
  ctx=self.ctx,
79
67
  update={
80
68
  "cortex_last_thought": cortex_last_thought,
81
- "executor_messages": [response],
82
- "messages": [response],
69
+ EXECUTOR_MESSAGES_KEY: [response],
83
70
  },
71
+ agent="executor",
84
72
  )
@@ -0,0 +1,105 @@
1
+ import asyncio
2
+ from typing import Any
3
+ from langgraph.types import Command
4
+ from pydantic import BaseModel
5
+ from typing import override
6
+ from langchain_core.runnables import RunnableConfig
7
+ from langgraph.store.base import BaseStore
8
+ from langchain_core.messages import AnyMessage, ToolCall, ToolMessage
9
+ from langgraph.prebuilt import ToolNode
10
+
11
+
12
+ class ExecutorToolNode(ToolNode):
13
+ """
14
+ ToolNode that runs tool calls one after the other - not simultaneously.
15
+ If one error occurs, the remaining tool calls are aborted!
16
+ """
17
+
18
+ @override
19
+ async def _afunc(
20
+ self,
21
+ input: list[AnyMessage] | dict[str, Any] | BaseModel,
22
+ config: RunnableConfig,
23
+ *,
24
+ store: BaseStore | None,
25
+ ):
26
+ return await self.__func(is_async=True, input=input, config=config, store=store)
27
+
28
+ @override
29
+ def _func(
30
+ self,
31
+ input: list[AnyMessage] | dict[str, Any] | BaseModel,
32
+ config: RunnableConfig,
33
+ *,
34
+ store: BaseStore | None,
35
+ ) -> Any:
36
+ loop = asyncio.get_event_loop()
37
+ return loop.run_until_complete(
38
+ self.__func(is_async=False, input=input, config=config, store=store)
39
+ )
40
+
41
+ async def __func(
42
+ self,
43
+ is_async: bool,
44
+ input: list[AnyMessage] | dict[str, Any] | BaseModel,
45
+ config: RunnableConfig,
46
+ *,
47
+ store: BaseStore | None,
48
+ ) -> Any:
49
+ tool_calls, input_type = self._parse_input(input, store)
50
+ outputs: list[Command | ToolMessage] = []
51
+ failed = False
52
+ for call in tool_calls:
53
+ if failed:
54
+ output = self._get_erroneous_command(
55
+ call=call,
56
+ message="Aborted: a previous tool call failed!",
57
+ )
58
+ else:
59
+ if is_async:
60
+ output = await self._arun_one(call, input_type, config)
61
+ else:
62
+ output = self._run_one(call, input_type, config)
63
+ failed = self._has_tool_call_failed(call, output)
64
+ if failed is None:
65
+ output = self._get_erroneous_command(
66
+ call=call,
67
+ message=f"Unexpected tool output type: {type(output)}",
68
+ )
69
+ failed = True
70
+ outputs.append(output)
71
+ return self._combine_tool_outputs(outputs, input_type) # type: ignore
72
+
73
+ def _has_tool_call_failed(
74
+ self,
75
+ call: ToolCall,
76
+ output: ToolMessage | Command,
77
+ ) -> bool | None:
78
+ if isinstance(output, ToolMessage):
79
+ return output.status == "error"
80
+ if isinstance(output, Command):
81
+ output_msg = self._get_tool_message(output)
82
+ return output_msg.status == "error"
83
+ return None
84
+
85
+ def _get_erroneous_command(self, call: ToolCall, message: str) -> Command:
86
+ tool_message = ToolMessage(
87
+ name=call["name"], tool_call_id=call["id"], content=message, status="error"
88
+ )
89
+ return Command(update={self.messages_key: [tool_message]})
90
+
91
+ def _get_tool_message(self, cmd: Command) -> ToolMessage:
92
+ if isinstance(cmd.update, dict):
93
+ msg = cmd.update.get(self.messages_key)
94
+ if isinstance(msg, list):
95
+ if len(msg) == 0:
96
+ raise ValueError("No messages found in command update")
97
+ if not isinstance(msg[-1], ToolMessage):
98
+ raise ValueError("Last message in command update is not a tool message")
99
+ return msg[-1]
100
+ elif isinstance(msg, ToolMessage):
101
+ return msg
102
+ elif msg is None:
103
+ raise ValueError(f"Missing '{self.messages_key}' in command update")
104
+ raise ValueError(f"Unexpected message type in command update: {type(msg)}")
105
+ raise ValueError("Command update is not a dict")
@@ -1,13 +1,5 @@
1
1
  ## Hopper
2
2
 
3
- Your goal is to analyze the input data and to pick only the most relevant information based on the current steps. We aim to reach the goal defined by the user as : {{ initial_goal }}
3
+ The user will send you a batch of data you must dig in order to extract the most relevant information to reach the user's goal. Keep the information as is, do not modify it since the user will trigger actions based on it.
4
4
 
5
- ### Input
6
-
7
- You have the list of steps we've done so far. We use those steps to track our progress to reach our goal. Here they are : {{ messages }}
8
-
9
- Finally, here is the data that we receive form executing the last task. We will dig this data to pick only the most relevant information to reach our goal. Keep the information as is, do not modify it since we will trigger actions based on it. Output this information in the output field, and you will describe what you did in the step field.
10
-
11
- Here is the data you must dig :
12
-
13
- {{ data }}
5
+ You'll need to output the extracted information in the `output` field, and you will describe what you did in the `step` field.
@@ -1,8 +1,7 @@
1
1
  from pathlib import Path
2
- from typing import Sequence
3
2
 
4
3
  from jinja2 import Template
5
- from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage
4
+ from langchain_core.messages import HumanMessage, SystemMessage
6
5
  from minitap.mobile_use.context import MobileUseContext
7
6
  from minitap.mobile_use.services.llm import get_llm
8
7
  from pydantic import BaseModel, Field
@@ -20,20 +19,16 @@ class HopperOutput(BaseModel):
20
19
 
21
20
  async def hopper(
22
21
  ctx: MobileUseContext,
23
- initial_goal: str,
24
- messages: Sequence[BaseMessage],
22
+ request: str,
25
23
  data: str,
26
24
  ) -> HopperOutput:
27
25
  print("Starting Hopper Agent", flush=True)
28
26
  system_message = Template(
29
27
  Path(__file__).parent.joinpath("hopper.md").read_text(encoding="utf-8")
30
- ).render(
31
- initial_goal=initial_goal,
32
- messages=messages,
33
- )
28
+ ).render()
34
29
  messages = [
35
30
  SystemMessage(content=system_message),
36
- HumanMessage(content=data),
31
+ HumanMessage(content=f"{request}\nHere is the data you must dig:\n{data}"),
37
32
  ]
38
33
 
39
34
  llm = get_llm(ctx=ctx, name="hopper", is_utils=True, temperature=0)
@@ -1,13 +1,12 @@
1
- Here is your input.
2
-
3
- ---
1
+ Here is the input for your analysis:
4
2
 
5
3
  **Initial goal** : {{ initial_goal }}
6
4
 
7
5
  **Subgoal plan**
8
6
  {{ subgoal_plan }}
9
7
 
10
- **Current subgoal** : {{ current_subgoal }}
8
+ **Subgoals to examine (provided by the Cortex)**
9
+ {{ subgoals_to_examine }}
11
10
 
12
11
  **Agent thoughts**
13
12
  {{ agent_thoughts }}
@@ -1,18 +1,36 @@
1
1
  You are the **Orchestrator**.
2
2
 
3
- Your role is to **decide what to do next**, based on the current execution state of a plan running on an **{{ platform }} mobile device**. You must assess the situation and choose between resuming, continuing, or replanning.
3
+ Your role is to **decide what to do next**, based on the current execution state of a plan running on an **{{ platform }} mobile device**. You must assess the situation and determine whether the provided subgoals have been completed, or if they need to remain pending.
4
+ Based on the input data, you must also determine if the subgoal plan must be replanned.
4
5
 
5
6
  ### Responsibilities
6
7
 
7
- You're given:
8
+ You will be given:
8
9
 
9
10
  - The current **subgoal plan**
10
- - The current **subgoal** (which is marked as **PENDING** in the plan, but repeated here for your convenience)
11
+ - The **subgoals to examine** (which are marked as **PENDING** and **NOT STARTED** in the plan)
11
12
  - A list of **agent thoughts** (insights, obstacles, or reasoning gathered during execution)
12
13
  - The original **initial goal**
13
14
 
14
- You must then **choose what to do next**:
15
+ You must then:
15
16
 
16
- - `"resume"`: The current subgoal is clearly not finished, let's resume it. The status of the current subgoal will stay as `PENDING`.
17
- - `"continue"`: Move to the next subgoal in the list. The current subgoal will be marked as `SUCCESS`. If the current subgoal is the final step of the plan: The "reason" field must contain the final answer to the user’s initial goal. If the current subgoal is not the final step: The "reason" field must explain why this subgoal is now considered complete before moving on.
18
- - `"replan"`: The current plan no longer fits : the current subgoal will be marked as `FAILURE`. we need to define a new plan.
17
+ 1. For **each subgoal to examine provided by the user** (not all subgoals):
18
+ - if it's clearly finished and can be marked as complete, regardless of whether it was started or not -> add its ID to `completed_subgoal_ids`
19
+ Then fill the `reason` field with:
20
+ - the final answer to the initial goal if all subgoals are expected to be completed, OR
21
+ - an explanation of your decisions for the report.
22
+
23
+ 2. Set `needs_replaning` to `TRUE` if the current plan no longer fits because of repeated failed attempts. In that case, the current subgoal will be marked as `FAILURE`, and a new plan will be defined. Explain in the `reason` field why the plan no longer fits.
24
+
25
+ ### Agent Roles & Thought Ownership
26
+
27
+ All thoughts belong to the specific agent that generated them. There are four collaborating agents:
28
+
29
+ - **Orchestrator (You):** Coordinates the entire process. Decides what to do next based on the execution state and whether the plan needs replanning.
30
+ - **Planner:** Designs the subgoal plan and updates it when necessary (replanning). Does not execute actions.
31
+ - **Cortex (Brain & Eyes):** It does not directly interact with the device, but it has full awareness of the screen state. Its role is to reason about this state and determine the next actions (e.g., tap, swipe, scroll) required to advance through the plan.
32
+ - **Executor (Hands):** it executes the Cortex’s chosen actions on the device.
33
+
34
+ The cortex has the ability to complete multiple subgoals (the PENDING one and NOT STARTED ones), which are the ones you'll need to examine. Although the plan should normally be completed in order - this is not a strict requirement based on the context.
35
+
36
+ In its agent thoughts, the cortex may talk as if it were the one taking the action (e.g. "Tapping the button", ...) - but remember that only the executor can interact with the device.
@@ -3,12 +3,13 @@ from pathlib import Path
3
3
  from jinja2 import Template
4
4
  from langchain_core.messages import HumanMessage, SystemMessage
5
5
 
6
- from minitap.mobile_use.agents.orchestrator.types import OrchestratorOutput, OrchestratorStatus
6
+ from minitap.mobile_use.agents.orchestrator.types import OrchestratorOutput
7
7
  from minitap.mobile_use.agents.planner.utils import (
8
8
  all_completed,
9
- complete_current_subgoal,
9
+ complete_subgoals_by_ids,
10
10
  fail_current_subgoal,
11
11
  get_current_subgoal,
12
+ get_subgoals_by_ids,
12
13
  nothing_started,
13
14
  start_next_subgoal,
14
15
  )
@@ -31,24 +32,27 @@ class OrchestratorNode:
31
32
  on_failure=lambda _: logger.error("Orchestrator Agent"),
32
33
  )
33
34
  async def __call__(self, state: State):
34
- if nothing_started(state.subgoal_plan):
35
+ no_subgoal_started = nothing_started(state.subgoal_plan)
36
+ current_subgoal = get_current_subgoal(state.subgoal_plan)
37
+
38
+ if no_subgoal_started or not current_subgoal:
35
39
  state.subgoal_plan = start_next_subgoal(state.subgoal_plan)
36
40
  new_subgoal = get_current_subgoal(state.subgoal_plan)
37
- return state.sanitize_update(
38
- ctx=self.ctx,
39
- update={
40
- "agents_thoughts": [f"Starting the first subgoal: {new_subgoal}"],
41
- "subgoal_plan": state.subgoal_plan,
42
- },
43
- )
44
-
45
- current_subgoal = get_current_subgoal(state.subgoal_plan)
41
+ thoughts = [
42
+ (
43
+ f"Starting the first subgoal: {new_subgoal}"
44
+ if no_subgoal_started
45
+ else f"Starting the next subgoal: {new_subgoal}"
46
+ )
47
+ ]
48
+ return _get_state_update(ctx=self.ctx, state=state, thoughts=thoughts, update_plan=True)
46
49
 
47
- if not current_subgoal:
48
- return state.sanitize_update(
49
- ctx=self.ctx,
50
- update={"agents_thoughts": ["No subgoal to go for."]},
51
- )
50
+ subgoals_to_examine = get_subgoals_by_ids(
51
+ subgoals=state.subgoal_plan,
52
+ ids=state.complete_subgoals_by_ids,
53
+ )
54
+ if len(subgoals_to_examine) <= 0:
55
+ return _get_state_update(ctx=self.ctx, state=state, thoughts=["No subgoal to examine."])
52
56
 
53
57
  system_message = Template(
54
58
  Path(__file__).parent.joinpath("orchestrator.md").read_text(encoding="utf-8")
@@ -58,7 +62,7 @@ class OrchestratorNode:
58
62
  ).render(
59
63
  initial_goal=state.initial_goal,
60
64
  subgoal_plan="\n".join(str(s) for s in state.subgoal_plan),
61
- current_subgoal=str(current_subgoal),
65
+ subgoals_to_examine="\n".join(str(s) for s in subgoals_to_examine),
62
66
  agent_thoughts="\n".join(state.agents_thoughts),
63
67
  )
64
68
  messages = [
@@ -70,45 +74,41 @@ class OrchestratorNode:
70
74
  llm = llm.with_structured_output(OrchestratorOutput)
71
75
  response: OrchestratorOutput = await llm.ainvoke(messages) # type: ignore
72
76
 
73
- if response.status == OrchestratorStatus.CONTINUE:
74
- state.subgoal_plan = complete_current_subgoal(state.subgoal_plan)
75
- thoughts = [response.reason]
76
-
77
- if all_completed(state.subgoal_plan):
78
- logger.success("All the subgoals have been completed successfully.")
79
- return state.sanitize_update(
80
- ctx=self.ctx,
81
- update={
82
- "subgoal_plan": state.subgoal_plan,
83
- "agents_thoughts": thoughts,
84
- },
85
- )
86
- state.subgoal_plan = start_next_subgoal(state.subgoal_plan)
87
- new_subgoal = get_current_subgoal(state.subgoal_plan)
88
- thoughts.append(f"==== NEXT SUBGOAL: {new_subgoal} ====")
89
- return state.sanitize_update(
90
- ctx=self.ctx,
91
- update={
92
- "agents_thoughts": thoughts,
93
- "subgoal_plan": state.subgoal_plan,
94
- },
95
- )
96
-
97
- elif response.status == OrchestratorStatus.REPLAN:
77
+ if response.needs_replaning:
98
78
  thoughts = [response.reason]
99
79
  state.subgoal_plan = fail_current_subgoal(state.subgoal_plan)
100
80
  thoughts.append("==== END OF PLAN, REPLANNING ====")
101
- return state.sanitize_update(
102
- ctx=self.ctx,
103
- update={
104
- "agents_thoughts": thoughts,
105
- "subgoal_plan": state.subgoal_plan,
106
- },
107
- )
108
-
109
- return state.sanitize_update(
110
- ctx=self.ctx,
111
- update={
112
- "agents_thoughts": [response.reason],
113
- },
81
+ return _get_state_update(ctx=self.ctx, state=state, thoughts=thoughts, update_plan=True)
82
+
83
+ state.subgoal_plan = complete_subgoals_by_ids(
84
+ subgoals=state.subgoal_plan,
85
+ ids=response.completed_subgoal_ids,
114
86
  )
87
+ thoughts = [response.reason]
88
+ if all_completed(state.subgoal_plan):
89
+ logger.success("All the subgoals have been completed successfully.")
90
+ return _get_state_update(ctx=self.ctx, state=state, thoughts=thoughts, update_plan=True)
91
+
92
+ if current_subgoal.id not in response.completed_subgoal_ids:
93
+ # The current subgoal is not yet complete.
94
+ return _get_state_update(ctx=self.ctx, state=state, thoughts=thoughts, update_plan=True)
95
+
96
+ state.subgoal_plan = start_next_subgoal(state.subgoal_plan)
97
+ new_subgoal = get_current_subgoal(state.subgoal_plan)
98
+ thoughts.append(f"==== NEXT SUBGOAL: {new_subgoal} ====")
99
+ return _get_state_update(ctx=self.ctx, state=state, thoughts=thoughts, update_plan=True)
100
+
101
+
102
+ def _get_state_update(
103
+ ctx: MobileUseContext,
104
+ state: State,
105
+ thoughts: list[str],
106
+ update_plan: bool = False,
107
+ ):
108
+ update = {
109
+ "agents_thoughts": thoughts,
110
+ "complete_subgoals_by_ids": [],
111
+ }
112
+ if update_plan:
113
+ update["subgoal_plan"] = state.subgoal_plan
114
+ return state.sanitize_update(ctx=ctx, update=update, agent="orchestrator")