minitap-mobile-use 0.0.1.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of minitap-mobile-use might be problematic. Click here for more details.
- minitap/mobile_use/__init__.py +0 -0
- minitap/mobile_use/agents/contextor/contextor.py +42 -0
- minitap/mobile_use/agents/cortex/cortex.md +93 -0
- minitap/mobile_use/agents/cortex/cortex.py +107 -0
- minitap/mobile_use/agents/cortex/types.py +11 -0
- minitap/mobile_use/agents/executor/executor.md +73 -0
- minitap/mobile_use/agents/executor/executor.py +84 -0
- minitap/mobile_use/agents/executor/executor_context_cleaner.py +27 -0
- minitap/mobile_use/agents/executor/utils.py +11 -0
- minitap/mobile_use/agents/hopper/hopper.md +13 -0
- minitap/mobile_use/agents/hopper/hopper.py +45 -0
- minitap/mobile_use/agents/orchestrator/human.md +13 -0
- minitap/mobile_use/agents/orchestrator/orchestrator.md +18 -0
- minitap/mobile_use/agents/orchestrator/orchestrator.py +114 -0
- minitap/mobile_use/agents/orchestrator/types.py +14 -0
- minitap/mobile_use/agents/outputter/human.md +25 -0
- minitap/mobile_use/agents/outputter/outputter.py +75 -0
- minitap/mobile_use/agents/outputter/test_outputter.py +107 -0
- minitap/mobile_use/agents/planner/human.md +12 -0
- minitap/mobile_use/agents/planner/planner.md +64 -0
- minitap/mobile_use/agents/planner/planner.py +64 -0
- minitap/mobile_use/agents/planner/types.py +44 -0
- minitap/mobile_use/agents/planner/utils.py +45 -0
- minitap/mobile_use/agents/summarizer/summarizer.py +34 -0
- minitap/mobile_use/clients/device_hardware_client.py +23 -0
- minitap/mobile_use/clients/ios_client.py +44 -0
- minitap/mobile_use/clients/screen_api_client.py +53 -0
- minitap/mobile_use/config.py +285 -0
- minitap/mobile_use/constants.py +2 -0
- minitap/mobile_use/context.py +65 -0
- minitap/mobile_use/controllers/__init__.py +0 -0
- minitap/mobile_use/controllers/mobile_command_controller.py +379 -0
- minitap/mobile_use/controllers/platform_specific_commands_controller.py +74 -0
- minitap/mobile_use/graph/graph.py +149 -0
- minitap/mobile_use/graph/state.py +73 -0
- minitap/mobile_use/main.py +122 -0
- minitap/mobile_use/sdk/__init__.py +12 -0
- minitap/mobile_use/sdk/agent.py +524 -0
- minitap/mobile_use/sdk/builders/__init__.py +10 -0
- minitap/mobile_use/sdk/builders/agent_config_builder.py +213 -0
- minitap/mobile_use/sdk/builders/index.py +15 -0
- minitap/mobile_use/sdk/builders/task_request_builder.py +218 -0
- minitap/mobile_use/sdk/constants.py +14 -0
- minitap/mobile_use/sdk/examples/README.md +45 -0
- minitap/mobile_use/sdk/examples/__init__.py +1 -0
- minitap/mobile_use/sdk/examples/simple_photo_organizer.py +76 -0
- minitap/mobile_use/sdk/examples/smart_notification_assistant.py +177 -0
- minitap/mobile_use/sdk/types/__init__.py +49 -0
- minitap/mobile_use/sdk/types/agent.py +73 -0
- minitap/mobile_use/sdk/types/exceptions.py +74 -0
- minitap/mobile_use/sdk/types/task.py +191 -0
- minitap/mobile_use/sdk/utils.py +28 -0
- minitap/mobile_use/servers/config.py +19 -0
- minitap/mobile_use/servers/device_hardware_bridge.py +212 -0
- minitap/mobile_use/servers/device_screen_api.py +143 -0
- minitap/mobile_use/servers/start_servers.py +151 -0
- minitap/mobile_use/servers/stop_servers.py +215 -0
- minitap/mobile_use/servers/utils.py +11 -0
- minitap/mobile_use/services/accessibility.py +100 -0
- minitap/mobile_use/services/llm.py +143 -0
- minitap/mobile_use/tools/index.py +54 -0
- minitap/mobile_use/tools/mobile/back.py +52 -0
- minitap/mobile_use/tools/mobile/copy_text_from.py +77 -0
- minitap/mobile_use/tools/mobile/erase_text.py +124 -0
- minitap/mobile_use/tools/mobile/input_text.py +74 -0
- minitap/mobile_use/tools/mobile/launch_app.py +59 -0
- minitap/mobile_use/tools/mobile/list_packages.py +78 -0
- minitap/mobile_use/tools/mobile/long_press_on.py +62 -0
- minitap/mobile_use/tools/mobile/open_link.py +59 -0
- minitap/mobile_use/tools/mobile/paste_text.py +66 -0
- minitap/mobile_use/tools/mobile/press_key.py +58 -0
- minitap/mobile_use/tools/mobile/run_flow.py +57 -0
- minitap/mobile_use/tools/mobile/stop_app.py +58 -0
- minitap/mobile_use/tools/mobile/swipe.py +56 -0
- minitap/mobile_use/tools/mobile/take_screenshot.py +70 -0
- minitap/mobile_use/tools/mobile/tap.py +66 -0
- minitap/mobile_use/tools/mobile/wait_for_animation_to_end.py +68 -0
- minitap/mobile_use/tools/tool_wrapper.py +33 -0
- minitap/mobile_use/utils/cli_helpers.py +40 -0
- minitap/mobile_use/utils/cli_selection.py +144 -0
- minitap/mobile_use/utils/conversations.py +31 -0
- minitap/mobile_use/utils/decorators.py +123 -0
- minitap/mobile_use/utils/errors.py +6 -0
- minitap/mobile_use/utils/file.py +13 -0
- minitap/mobile_use/utils/logger.py +184 -0
- minitap/mobile_use/utils/media.py +73 -0
- minitap/mobile_use/utils/recorder.py +55 -0
- minitap/mobile_use/utils/requests_utils.py +37 -0
- minitap/mobile_use/utils/shell_utils.py +20 -0
- minitap/mobile_use/utils/time.py +6 -0
- minitap/mobile_use/utils/ui_hierarchy.py +30 -0
- minitap_mobile_use-0.0.1.dev0.dist-info/METADATA +274 -0
- minitap_mobile_use-0.0.1.dev0.dist-info/RECORD +95 -0
- minitap_mobile_use-0.0.1.dev0.dist-info/WHEEL +4 -0
- minitap_mobile_use-0.0.1.dev0.dist-info/entry_points.txt +3 -0
|
File without changes
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
from minitap.mobile_use.agents.executor.utils import is_last_tool_message_take_screenshot
|
|
2
|
+
from minitap.mobile_use.controllers.mobile_command_controller import get_screen_data
|
|
3
|
+
from minitap.mobile_use.controllers.platform_specific_commands_controller import (
|
|
4
|
+
get_device_date,
|
|
5
|
+
get_focused_app_info,
|
|
6
|
+
)
|
|
7
|
+
from minitap.mobile_use.graph.state import State
|
|
8
|
+
from minitap.mobile_use.utils.decorators import wrap_with_callbacks
|
|
9
|
+
from minitap.mobile_use.utils.logger import get_logger
|
|
10
|
+
from minitap.mobile_use.context import MobileUseContext
|
|
11
|
+
|
|
12
|
+
logger = get_logger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class ContextorNode:
    """Refreshes the graph state with the current device context.

    Pulls the screen data, focused-app info, and device date from the
    device, then writes them into the state via `sanitize_update`.
    """

    def __init__(self, ctx: MobileUseContext):
        self.ctx = ctx

    @wrap_with_callbacks(
        before=lambda: logger.info("Starting Contextor Agent"),
        on_success=lambda _: logger.success("Contextor Agent"),
        on_failure=lambda _: logger.error("Contextor Agent"),
    )
    def __call__(self, state: State):
        screen = get_screen_data(self.ctx.screen_api_client)
        app_info = get_focused_app_info(self.ctx)
        device_now = get_device_date(self.ctx)

        # Only carry the screenshot forward when the last tool call
        # explicitly requested one; the UI hierarchy is usually enough.
        keep_screenshot = is_last_tool_message_take_screenshot(list(state.messages))
        screenshot_b64 = screen.base64 if keep_screenshot else None

        return state.sanitize_update(
            ctx=self.ctx,
            update={
                "latest_screenshot_base64": screenshot_b64,
                "latest_ui_hierarchy": screen.elements,
                "focused_app_info": app_info,
                "screen_size": (screen.width, screen.height),
                "device_date": device_now,
            },
        )
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
## You are the **Cortex**
|
|
2
|
+
|
|
3
|
+
Your job is to **analyze the current {{ platform }} mobile device state** and produce **structured decisions** to achieve the current subgoal.
|
|
4
|
+
|
|
5
|
+
You must act like a human brain, responsible for giving instructions to your hands (the **Executor** agent). Therefore, you must act with the same imprecision and uncertainty as a human when performing swipe actions: humans don't know where exactly they are swiping (always prefer percentages of width and height instead of absolute coordinates), they just know they are swiping up or down, left or right, and with how much force (usually amplified compared to what's truly needed - go overboard of sliders for instance).
|
|
6
|
+
|
|
7
|
+
### Context You Receive:
|
|
8
|
+
|
|
9
|
+
You are provided with:
|
|
10
|
+
|
|
11
|
+
- 📱 **Device state**:
|
|
12
|
+
|
|
13
|
+
- Latest **UI hierarchy**
|
|
14
|
+
- (Optional) Latest **screenshot (base64)**. You can query one if you need it by calling the take_screenshot tool. Often, the UI hierarchy is enough to understand what is happening on the screen.
|
|
15
|
+
- Current **focused app info**
|
|
16
|
+
- **Screen size** and **device date**
|
|
17
|
+
|
|
18
|
+
- 🧭 **Task context**:
|
|
19
|
+
|
|
20
|
+
- The user's **initial goal**
|
|
21
|
+
- The **subgoal plan** with their statuses
|
|
22
|
+
- The **current subgoal** to act on (the one in `PENDING` in the plan)
|
|
23
|
+
- A list of **agent thoughts** (previous reasoning, observations about the environment)
|
|
24
|
+
- **Executor agent feedback** on the latest UI decisions
|
|
25
|
+
|
|
26
|
+
### Your Mission:
|
|
27
|
+
|
|
28
|
+
Focus on the **current subgoal**.
|
|
29
|
+
|
|
30
|
+
1. **Analyze the UI** and environment to understand what action is required.
|
|
31
|
+
2.1. If the **subgoal is completed**, set the `complete_subgoal` field to `True`. To justify your conclusion, you will fill in the `agent_thought` field based on:
|
|
32
|
+
|
|
33
|
+
- The current UI state
|
|
34
|
+
- Past agent thoughts
|
|
35
|
+
- Recent tool effects
|
|
36
|
+
2.2. Otherwise, output a **stringified structured set of instructions** that an **Executor agent** can perform on a real mobile device:
|
|
37
|
+
|
|
38
|
+
- These must be **concrete low-level actions**: back, tap, swipe, launch app, list packages, close app, input text, paste, erase text, copy, etc.
|
|
39
|
+
- If you refer to a UI element or coordinates, specify it clearly (e.g., `resource-id: com.whatsapp:id/search`, `text: "Alice"`, `x: 100, y: 200`).
|
|
40
|
+
- **The structure is up to you**, but it must be valid **JSON stringified output**. You will accompany this output with a **natural-language summary** of your reasoning and approach in your agent thought.
|
|
41
|
+
- When you want to launch/stop an app, prefer using its package name.
|
|
42
|
+
- **Only reference UI element IDs or visible texts that are explicitly present in the provided UI hierarchy or screenshot. Do not invent, infer, or guess any IDs or texts that are not directly observed**.
|
|
43
|
+
- **For text clearing**: When you need to completely clear text from an input field, always use **LONG PRESS** first to select the text field, then erase. Do NOT use tap + erase as this only clears from cursor position.
|
|
44
|
+
|
|
45
|
+
### Output
|
|
46
|
+
|
|
47
|
+
- **Structured Decisions**:
|
|
48
|
+
A **valid stringified JSON** describing what should be executed **right now** to advance the current subgoal **IF THE SUBGOAL IS NOT COMPLETED**.
|
|
49
|
+
|
|
50
|
+
- **Agent Thought** _(1-2 sentences)_:
|
|
51
|
+
If there is any information you need to remember for later steps, you must include it here, because only the agent thoughts will be used to produce the final structured output.
|
|
52
|
+
|
|
53
|
+
This also helps other agents understand your decision and learn from future failures.
|
|
54
|
+
You must also use this field to mention checkpoints when you perform actions without definite ending: for instance "Swiping up to reveal more recipes - last seen recipe was <ID or NAME>, stop when no more".
|
|
55
|
+
|
|
56
|
+
- **Subgoal Completion** _(boolean)_:
|
|
57
|
+
Set to true if the current subgoal has been successfully completed - you **cannot set it to true and provide structured decisions at the same time**. You must base your decision ONLY on what you have as input (device state, agent thoughts, executor feedback, etc) - NEVER based on the decisions you have produced.
|
|
58
|
+
|
|
59
|
+
---
|
|
60
|
+
|
|
61
|
+
### Example
|
|
62
|
+
|
|
63
|
+
#### Current Subgoal:
|
|
64
|
+
|
|
65
|
+
> "Search for Alice in WhatsApp"
|
|
66
|
+
|
|
67
|
+
#### Structured Decisions:
|
|
68
|
+
|
|
69
|
+
```text
|
|
70
|
+
"{\"action\": \"tap\", \"target\": {\"resource_id\": \"com.whatsapp:id/menuitem_search\", \"text\": \"Search\"}}"
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
#### Agent Thought:
|
|
74
|
+
|
|
75
|
+
> I will tap the search icon at the top of the WhatsApp interface to begin searching for Alice.
|
|
76
|
+
|
|
77
|
+
### Input
|
|
78
|
+
|
|
79
|
+
**Initial Goal:**
|
|
80
|
+
{{ initial_goal }}
|
|
81
|
+
|
|
82
|
+
**Subgoal Plan:**
|
|
83
|
+
{{ subgoal_plan }}
|
|
84
|
+
|
|
85
|
+
**Current Subgoal (what needs to be done right now):**
|
|
86
|
+
{{ current_subgoal }}
|
|
87
|
+
|
|
88
|
+
**Agent thoughts (previous reasoning, observations about the environment):**
|
|
89
|
+
{{ agents_thoughts }}
|
|
90
|
+
|
|
91
|
+
**Executor agent feedback on latest UI decisions:**
|
|
92
|
+
|
|
93
|
+
{{ executor_feedback }}
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
from jinja2 import Template
|
|
5
|
+
from langchain_core.messages import (
|
|
6
|
+
AIMessage,
|
|
7
|
+
HumanMessage,
|
|
8
|
+
RemoveMessage,
|
|
9
|
+
SystemMessage,
|
|
10
|
+
ToolMessage,
|
|
11
|
+
)
|
|
12
|
+
from langgraph.graph.message import REMOVE_ALL_MESSAGES
|
|
13
|
+
from minitap.mobile_use.agents.cortex.types import CortexOutput
|
|
14
|
+
from minitap.mobile_use.agents.planner.utils import get_current_subgoal
|
|
15
|
+
from minitap.mobile_use.context import MobileUseContext
|
|
16
|
+
from minitap.mobile_use.graph.state import State
|
|
17
|
+
from minitap.mobile_use.services.llm import get_llm, with_fallback
|
|
18
|
+
from minitap.mobile_use.utils.conversations import get_screenshot_message_for_llm
|
|
19
|
+
from minitap.mobile_use.utils.decorators import wrap_with_callbacks
|
|
20
|
+
from minitap.mobile_use.utils.logger import get_logger
|
|
21
|
+
|
|
22
|
+
logger = get_logger(__name__)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class CortexNode:
    """Analyzes the current device state and produces structured decisions.

    Renders the cortex prompt with the task context, feeds the LLM the
    device info, agent thoughts, optional screenshot, and UI hierarchy,
    then writes the structured decisions (or subgoal completion) back
    into the state.
    """

    def __init__(self, ctx: MobileUseContext):
        self.ctx = ctx

    @wrap_with_callbacks(
        before=lambda: logger.info("Starting Cortex Agent..."),
        on_success=lambda _: logger.success("Cortex Agent"),
        on_failure=lambda _: logger.error("Cortex Agent"),
    )
    async def __call__(self, state: State):
        executor_feedback = get_executor_agent_feedback(state)

        system_message = Template(
            Path(__file__).parent.joinpath("cortex.md").read_text(encoding="utf-8")
        ).render(
            platform=self.ctx.device.mobile_platform.value,
            initial_goal=state.initial_goal,
            subgoal_plan=state.subgoal_plan,
            current_subgoal=get_current_subgoal(state.subgoal_plan),
            agents_thoughts=state.agents_thoughts,
            executor_feedback=executor_feedback,
        )

        # Build the device-info message step by step. The previous chained
        # "a + b if cond else c + d if cond2 else e" expression was a bug:
        # Python's conditional expression binds looser than "+", so the whole
        # device-info prefix was dropped whenever `device_date` was unset,
        # and the focused-app line could never be appended to the date line.
        device_info = "Here are my device info:\n" + self.ctx.device.to_str()
        if state.device_date:
            device_info += f"Device date: {state.device_date}\n"
        if state.focused_app_info:
            device_info += f"Focused app info: {state.focused_app_info}\n"

        messages = [
            SystemMessage(content=system_message),
            HumanMessage(content=device_info),
        ]
        for thought in state.agents_thoughts:
            messages.append(AIMessage(content=thought))

        if state.latest_screenshot_base64:
            messages.append(get_screenshot_message_for_llm(state.latest_screenshot_base64))
            logger.info("Added screenshot to context")

        if state.latest_ui_hierarchy:
            ui_hierarchy_dict: list[dict] = state.latest_ui_hierarchy
            ui_hierarchy_str = json.dumps(ui_hierarchy_dict, indent=2, ensure_ascii=False)
            messages.append(HumanMessage(content="Here is the UI hierarchy:\n" + ui_hierarchy_str))

        llm = get_llm(ctx=self.ctx, name="cortex", temperature=1).with_structured_output(
            CortexOutput
        )
        llm_fallback = get_llm(
            ctx=self.ctx, name="cortex", use_fallback=True, temperature=1
        ).with_structured_output(CortexOutput)
        response: CortexOutput = await with_fallback(
            main_call=lambda: llm.ainvoke(messages),
            fallback_call=lambda: llm_fallback.ainvoke(messages),
        )  # type: ignore

        is_subgoal_completed = response.complete_current_subgoal
        return state.sanitize_update(
            ctx=self.ctx,
            update={
                "agents_thoughts": [response.agent_thought],
                # Decisions are only meaningful while the subgoal is in progress.
                "structured_decisions": response.decisions if not is_subgoal_completed else None,
                # Device context is single-use: clear it so the Contextor refreshes it.
                "latest_screenshot_base64": None,
                "latest_ui_hierarchy": None,
                "focused_app_info": None,
                "device_date": None,
                # Executor related fields
                "executor_messages": [RemoveMessage(id=REMOVE_ALL_MESSAGES)],
                "cortex_last_thought": response.agent_thought,
            },
        )
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def get_executor_agent_feedback(state: State) -> str:
    """Summarize the latest decisions and the Executor's tool feedback.

    Returns "None." when no structured decisions have been issued yet;
    otherwise pairs the decisions with the Executor's tool messages.
    """
    if state.structured_decisions is None:
        return "None."
    tool_feedback = [msg for msg in state.executor_messages if isinstance(msg, ToolMessage)]
    sections = [
        f"Latest UI decisions:\n{state.structured_decisions}",
        f"Executor feedback:\n{tool_feedback}",
    ]
    return "\n\n".join(sections)
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
from typing import Optional
|
|
2
|
+
|
|
3
|
+
from pydantic import BaseModel, Field
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class CortexOutput(BaseModel):
    """Structured response produced by the Cortex agent.

    Per the cortex prompt, `decisions` and `complete_current_subgoal`
    are mutually exclusive: decisions are only issued while the current
    subgoal is still in progress.
    """

    # Stringified JSON describing the low-level actions for the Executor.
    decisions: str = Field(..., description="The decisions to be made. A stringified JSON object")
    # Natural-language reasoning persisted into the shared agent thoughts.
    agent_thought: str = Field(..., description="The agent's thought")
    # Defaults to False; True marks the current subgoal as done.
    complete_current_subgoal: Optional[bool] = Field(
        False, description="Whether the current subgoal is complete"
    )
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
## You are the **Executor**
|
|
2
|
+
|
|
3
|
+
Your job is to **interpret the structured decisions** provided by the **Cortex** agent and use the appropriate tools to act on a **{{ platform }} mobile device**.
|
|
4
|
+
|
|
5
|
+
### 🎯 Your Objective:
|
|
6
|
+
|
|
7
|
+
Given the `structured_decisions` (a stringified object) from the **Cortex** agent
|
|
8
|
+
and the previous tool calls, you must:
|
|
9
|
+
|
|
10
|
+
1. **Parse the structured decisions** into usable Python objects.
|
|
11
|
+
2. **Determine the most appropriate tool** to execute the intended action - **you can ONLY USE ONE**
|
|
12
|
+
3. **Invoke the tool accurately**, passing the required parameters.
|
|
13
|
+
4. For **the tool you invoke**, always provide a clear `agent_thought` argument:
|
|
14
|
+
|
|
15
|
+
- This is a natural-language sentence (or two) **explaining why** this tool is being invoked.
|
|
16
|
+
- Keep it short but informative.
|
|
17
|
+
- This is essential for debugging, traceability, and adaptation by other agents.
|
|
18
|
+
|
|
19
|
+
5. For **the tool you invoke**, always provide the `executor_metadata` argument:
|
|
20
|
+
|
|
21
|
+
- If you know you won't be able to achieve all Cortex decisions using the tool call you've chosen, set `retrigger` to `true` - otherwise set it to `false`
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
### 🧠 Example
|
|
26
|
+
|
|
27
|
+
**Structured Decisions from the **Cortex** agent**:
|
|
28
|
+
|
|
29
|
+
"I'm tapping on the chat item labeled 'Alice' to open the conversation."
|
|
30
|
+
|
|
31
|
+
```json
|
|
32
|
+
{
|
|
33
|
+
"action": "tap",
|
|
34
|
+
"target": {
|
|
35
|
+
"text": "Alice",
|
|
36
|
+
"resource_id": "com.whatsapp:id/conversation_item"
|
|
37
|
+
}
|
|
38
|
+
}
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
**→ Executor Action**:
|
|
42
|
+
|
|
43
|
+
Call the `tap_on_element` tool with:
|
|
44
|
+
|
|
45
|
+
- `resource_id = "com.whatsapp:id/conversation_item"`
|
|
46
|
+
- `text = "Alice"`
|
|
47
|
+
- `agent_thought = "I'm tapping on the chat item labeled 'Alice' to open the conversation."`
|
|
48
|
+
- `executor_metadata = {"retrigger": false}`
|
|
49
|
+
|
|
50
|
+
---
|
|
51
|
+
|
|
52
|
+
### ⚙️ Tools
|
|
53
|
+
|
|
54
|
+
- Tools may include actions like: `tap`, `swipe`, `start_app`, `stop_app`, `list_packages`, `get_current_focus`, etc.
|
|
55
|
+
- You **must not hardcode tool definitions** here.
|
|
56
|
+
- Just use the right tool based on what the `structured_decisions` requires.
|
|
57
|
+
- The tools are provided dynamically via LangGraph's tool binding mechanism.
|
|
58
|
+
|
|
59
|
+
#### 🔄 Text Clearing Best Practice
|
|
60
|
+
|
|
61
|
+
When you need to completely clear text from an input field, **DO NOT** simply use `erase_text` alone, as it only erases from the cursor position, backward. Instead:
|
|
62
|
+
|
|
63
|
+
1. **Use `long_press_on` first** to select the text field and bring up selection options
|
|
64
|
+
2. **Then use `erase_text`** to clear the selected content
|
|
65
|
+
|
|
66
|
+
This approach ensures the **entire text content** is removed, not just the portion before the cursor position. The long press will typically select all text in the field, making the subsequent erase operation more effective.
|
|
67
|
+
|
|
68
|
+
### 🔁 Final Notes
|
|
69
|
+
|
|
70
|
+
- **You do not need to reason or decide strategy** — that's the Cortex's job.
|
|
71
|
+
- You simply interpret and execute — like hands following the brain.
|
|
72
|
+
- The `agent_thought` must always clearly reflect _why_ the action is being performed.
|
|
73
|
+
- Be precise. Avoid vague or generic `agent_thought`s.
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
from jinja2 import Template
|
|
4
|
+
from langchain_core.messages import HumanMessage, SystemMessage
|
|
5
|
+
from langchain_core.messages.ai import AIMessage
|
|
6
|
+
from langchain_google_genai import ChatGoogleGenerativeAI
|
|
7
|
+
from minitap.mobile_use.context import MobileUseContext
|
|
8
|
+
from minitap.mobile_use.graph.state import State
|
|
9
|
+
from minitap.mobile_use.services.llm import get_llm
|
|
10
|
+
from minitap.mobile_use.tools.index import EXECUTOR_WRAPPERS_TOOLS, get_tools_from_wrappers
|
|
11
|
+
from minitap.mobile_use.utils.decorators import wrap_with_callbacks
|
|
12
|
+
from minitap.mobile_use.utils.logger import get_logger
|
|
13
|
+
|
|
14
|
+
logger = get_logger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class ExecutorNode:
    """Interprets the Cortex's structured decisions and issues one tool call.

    Short-circuits when there are no decisions to act on, and detects the
    case where a previous tool call crashed while retriggering the executor.
    """

    def __init__(self, ctx: MobileUseContext):
        self.ctx = ctx

    @wrap_with_callbacks(
        before=lambda: logger.info("Starting Executor Agent..."),
        on_success=lambda _: logger.success("Executor Agent"),
        on_failure=lambda _: logger.error("Executor Agent"),
    )
    async def __call__(self, state: State):
        structured_decisions = state.structured_decisions
        if not structured_decisions:
            logger.warning("No structured decisions found.")
            return state.sanitize_update(
                ctx=self.ctx,
                update={
                    "agents_thoughts": [
                        "No structured decisions found, I cannot execute anything."
                    ],
                },
            )

        history = state.executor_messages
        if len(history) > 0 and isinstance(history[-1], AIMessage):
            if len(history[-1].tool_calls) > 0:  # type: ignore
                # A previous tool call raised an uncaught exception while retrigerring the executor
                return state.sanitize_update(
                    ctx=self.ctx,
                    update={
                        "executor_retrigger": False,
                        "executor_failed": True,
                        "executor_messages": [state.messages[-1]],
                    },
                )

        system_message = Template(
            Path(__file__).parent.joinpath("executor.md").read_text(encoding="utf-8")
        ).render(platform=self.ctx.device.mobile_platform.value)
        cortex_last_thought = (
            state.cortex_last_thought if state.cortex_last_thought else state.agents_thoughts[-1]
        )
        messages = [
            SystemMessage(content=system_message),
            HumanMessage(content=cortex_last_thought),
            HumanMessage(content=structured_decisions),
            *history,
        ]

        llm = get_llm(ctx=self.ctx, name="executor")
        bind_kwargs = {
            "tools": get_tools_from_wrappers(self.ctx, EXECUTOR_WRAPPERS_TOOLS),
            "tool_choice": "auto",  # automatically select a tool call or none
        }
        # ChatGoogleGenerativeAI does not support the "parallel_tool_calls" keyword
        if not isinstance(llm, ChatGoogleGenerativeAI):
            bind_kwargs["parallel_tool_calls"] = False

        response = await llm.bind_tools(**bind_kwargs).ainvoke(messages)

        return state.sanitize_update(
            ctx=self.ctx,
            update={
                "cortex_last_thought": cortex_last_thought,
                "executor_messages": [response],
                "messages": [response],
            },
        )
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
from langchain_core.messages.ai import AIMessage
|
|
2
|
+
from minitap.mobile_use.graph.state import State
|
|
3
|
+
from minitap.mobile_use.utils.decorators import wrap_with_callbacks
|
|
4
|
+
from minitap.mobile_use.utils.logger import get_logger
|
|
5
|
+
|
|
6
|
+
logger = get_logger(__name__)
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@wrap_with_callbacks(
    before=lambda: logger.info("Starting Executor Context Cleaner..."),
    on_success=lambda _: logger.success("Executor Context Cleaner"),
    on_failure=lambda _: logger.error("Executor Context Cleaner"),
)
async def executor_context_cleaner_node(state: State):
    """Clears the executor context."""
    update: dict = {
        "executor_failed": False,
        "executor_retrigger": False,
    }
    exec_history = state.executor_messages
    if len(exec_history) > 0 and isinstance(exec_history[-1], AIMessage):
        if len(exec_history[-1].tool_calls) > 0:
            # A previous tool call raised an uncaught exception -> sanitize the executor messages
            tool_error_message = state.messages[-1]
            logger.error("Tool call failed with error: " + str(tool_error_message.content))
            update["executor_messages"] = [tool_error_message]
    return update
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
from langchain_core.messages import BaseMessage
|
|
2
|
+
from minitap.mobile_use.utils.conversations import is_tool_message
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def is_last_tool_message_take_screenshot(messages: list[BaseMessage]) -> bool:
    """Return True if the most recent tool message is a take_screenshot call.

    Scans the history backwards and decides based on the first tool
    message found; returns False when there is none.
    """
    for candidate in reversed(messages):
        if is_tool_message(candidate):
            return candidate.name == "take_screenshot"
    return False
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
## Hopper
|
|
2
|
+
|
|
3
|
+
Your goal is to analyze the input data and to pick only the most relevant information based on the current steps. We aim to reach the goal defined by the user as: {{ initial_goal }}
|
|
4
|
+
|
|
5
|
+
### Input
|
|
6
|
+
|
|
7
|
+
You have the list of steps we've done so far. We use those steps to track our progress to reach our goal. Here they are: {{ messages }}
|
|
8
|
+
|
|
9
|
+
Finally, here is the data that we receive from executing the last task. We will dig into this data to pick only the most relevant information to reach our goal. Keep the information as is, do not modify it since we will trigger actions based on it. Output this information in the output field, and you will describe what you did in the step field.
|
|
10
|
+
|
|
11
|
+
Here is the data you must dig into:
|
|
12
|
+
|
|
13
|
+
{{ data }}
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from typing import Sequence
|
|
3
|
+
|
|
4
|
+
from jinja2 import Template
|
|
5
|
+
from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage
|
|
6
|
+
from minitap.mobile_use.context import MobileUseContext
|
|
7
|
+
from minitap.mobile_use.services.llm import get_llm
|
|
8
|
+
from pydantic import BaseModel, Field
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class HopperOutput(BaseModel):
    """Structured response of the Hopper agent.

    Pairs the step that was just performed with the relevant data
    extracted verbatim from the last task's raw output.
    """

    # Description of the step taken, consistent with the plan so far.
    step: str = Field(
        description=(
            "The step that has been done, must be a valid one following the "
            "current steps and the current goal to achieve."
        )
    )
    # The relevant data, kept as-is so downstream actions can use it.
    output: str = Field(description="The interesting data extracted from the input data.")
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
async def hopper(
    ctx: MobileUseContext,
    initial_goal: str,
    messages: Sequence[BaseMessage],
    data: str,
) -> HopperOutput:
    """Extract only the goal-relevant information from `data`.

    Args:
        ctx: Shared mobile-use context (LLM configuration, device, ...).
        initial_goal: The user's overall goal, injected into the prompt.
        messages: Steps done so far, used to track progress in the prompt.
        data: Raw output from the last executed task to be filtered.

    Returns:
        A HopperOutput with the step performed and the extracted data.
    """
    print("Starting Hopper Agent", flush=True)
    system_message = Template(
        Path(__file__).parent.joinpath("hopper.md").read_text(encoding="utf-8")
    ).render(
        initial_goal=initial_goal,
        messages=messages,
    )
    # Keep the LLM prompt in its own variable instead of rebinding the
    # `messages` parameter to a list of a different type (the original
    # shadowed the Sequence[BaseMessage] argument with the prompt list).
    prompt_messages = [
        SystemMessage(content=system_message),
        HumanMessage(content=data),
    ]

    llm = get_llm(ctx=ctx, name="hopper", is_utils=True, temperature=0)
    structured_llm = llm.with_structured_output(HopperOutput)
    response: HopperOutput = await structured_llm.ainvoke(prompt_messages)  # type: ignore
    # The structured output already is a HopperOutput; return it directly
    # instead of rebuilding an identical copy field by field.
    return response
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
You are the **Orchestrator**.
|
|
2
|
+
|
|
3
|
+
Your role is to **decide what to do next**, based on the current execution state of a plan running on an **{{ platform }} mobile device**. You must assess the situation and choose between resuming, continuing, or replanning.
|
|
4
|
+
|
|
5
|
+
### Responsibilities
|
|
6
|
+
|
|
7
|
+
You're given:
|
|
8
|
+
|
|
9
|
+
- The current **subgoal plan**
|
|
10
|
+
- The current **subgoal** (which is marked as **PENDING** in the plan, but repeated here for your convenience)
|
|
11
|
+
- A list of **agent thoughts** (insights, obstacles, or reasoning gathered during execution)
|
|
12
|
+
- The original **initial goal**
|
|
13
|
+
|
|
14
|
+
You must then **choose what to do next**:
|
|
15
|
+
|
|
16
|
+
- `"resume"`: The current subgoal is clearly not finished, let's resume it. The status of the current subgoal will stay as `PENDING`.
|
|
17
|
+
- `"continue"`: Move to the next subgoal in the list. The current subgoal will be marked as `SUCCESS`. If the current subgoal is the final step of the plan: The "reason" field must contain the final answer to the user’s initial goal. If the current subgoal is not the final step: The "reason" field must explain why this subgoal is now considered complete before moving on.
|
|
18
|
+
- `"replan"`: The current plan no longer fits: the current subgoal will be marked as `FAILURE`. We need to define a new plan.
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
from jinja2 import Template
|
|
4
|
+
from langchain_core.messages import HumanMessage, SystemMessage
|
|
5
|
+
|
|
6
|
+
from minitap.mobile_use.agents.orchestrator.types import OrchestratorOutput, OrchestratorStatus
|
|
7
|
+
from minitap.mobile_use.agents.planner.utils import (
|
|
8
|
+
all_completed,
|
|
9
|
+
complete_current_subgoal,
|
|
10
|
+
fail_current_subgoal,
|
|
11
|
+
get_current_subgoal,
|
|
12
|
+
nothing_started,
|
|
13
|
+
start_next_subgoal,
|
|
14
|
+
)
|
|
15
|
+
from minitap.mobile_use.context import MobileUseContext
|
|
16
|
+
from minitap.mobile_use.graph.state import State
|
|
17
|
+
from minitap.mobile_use.services.llm import get_llm
|
|
18
|
+
from minitap.mobile_use.utils.decorators import wrap_with_callbacks
|
|
19
|
+
from minitap.mobile_use.utils.logger import get_logger
|
|
20
|
+
|
|
21
|
+
# Module-level logger for this agent's lifecycle and decision messages.
logger = get_logger(__name__)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class OrchestratorNode:
    """Decide how the subgoal plan should proceed: resume, continue, or replan.

    Renders the orchestrator prompt templates, asks the LLM for a structured
    decision, applies the matching plan mutation, and emits a sanitized state
    update.
    """

    def __init__(self, ctx: MobileUseContext):
        self.ctx = ctx

    @wrap_with_callbacks(
        before=lambda: logger.info("Starting Orchestrator Agent..."),
        on_success=lambda _: logger.success("Orchestrator Agent"),
        on_failure=lambda _: logger.error("Orchestrator Agent"),
    )
    async def __call__(self, state: State):
        # Nothing has run yet: promote the very first subgoal and report it.
        if nothing_started(state.subgoal_plan):
            state.subgoal_plan = start_next_subgoal(state.subgoal_plan)
            first = get_current_subgoal(state.subgoal_plan)
            return state.sanitize_update(
                ctx=self.ctx,
                update={
                    "agents_thoughts": [f"Starting the first subgoal: {first}"],
                    "subgoal_plan": state.subgoal_plan,
                },
            )

        current = get_current_subgoal(state.subgoal_plan)
        if not current:
            # Plan holds no pending subgoal; there is nothing to orchestrate.
            return state.sanitize_update(
                ctx=self.ctx,
                update={"agents_thoughts": ["No subgoal to go for."]},
            )

        # Both prompt files live next to this module.
        prompt_dir = Path(__file__).parent
        system_prompt = Template(
            prompt_dir.joinpath("orchestrator.md").read_text(encoding="utf-8")
        ).render(platform=self.ctx.device.mobile_platform.value)
        user_prompt = Template(
            prompt_dir.joinpath("human.md").read_text(encoding="utf-8")
        ).render(
            initial_goal=state.initial_goal,
            subgoal_plan="\n".join(str(s) for s in state.subgoal_plan),
            current_subgoal=str(current),
            agent_thoughts="\n".join(state.agents_thoughts),
        )

        llm = get_llm(ctx=self.ctx, name="orchestrator", temperature=1).with_structured_output(
            OrchestratorOutput
        )
        response: OrchestratorOutput = await llm.ainvoke(
            [
                SystemMessage(content=system_prompt),
                HumanMessage(content=user_prompt),
            ]
        )  # type: ignore

        if response.status == OrchestratorStatus.CONTINUE:
            # Current subgoal is done: either the plan is fully complete, or the
            # next subgoal gets promoted to PENDING.
            state.subgoal_plan = complete_current_subgoal(state.subgoal_plan)
            thoughts = [response.reason]

            if all_completed(state.subgoal_plan):
                logger.success("All the subgoals have been completed successfully.")
                return state.sanitize_update(
                    ctx=self.ctx,
                    update={
                        "subgoal_plan": state.subgoal_plan,
                        "agents_thoughts": thoughts,
                    },
                )

            state.subgoal_plan = start_next_subgoal(state.subgoal_plan)
            upcoming = get_current_subgoal(state.subgoal_plan)
            thoughts.append(f"==== NEXT SUBGOAL: {upcoming} ====")
            return state.sanitize_update(
                ctx=self.ctx,
                update={
                    "agents_thoughts": thoughts,
                    "subgoal_plan": state.subgoal_plan,
                },
            )

        if response.status == OrchestratorStatus.REPLAN:
            # Plan no longer fits: mark the current subgoal FAILURE and signal
            # that a new plan must be produced.
            thoughts = [response.reason]
            state.subgoal_plan = fail_current_subgoal(state.subgoal_plan)
            thoughts.append("==== END OF PLAN, REPLANNING ====")
            return state.sanitize_update(
                ctx=self.ctx,
                update={
                    "agents_thoughts": thoughts,
                    "subgoal_plan": state.subgoal_plan,
                },
            )

        # RESUME (default): the subgoal stays PENDING; only record the reason.
        return state.sanitize_update(
            ctx=self.ctx,
            update={
                "agents_thoughts": [response.reason],
            },
        )
|