minitap-mobile-use 0.0.1.dev0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of minitap-mobile-use might be problematic.

Files changed (95)
  1. minitap/mobile_use/__init__.py +0 -0
  2. minitap/mobile_use/agents/contextor/contextor.py +42 -0
  3. minitap/mobile_use/agents/cortex/cortex.md +93 -0
  4. minitap/mobile_use/agents/cortex/cortex.py +107 -0
  5. minitap/mobile_use/agents/cortex/types.py +11 -0
  6. minitap/mobile_use/agents/executor/executor.md +73 -0
  7. minitap/mobile_use/agents/executor/executor.py +84 -0
  8. minitap/mobile_use/agents/executor/executor_context_cleaner.py +27 -0
  9. minitap/mobile_use/agents/executor/utils.py +11 -0
  10. minitap/mobile_use/agents/hopper/hopper.md +13 -0
  11. minitap/mobile_use/agents/hopper/hopper.py +45 -0
  12. minitap/mobile_use/agents/orchestrator/human.md +13 -0
  13. minitap/mobile_use/agents/orchestrator/orchestrator.md +18 -0
  14. minitap/mobile_use/agents/orchestrator/orchestrator.py +114 -0
  15. minitap/mobile_use/agents/orchestrator/types.py +14 -0
  16. minitap/mobile_use/agents/outputter/human.md +25 -0
  17. minitap/mobile_use/agents/outputter/outputter.py +75 -0
  18. minitap/mobile_use/agents/outputter/test_outputter.py +107 -0
  19. minitap/mobile_use/agents/planner/human.md +12 -0
  20. minitap/mobile_use/agents/planner/planner.md +64 -0
  21. minitap/mobile_use/agents/planner/planner.py +64 -0
  22. minitap/mobile_use/agents/planner/types.py +44 -0
  23. minitap/mobile_use/agents/planner/utils.py +45 -0
  24. minitap/mobile_use/agents/summarizer/summarizer.py +34 -0
  25. minitap/mobile_use/clients/device_hardware_client.py +23 -0
  26. minitap/mobile_use/clients/ios_client.py +44 -0
  27. minitap/mobile_use/clients/screen_api_client.py +53 -0
  28. minitap/mobile_use/config.py +285 -0
  29. minitap/mobile_use/constants.py +2 -0
  30. minitap/mobile_use/context.py +65 -0
  31. minitap/mobile_use/controllers/__init__.py +0 -0
  32. minitap/mobile_use/controllers/mobile_command_controller.py +379 -0
  33. minitap/mobile_use/controllers/platform_specific_commands_controller.py +74 -0
  34. minitap/mobile_use/graph/graph.py +149 -0
  35. minitap/mobile_use/graph/state.py +73 -0
  36. minitap/mobile_use/main.py +122 -0
  37. minitap/mobile_use/sdk/__init__.py +12 -0
  38. minitap/mobile_use/sdk/agent.py +524 -0
  39. minitap/mobile_use/sdk/builders/__init__.py +10 -0
  40. minitap/mobile_use/sdk/builders/agent_config_builder.py +213 -0
  41. minitap/mobile_use/sdk/builders/index.py +15 -0
  42. minitap/mobile_use/sdk/builders/task_request_builder.py +218 -0
  43. minitap/mobile_use/sdk/constants.py +14 -0
  44. minitap/mobile_use/sdk/examples/README.md +45 -0
  45. minitap/mobile_use/sdk/examples/__init__.py +1 -0
  46. minitap/mobile_use/sdk/examples/simple_photo_organizer.py +76 -0
  47. minitap/mobile_use/sdk/examples/smart_notification_assistant.py +177 -0
  48. minitap/mobile_use/sdk/types/__init__.py +49 -0
  49. minitap/mobile_use/sdk/types/agent.py +73 -0
  50. minitap/mobile_use/sdk/types/exceptions.py +74 -0
  51. minitap/mobile_use/sdk/types/task.py +191 -0
  52. minitap/mobile_use/sdk/utils.py +28 -0
  53. minitap/mobile_use/servers/config.py +19 -0
  54. minitap/mobile_use/servers/device_hardware_bridge.py +212 -0
  55. minitap/mobile_use/servers/device_screen_api.py +143 -0
  56. minitap/mobile_use/servers/start_servers.py +151 -0
  57. minitap/mobile_use/servers/stop_servers.py +215 -0
  58. minitap/mobile_use/servers/utils.py +11 -0
  59. minitap/mobile_use/services/accessibility.py +100 -0
  60. minitap/mobile_use/services/llm.py +143 -0
  61. minitap/mobile_use/tools/index.py +54 -0
  62. minitap/mobile_use/tools/mobile/back.py +52 -0
  63. minitap/mobile_use/tools/mobile/copy_text_from.py +77 -0
  64. minitap/mobile_use/tools/mobile/erase_text.py +124 -0
  65. minitap/mobile_use/tools/mobile/input_text.py +74 -0
  66. minitap/mobile_use/tools/mobile/launch_app.py +59 -0
  67. minitap/mobile_use/tools/mobile/list_packages.py +78 -0
  68. minitap/mobile_use/tools/mobile/long_press_on.py +62 -0
  69. minitap/mobile_use/tools/mobile/open_link.py +59 -0
  70. minitap/mobile_use/tools/mobile/paste_text.py +66 -0
  71. minitap/mobile_use/tools/mobile/press_key.py +58 -0
  72. minitap/mobile_use/tools/mobile/run_flow.py +57 -0
  73. minitap/mobile_use/tools/mobile/stop_app.py +58 -0
  74. minitap/mobile_use/tools/mobile/swipe.py +56 -0
  75. minitap/mobile_use/tools/mobile/take_screenshot.py +70 -0
  76. minitap/mobile_use/tools/mobile/tap.py +66 -0
  77. minitap/mobile_use/tools/mobile/wait_for_animation_to_end.py +68 -0
  78. minitap/mobile_use/tools/tool_wrapper.py +33 -0
  79. minitap/mobile_use/utils/cli_helpers.py +40 -0
  80. minitap/mobile_use/utils/cli_selection.py +144 -0
  81. minitap/mobile_use/utils/conversations.py +31 -0
  82. minitap/mobile_use/utils/decorators.py +123 -0
  83. minitap/mobile_use/utils/errors.py +6 -0
  84. minitap/mobile_use/utils/file.py +13 -0
  85. minitap/mobile_use/utils/logger.py +184 -0
  86. minitap/mobile_use/utils/media.py +73 -0
  87. minitap/mobile_use/utils/recorder.py +55 -0
  88. minitap/mobile_use/utils/requests_utils.py +37 -0
  89. minitap/mobile_use/utils/shell_utils.py +20 -0
  90. minitap/mobile_use/utils/time.py +6 -0
  91. minitap/mobile_use/utils/ui_hierarchy.py +30 -0
  92. minitap_mobile_use-0.0.1.dev0.dist-info/METADATA +274 -0
  93. minitap_mobile_use-0.0.1.dev0.dist-info/RECORD +95 -0
  94. minitap_mobile_use-0.0.1.dev0.dist-info/WHEEL +4 -0
  95. minitap_mobile_use-0.0.1.dev0.dist-info/entry_points.txt +3 -0
minitap/mobile_use/__init__.py (file without changes)
minitap/mobile_use/agents/contextor/contextor.py
@@ -0,0 +1,42 @@
+ from minitap.mobile_use.agents.executor.utils import is_last_tool_message_take_screenshot
+ from minitap.mobile_use.controllers.mobile_command_controller import get_screen_data
+ from minitap.mobile_use.controllers.platform_specific_commands_controller import (
+     get_device_date,
+     get_focused_app_info,
+ )
+ from minitap.mobile_use.graph.state import State
+ from minitap.mobile_use.utils.decorators import wrap_with_callbacks
+ from minitap.mobile_use.utils.logger import get_logger
+ from minitap.mobile_use.context import MobileUseContext
+
+ logger = get_logger(__name__)
+
+
+ class ContextorNode:
+     def __init__(self, ctx: MobileUseContext):
+         self.ctx = ctx
+
+     @wrap_with_callbacks(
+         before=lambda: logger.info("Starting Contextor Agent"),
+         on_success=lambda _: logger.success("Contextor Agent"),
+         on_failure=lambda _: logger.error("Contextor Agent"),
+     )
+     def __call__(self, state: State):
+         device_data = get_screen_data(self.ctx.screen_api_client)
+         focused_app_info = get_focused_app_info(self.ctx)
+         device_date = get_device_date(self.ctx)
+
+         should_add_screenshot_context = is_last_tool_message_take_screenshot(list(state.messages))
+
+         return state.sanitize_update(
+             ctx=self.ctx,
+             update={
+                 "latest_screenshot_base64": device_data.base64
+                 if should_add_screenshot_context
+                 else None,
+                 "latest_ui_hierarchy": device_data.elements,
+                 "focused_app_info": focused_app_info,
+                 "screen_size": (device_data.width, device_data.height),
+                 "device_date": device_date,
+             },
+         )
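The node is a plain callable over the graph state, so wiring it into a LangGraph graph is direct. Here is a minimal sketch, assuming the wheel is installed; the node name and entry point are assumptions, since the real graph assembly lives in minitap/mobile_use/graph/graph.py (listed above but not reproduced in this excerpt).

```python
# Minimal wiring sketch; node names and edges are illustrative only.
from langgraph.graph import StateGraph, END

from minitap.mobile_use.agents.contextor.contextor import ContextorNode
from minitap.mobile_use.graph.state import State


def build_minimal_graph(ctx):
    graph = StateGraph(State)
    graph.add_node("contextor", ContextorNode(ctx))  # refreshes device state each turn
    graph.set_entry_point("contextor")
    graph.add_edge("contextor", END)
    return graph.compile()
```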
minitap/mobile_use/agents/cortex/cortex.md
@@ -0,0 +1,93 @@
+ ## You are the **Cortex**
+
+ Your job is to **analyze the current {{ platform }} mobile device state** and produce **structured decisions** to achieve the current subgoal.
+
+ You must act like a human brain, responsible for giving instructions to your hands (the **Executor** agent). Therefore, you must act with the same imprecision and uncertainty as a human when performing swipe actions: humans don't know exactly where they are swiping (always prefer percentages of width and height over absolute coordinates); they just know they are swiping up or down, left or right, and with how much force (usually amplified compared to what's truly needed; overshoot sliders, for instance).
+
+ ### Context You Receive:
+
+ You are provided with:
+
+ - 📱 **Device state**:
+
+   - Latest **UI hierarchy**
+   - (Optional) Latest **screenshot (base64)**. You can query one if you need it by calling the take_screenshot tool. Often, the UI hierarchy is enough to understand what is happening on the screen.
+   - Current **focused app info**
+   - **Screen size** and **device date**
+
+ - 🧭 **Task context**:
+
+   - The user's **initial goal**
+   - The **subgoal plan** with their statuses
+   - The **current subgoal** to act on (the one marked `PENDING` in the plan)
+   - A list of **agent thoughts** (previous reasoning, observations about the environment)
+   - **Executor agent feedback** on the latest UI decisions
+
+ ### Your Mission:
+
+ Focus on the **current subgoal**.
+
+ 1. **Analyze the UI** and environment to understand what action is required.
+ 2.1. If the **subgoal is completed**, set the `complete_current_subgoal` field to `True`. To justify your conclusion, you will fill in the `agent_thought` field based on:
+
+      - The current UI state
+      - Past agent thoughts
+      - Recent tool effects
+ 2.2. Otherwise, output a **stringified structured set of instructions** that an **Executor agent** can perform on a real mobile device:
+
+      - These must be **concrete low-level actions**: back, tap, swipe, launch app, list packages, close app, input text, paste, erase text, copy, etc.
+      - If you refer to a UI element or coordinates, specify it clearly (e.g., `resource-id: com.whatsapp:id/search`, `text: "Alice"`, `x: 100, y: 200`).
+      - **The structure is up to you**, but it must be valid **JSON stringified output**. You will accompany this output with a **natural-language summary** of your reasoning and approach in your agent thought.
+      - When you want to launch/stop an app, prefer using its package name.
+      - **Only reference UI element IDs or visible texts that are explicitly present in the provided UI hierarchy or screenshot. Do not invent, infer, or guess any IDs or texts that are not directly observed.**
+      - **For text clearing**: When you need to completely clear text from an input field, always use **LONG PRESS** first to select the text field, then erase. Do NOT use tap + erase, as this only clears from the cursor position.
+
+ ### Output
+
+ - **Structured Decisions**:
+   A **valid stringified JSON** describing what should be executed **right now** to advance the current subgoal **IF THE SUBGOAL IS NOT COMPLETED**.
+
+ - **Agent Thought** _(1-2 sentences)_:
+   If there is any information you need to remember for later steps, you must include it here, because only the agent thoughts will be used to produce the final structured output.
+
+   This also helps other agents understand your decision and learn from future failures.
+   You must also use this field to mention checkpoints when you perform actions without a definite ending: for instance, "Swiping up to reveal more recipes - last seen recipe was <ID or NAME>, stop when no more."
+
+ - **Subgoal Completion** _(boolean)_:
+   Set to true if the current subgoal has been successfully completed - you **cannot set it to true and provide structured decisions at the same time**. You must base your decision ONLY on what you have as input (device state, agent thoughts, executor feedback, etc.) - NEVER on the decisions you have produced.
+
+ ---
+
+ ### Example
+
+ #### Current Subgoal:
+
+ > "Search for Alice in WhatsApp"
+
+ #### Structured Decisions:
+
+ ```text
+ "{\"action\": \"tap\", \"target\": {\"resource_id\": \"com.whatsapp:id/menuitem_search\", \"text\": \"Search\"}}"
+ ```
+
+ #### Agent Thought:
+
+ > I will tap the search icon at the top of the WhatsApp interface to begin searching for Alice.
+
+ ### Input
+
+ **Initial Goal:**
+ {{ initial_goal }}
+
+ **Subgoal Plan:**
+ {{ subgoal_plan }}
+
+ **Current Subgoal (what needs to be done right now):**
+ {{ current_subgoal }}
+
+ **Agent thoughts (previous reasoning, observations about the environment):**
+ {{ agents_thoughts }}
+
+ **Executor agent feedback on latest UI decisions:**
+
+ {{ executor_feedback }}
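To make the "stringified JSON" contract concrete, here is a hedged sketch of how a percentage-based swipe decision could be produced. The key names are illustrative assumptions, since the prompt deliberately leaves the JSON structure up to the model.

```python
import json

# Illustrative decision only: "action", "direction", and the start/end
# percentage keys are assumptions, not a schema defined by the package.
decision = {
    "action": "swipe",
    "direction": "up",
    "start": {"x_pct": 50, "y_pct": 80},  # percentages of screen size, not pixels
    "end": {"x_pct": 50, "y_pct": 20},
}
structured_decisions = json.dumps(decision)  # the stringified JSON the Executor receives
```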
minitap/mobile_use/agents/cortex/cortex.py
@@ -0,0 +1,107 @@
+ import json
+ from pathlib import Path
+
+ from jinja2 import Template
+ from langchain_core.messages import (
+     AIMessage,
+     HumanMessage,
+     RemoveMessage,
+     SystemMessage,
+     ToolMessage,
+ )
+ from langgraph.graph.message import REMOVE_ALL_MESSAGES
+ from minitap.mobile_use.agents.cortex.types import CortexOutput
+ from minitap.mobile_use.agents.planner.utils import get_current_subgoal
+ from minitap.mobile_use.context import MobileUseContext
+ from minitap.mobile_use.graph.state import State
+ from minitap.mobile_use.services.llm import get_llm, with_fallback
+ from minitap.mobile_use.utils.conversations import get_screenshot_message_for_llm
+ from minitap.mobile_use.utils.decorators import wrap_with_callbacks
+ from minitap.mobile_use.utils.logger import get_logger
+
+ logger = get_logger(__name__)
+
+
+ class CortexNode:
+     def __init__(self, ctx: MobileUseContext):
+         self.ctx = ctx
+
+     @wrap_with_callbacks(
+         before=lambda: logger.info("Starting Cortex Agent..."),
+         on_success=lambda _: logger.success("Cortex Agent"),
+         on_failure=lambda _: logger.error("Cortex Agent"),
+     )
+     async def __call__(self, state: State):
+         executor_feedback = get_executor_agent_feedback(state)
+
+         system_message = Template(
+             Path(__file__).parent.joinpath("cortex.md").read_text(encoding="utf-8")
+         ).render(
+             platform=self.ctx.device.mobile_platform.value,
+             initial_goal=state.initial_goal,
+             subgoal_plan=state.subgoal_plan,
+             current_subgoal=get_current_subgoal(state.subgoal_plan),
+             agents_thoughts=state.agents_thoughts,
+             executor_feedback=executor_feedback,
+         )
+         messages = [
+             SystemMessage(content=system_message),
+             HumanMessage(
+                 content="Here are my device info:\n"
+                 + self.ctx.device.to_str()
+                 + (f"Device date: {state.device_date}\n" if state.device_date else "")
+                 + (
+                     f"Focused app info: {state.focused_app_info}\n"
+                     if state.focused_app_info
+                     else ""
+                 )
+             ),
+         ]
+         for thought in state.agents_thoughts:
+             messages.append(AIMessage(content=thought))
+
+         if state.latest_screenshot_base64:
+             messages.append(get_screenshot_message_for_llm(state.latest_screenshot_base64))
+             logger.info("Added screenshot to context")
+
+         if state.latest_ui_hierarchy:
+             ui_hierarchy_dict: list[dict] = state.latest_ui_hierarchy
+             ui_hierarchy_str = json.dumps(ui_hierarchy_dict, indent=2, ensure_ascii=False)
+             messages.append(HumanMessage(content="Here is the UI hierarchy:\n" + ui_hierarchy_str))
+
+         llm = get_llm(ctx=self.ctx, name="cortex", temperature=1).with_structured_output(
+             CortexOutput
+         )
+         llm_fallback = get_llm(
+             ctx=self.ctx, name="cortex", use_fallback=True, temperature=1
+         ).with_structured_output(CortexOutput)
+         response: CortexOutput = await with_fallback(
+             main_call=lambda: llm.ainvoke(messages),
+             fallback_call=lambda: llm_fallback.ainvoke(messages),
+         )  # type: ignore
+
+         is_subgoal_completed = response.complete_current_subgoal
+         return state.sanitize_update(
+             ctx=self.ctx,
+             update={
+                 "agents_thoughts": [response.agent_thought],
+                 "structured_decisions": response.decisions if not is_subgoal_completed else None,
+                 "latest_screenshot_base64": None,
+                 "latest_ui_hierarchy": None,
+                 "focused_app_info": None,
+                 "device_date": None,
+                 # Executor related fields
+                 "executor_messages": [RemoveMessage(id=REMOVE_ALL_MESSAGES)],
+                 "cortex_last_thought": response.agent_thought,
+             },
+         )
+
+
+ def get_executor_agent_feedback(state: State) -> str:
+     if state.structured_decisions is None:
+         return "None."
+     executor_tool_messages = [m for m in state.executor_messages if isinstance(m, ToolMessage)]
+     return (
+         f"Latest UI decisions:\n{state.structured_decisions}"
+         + "\n\n"
+         + f"Executor feedback:\n{executor_tool_messages}"
+     )
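`with_fallback` is imported from services/llm.py, which is not reproduced in this excerpt. Based on how it is called above (two zero-argument async callables), a plausible sketch of the pattern is the following; the real implementation may retry or log differently.

```python
# Hedged reconstruction of the fallback pattern, not the published source.
from typing import Awaitable, Callable, TypeVar

T = TypeVar("T")


async def with_fallback(
    main_call: Callable[[], Awaitable[T]],
    fallback_call: Callable[[], Awaitable[T]],
) -> T:
    try:
        return await main_call()
    except Exception:
        # Any failure of the primary model routes the request to the fallback model.
        return await fallback_call()
```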
minitap/mobile_use/agents/cortex/types.py
@@ -0,0 +1,11 @@
+ from typing import Optional
+
+ from pydantic import BaseModel, Field
+
+
+ class CortexOutput(BaseModel):
+     decisions: str = Field(..., description="The decisions to be made. A stringified JSON object")
+     agent_thought: str = Field(..., description="The agent's thought")
+     complete_current_subgoal: Optional[bool] = Field(
+         False, description="Whether the current subgoal is complete"
+     )
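Since `CortexOutput` is a plain Pydantic model, its contract is easy to exercise directly (assuming the wheel is installed). Note that the prompt's rule that `decisions` and subgoal completion are mutually exclusive is enforced by the prompt, not by the model itself.

```python
import json

from minitap.mobile_use.agents.cortex.types import CortexOutput

out = CortexOutput(
    decisions=json.dumps({"action": "tap", "target": {"text": "Alice"}}),
    agent_thought="Tapping the search result for Alice.",
)
assert out.complete_current_subgoal is False  # defaults to False when omitted
```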
minitap/mobile_use/agents/executor/executor.md
@@ -0,0 +1,73 @@
+ ## You are the **Executor**
+
+ Your job is to **interpret the structured decisions** provided by the **Cortex** agent and use the appropriate tools to act on a **{{ platform }} mobile device**.
+
+ ### 🎯 Your Objective:
+
+ Given the `structured_decisions` (a stringified object) from the **Cortex** agent
+ and the previous tool calls, you must:
+
+ 1. **Parse the structured decisions** into usable Python objects.
+ 2. **Determine the most appropriate tool** to execute the intended action - **you can ONLY USE ONE**.
+ 3. **Invoke the tool accurately**, passing the required parameters.
+ 4. For **the tool you invoke**, always provide a clear `agent_thought` argument:
+
+    - This is a natural-language sentence (or two) **explaining why** this tool is being invoked.
+    - Keep it short but informative.
+    - This is essential for debugging, traceability, and adaptation by other agents.
+
+ 5. For **the tool you invoke**, always provide the `executor_metadata` argument:
+
+    - If you know you won't be able to achieve all Cortex decisions using the tool call you've chosen, set `retrigger` to `true` - otherwise set it to `false`.
+
+ ---
+
+ ### 🧠 Example
+
+ **Structured Decisions from the Cortex agent**:
+
+ "I'm tapping on the chat item labeled 'Alice' to open the conversation."
+
+ ```json
+ {
+   "action": "tap",
+   "target": {
+     "text": "Alice",
+     "resource_id": "com.whatsapp:id/conversation_item"
+   }
+ }
+ ```
+
+ **→ Executor Action**:
+
+ Call the `tap_on_element` tool with:
+
+ - `resource_id = "com.whatsapp:id/conversation_item"`
+ - `text = "Alice"`
+ - `agent_thought = "I'm tapping on the chat item labeled 'Alice' to open the conversation."`
+ - `executor_metadata = {"retrigger": false}`
+
+ ---
+
+ ### ⚙️ Tools
+
+ - Tools may include actions like: `tap`, `swipe`, `start_app`, `stop_app`, `list_packages`, `get_current_focus`, etc.
+ - You **must not hardcode tool definitions** here.
+ - Just use the right tool based on what the `structured_decisions` requires.
+ - The tools are provided dynamically via LangGraph's tool binding mechanism.
+
+ #### 🔄 Text Clearing Best Practice
+
+ When you need to completely clear text from an input field, **DO NOT** simply use `erase_text` alone, as it only erases backward from the cursor position. Instead:
+
+ 1. **Use `long_press_on` first** to select the text field and bring up selection options.
+ 2. **Then use `erase_text`** to clear the selected content.
+
+ This approach ensures the **entire text content** is removed, not just the portion before the cursor position. The long press will typically select all text in the field, making the subsequent erase operation more effective.
+
+ ### 🔁 Final Notes
+
+ - **You do not need to reason or decide strategy** — that's the Cortex's job.
+ - You simply interpret and execute — like hands following the brain.
+ - The `agent_thought` must always clearly reflect _why_ the action is being performed.
+ - Be precise. Avoid vague or generic `agent_thought`s.
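Expressed as the two tool invocations the prompt prescribes, the clearing sequence looks roughly like the sketch below. The argument names are assumptions that mirror the prompt's examples; the real signatures live in tools/mobile/long_press_on.py and tools/mobile/erase_text.py (listed above but not shown).

```python
# Hypothetical tool-call payloads illustrating the long-press-then-erase pattern;
# these are not verified tool signatures.
clear_field_calls = [
    {
        "name": "long_press_on",
        "args": {
            "resource_id": "com.example:id/search_input",
            "agent_thought": "Long-pressing the field to select its text before clearing.",
            "executor_metadata": {"retrigger": True},  # erase_text still has to run
        },
    },
    {
        "name": "erase_text",
        "args": {
            "agent_thought": "Erasing the selected text to fully clear the field.",
            "executor_metadata": {"retrigger": False},
        },
    },
]
```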
minitap/mobile_use/agents/executor/executor.py
@@ -0,0 +1,84 @@
+ from pathlib import Path
+
+ from jinja2 import Template
+ from langchain_core.messages import HumanMessage, SystemMessage
+ from langchain_core.messages.ai import AIMessage
+ from langchain_google_genai import ChatGoogleGenerativeAI
+ from minitap.mobile_use.context import MobileUseContext
+ from minitap.mobile_use.graph.state import State
+ from minitap.mobile_use.services.llm import get_llm
+ from minitap.mobile_use.tools.index import EXECUTOR_WRAPPERS_TOOLS, get_tools_from_wrappers
+ from minitap.mobile_use.utils.decorators import wrap_with_callbacks
+ from minitap.mobile_use.utils.logger import get_logger
+
+ logger = get_logger(__name__)
+
+
+ class ExecutorNode:
+     def __init__(self, ctx: MobileUseContext):
+         self.ctx = ctx
+
+     @wrap_with_callbacks(
+         before=lambda: logger.info("Starting Executor Agent..."),
+         on_success=lambda _: logger.success("Executor Agent"),
+         on_failure=lambda _: logger.error("Executor Agent"),
+     )
+     async def __call__(self, state: State):
+         structured_decisions = state.structured_decisions
+         if not structured_decisions:
+             logger.warning("No structured decisions found.")
+             return state.sanitize_update(
+                 ctx=self.ctx,
+                 update={
+                     "agents_thoughts": [
+                         "No structured decisions found, I cannot execute anything."
+                     ],
+                 },
+             )
+
+         if len(state.executor_messages) > 0 and isinstance(state.executor_messages[-1], AIMessage):
+             if len(state.executor_messages[-1].tool_calls) > 0:  # type: ignore
+                 # A previous tool call raised an uncaught exception while retriggering the executor
+                 return state.sanitize_update(
+                     ctx=self.ctx,
+                     update={
+                         "executor_retrigger": False,
+                         "executor_failed": True,
+                         "executor_messages": [state.messages[-1]],
+                     },
+                 )
+
+         system_message = Template(
+             Path(__file__).parent.joinpath("executor.md").read_text(encoding="utf-8")
+         ).render(platform=self.ctx.device.mobile_platform.value)
+         cortex_last_thought = (
+             state.cortex_last_thought if state.cortex_last_thought else state.agents_thoughts[-1]
+         )
+         messages = [
+             SystemMessage(content=system_message),
+             HumanMessage(content=cortex_last_thought),
+             HumanMessage(content=structured_decisions),
+             *state.executor_messages,
+         ]
+
+         llm = get_llm(ctx=self.ctx, name="executor")
+         llm_bind_tools_kwargs = {
+             "tools": get_tools_from_wrappers(self.ctx, EXECUTOR_WRAPPERS_TOOLS),
+             "tool_choice": "auto",  # automatically select a tool call or none
+         }
+
+         # ChatGoogleGenerativeAI does not support the "parallel_tool_calls" keyword
+         if not isinstance(llm, ChatGoogleGenerativeAI):
+             llm_bind_tools_kwargs["parallel_tool_calls"] = False
+
+         llm = llm.bind_tools(**llm_bind_tools_kwargs)
+         response = await llm.ainvoke(messages)
+
+         return state.sanitize_update(
+             ctx=self.ctx,
+             update={
+                 "cortex_last_thought": cortex_last_thought,
+                 "executor_messages": [response],
+                 "messages": [response],
+             },
+         )
minitap/mobile_use/agents/executor/executor_context_cleaner.py
@@ -0,0 +1,27 @@
+ from langchain_core.messages.ai import AIMessage
+ from minitap.mobile_use.graph.state import State
+ from minitap.mobile_use.utils.decorators import wrap_with_callbacks
+ from minitap.mobile_use.utils.logger import get_logger
+
+ logger = get_logger(__name__)
+
+
+ @wrap_with_callbacks(
+     before=lambda: logger.info("Starting Executor Context Cleaner..."),
+     on_success=lambda _: logger.success("Executor Context Cleaner"),
+     on_failure=lambda _: logger.error("Executor Context Cleaner"),
+ )
+ async def executor_context_cleaner_node(state: State):
+     """Clears the executor context."""
+     update: dict = {
+         "executor_failed": False,
+         "executor_retrigger": False,
+     }
+     if len(state.executor_messages) > 0 and isinstance(state.executor_messages[-1], AIMessage):
+         last_executor_message = state.executor_messages[-1]
+         if len(last_executor_message.tool_calls) > 0:
+             # A previous tool call raised an uncaught exception -> sanitize the executor messages
+             tool_error_message = state.messages[-1]
+             logger.error("Tool call failed with error: " + str(tool_error_message.content))
+             update["executor_messages"] = [tool_error_message]
+     return update
minitap/mobile_use/agents/executor/utils.py
@@ -0,0 +1,11 @@
+ from langchain_core.messages import BaseMessage
+ from minitap.mobile_use.utils.conversations import is_tool_message
+
+
+ def is_last_tool_message_take_screenshot(messages: list[BaseMessage]) -> bool:
+     if not messages:
+         return False
+     for msg in messages[::-1]:
+         if is_tool_message(msg):
+             return msg.name == "take_screenshot"
+     return False
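A quick check of the helper's behavior: it scans backward to the most recent tool message, and only that one decides the outcome. A minimal sketch, assuming the wheel is installed:

```python
from langchain_core.messages import AIMessage, ToolMessage

from minitap.mobile_use.agents.executor.utils import is_last_tool_message_take_screenshot

history = [
    ToolMessage(content="ok", name="tap", tool_call_id="call_1"),
    AIMessage(content="Now taking a screenshot."),
    ToolMessage(content="<base64>", name="take_screenshot", tool_call_id="call_2"),
]
assert is_last_tool_message_take_screenshot(history)       # latest tool message wins
assert not is_last_tool_message_take_screenshot(history[:1])  # earlier "tap" does not count
```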
minitap/mobile_use/agents/hopper/hopper.md
@@ -0,0 +1,13 @@
+ ## Hopper
+
+ Your goal is to analyze the input data and to pick only the most relevant information based on the current steps. We aim to reach the goal defined by the user as: {{ initial_goal }}
+
+ ### Input
+
+ You have the list of steps we've done so far. We use those steps to track our progress toward our goal. Here they are: {{ messages }}
+
+ Finally, here is the data we received from executing the last task. We will dig into this data to pick only the most relevant information for reaching our goal. Keep the information as is, do not modify it, since we will trigger actions based on it. Output this information in the output field, and describe what you did in the step field.
+
+ Here is the data you must dig into:
+
+ {{ data }}
minitap/mobile_use/agents/hopper/hopper.py
@@ -0,0 +1,45 @@
+ from pathlib import Path
+ from typing import Sequence
+
+ from jinja2 import Template
+ from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage
+ from minitap.mobile_use.context import MobileUseContext
+ from minitap.mobile_use.services.llm import get_llm
+ from pydantic import BaseModel, Field
+
+
+ class HopperOutput(BaseModel):
+     step: str = Field(
+         description=(
+             "The step that has been done, must be a valid one following the "
+             "current steps and the current goal to achieve."
+         )
+     )
+     output: str = Field(description="The interesting data extracted from the input data.")
+
+
+ async def hopper(
+     ctx: MobileUseContext,
+     initial_goal: str,
+     messages: Sequence[BaseMessage],
+     data: str,
+ ) -> HopperOutput:
+     print("Starting Hopper Agent", flush=True)
+     system_message = Template(
+         Path(__file__).parent.joinpath("hopper.md").read_text(encoding="utf-8")
+     ).render(
+         initial_goal=initial_goal,
+         messages=messages,
+     )
+     messages = [
+         SystemMessage(content=system_message),
+         HumanMessage(content=data),
+     ]
+
+     llm = get_llm(ctx=ctx, name="hopper", is_utils=True, temperature=0)
+     structured_llm = llm.with_structured_output(HopperOutput)
+     response: HopperOutput = await structured_llm.ainvoke(messages)  # type: ignore
+     return HopperOutput(
+         step=response.step,
+         output=response.output,
+     )
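An illustrative call might look like the following; `ctx` is a placeholder for a configured `MobileUseContext`, whose construction is handled elsewhere in the package and not shown in this excerpt.

```python
# Sketch only: the goal, history, and data payload are made-up placeholders.
from langchain_core.messages import AIMessage

from minitap.mobile_use.agents.hopper.hopper import hopper


async def demo(ctx):
    result = await hopper(
        ctx=ctx,
        initial_goal="Find Alice's latest message in WhatsApp",
        messages=[AIMessage(content="Opened WhatsApp and reached the chat list.")],
        data='{"chats": [{"name": "Alice", "preview": "See you at 6"}]}',
    )
    print(result.step)    # natural-language description of the step taken
    print(result.output)  # the extracted data, unmodified
```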
minitap/mobile_use/agents/orchestrator/human.md
@@ -0,0 +1,13 @@
+ Here is your input.
+
+ ---
+
+ **Initial goal**: {{ initial_goal }}
+
+ **Subgoal plan**
+ {{ subgoal_plan }}
+
+ **Current subgoal**: {{ current_subgoal }}
+
+ **Agent thoughts**
+ {{ agent_thoughts }}
minitap/mobile_use/agents/orchestrator/orchestrator.md
@@ -0,0 +1,18 @@
+ You are the **Orchestrator**.
+
+ Your role is to **decide what to do next**, based on the current execution state of a plan running on an **{{ platform }} mobile device**. You must assess the situation and choose between resuming, continuing, or replanning.
+
+ ### Responsibilities
+
+ You're given:
+
+ - The current **subgoal plan**
+ - The current **subgoal** (which is marked as **PENDING** in the plan, but repeated here for your convenience)
+ - A list of **agent thoughts** (insights, obstacles, or reasoning gathered during execution)
+ - The original **initial goal**
+
+ You must then **choose what to do next**:
+
+ - `"resume"`: The current subgoal is clearly not finished; resume it. The status of the current subgoal will stay `PENDING`.
+ - `"continue"`: Move to the next subgoal in the list. The current subgoal will be marked as `SUCCESS`. If the current subgoal is the final step of the plan, the "reason" field must contain the final answer to the user's initial goal. If it is not the final step, the "reason" field must explain why this subgoal is now considered complete before moving on.
+ - `"replan"`: The current plan no longer fits; the current subgoal will be marked as `FAILURE`, and we need to define a new plan.
minitap/mobile_use/agents/orchestrator/orchestrator.py
@@ -0,0 +1,114 @@
+ from pathlib import Path
+
+ from jinja2 import Template
+ from langchain_core.messages import HumanMessage, SystemMessage
+
+ from minitap.mobile_use.agents.orchestrator.types import OrchestratorOutput, OrchestratorStatus
+ from minitap.mobile_use.agents.planner.utils import (
+     all_completed,
+     complete_current_subgoal,
+     fail_current_subgoal,
+     get_current_subgoal,
+     nothing_started,
+     start_next_subgoal,
+ )
+ from minitap.mobile_use.context import MobileUseContext
+ from minitap.mobile_use.graph.state import State
+ from minitap.mobile_use.services.llm import get_llm
+ from minitap.mobile_use.utils.decorators import wrap_with_callbacks
+ from minitap.mobile_use.utils.logger import get_logger
+
+ logger = get_logger(__name__)
+
+
+ class OrchestratorNode:
+     def __init__(self, ctx: MobileUseContext):
+         self.ctx = ctx
+
+     @wrap_with_callbacks(
+         before=lambda: logger.info("Starting Orchestrator Agent..."),
+         on_success=lambda _: logger.success("Orchestrator Agent"),
+         on_failure=lambda _: logger.error("Orchestrator Agent"),
+     )
+     async def __call__(self, state: State):
+         if nothing_started(state.subgoal_plan):
+             state.subgoal_plan = start_next_subgoal(state.subgoal_plan)
+             new_subgoal = get_current_subgoal(state.subgoal_plan)
+             return state.sanitize_update(
+                 ctx=self.ctx,
+                 update={
+                     "agents_thoughts": [f"Starting the first subgoal: {new_subgoal}"],
+                     "subgoal_plan": state.subgoal_plan,
+                 },
+             )
+
+         current_subgoal = get_current_subgoal(state.subgoal_plan)
+
+         if not current_subgoal:
+             return state.sanitize_update(
+                 ctx=self.ctx,
+                 update={"agents_thoughts": ["No subgoal to go for."]},
+             )
+
+         system_message = Template(
+             Path(__file__).parent.joinpath("orchestrator.md").read_text(encoding="utf-8")
+         ).render(platform=self.ctx.device.mobile_platform.value)
+         human_message = Template(
+             Path(__file__).parent.joinpath("human.md").read_text(encoding="utf-8")
+         ).render(
+             initial_goal=state.initial_goal,
+             subgoal_plan="\n".join(str(s) for s in state.subgoal_plan),
+             current_subgoal=str(current_subgoal),
+             agent_thoughts="\n".join(state.agents_thoughts),
+         )
+         messages = [
+             SystemMessage(content=system_message),
+             HumanMessage(content=human_message),
+         ]
+
+         llm = get_llm(ctx=self.ctx, name="orchestrator", temperature=1)
+         llm = llm.with_structured_output(OrchestratorOutput)
+         response: OrchestratorOutput = await llm.ainvoke(messages)  # type: ignore
+
+         if response.status == OrchestratorStatus.CONTINUE:
+             state.subgoal_plan = complete_current_subgoal(state.subgoal_plan)
+             thoughts = [response.reason]
+
+             if all_completed(state.subgoal_plan):
+                 logger.success("All the subgoals have been completed successfully.")
+                 return state.sanitize_update(
+                     ctx=self.ctx,
+                     update={
+                         "subgoal_plan": state.subgoal_plan,
+                         "agents_thoughts": thoughts,
+                     },
+                 )
+             state.subgoal_plan = start_next_subgoal(state.subgoal_plan)
+             new_subgoal = get_current_subgoal(state.subgoal_plan)
+             thoughts.append(f"==== NEXT SUBGOAL: {new_subgoal} ====")
+             return state.sanitize_update(
+                 ctx=self.ctx,
+                 update={
+                     "agents_thoughts": thoughts,
+                     "subgoal_plan": state.subgoal_plan,
+                 },
+             )
+
+         elif response.status == OrchestratorStatus.REPLAN:
+             thoughts = [response.reason]
+             state.subgoal_plan = fail_current_subgoal(state.subgoal_plan)
+             thoughts.append("==== END OF PLAN, REPLANNING ====")
+             return state.sanitize_update(
+                 ctx=self.ctx,
+                 update={
+                     "agents_thoughts": thoughts,
+                     "subgoal_plan": state.subgoal_plan,
+                 },
+             )
+
+         return state.sanitize_update(
+             ctx=self.ctx,
+             update={
+                 "agents_thoughts": [response.reason],
+             },
+         )
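The `OrchestratorOutput` and `OrchestratorStatus` types come from agents/orchestrator/types.py (+14 lines in this release, not shown here). A hedged reconstruction consistent with the prompt's three verbs and the fields used above:

```python
# Reconstruction from usage, not the published source; details may differ.
from enum import Enum

from pydantic import BaseModel, Field


class OrchestratorStatus(str, Enum):
    RESUME = "resume"      # keep working on the current subgoal (stays PENDING)
    CONTINUE = "continue"  # mark it SUCCESS and move to the next subgoal
    REPLAN = "replan"      # mark it FAILURE and build a new plan


class OrchestratorOutput(BaseModel):
    status: OrchestratorStatus
    reason: str = Field(description="Justification for the decision, or the final answer.")
```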