minitap-mobile-use 3.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- minitap/mobile_use/__init__.py +0 -0
- minitap/mobile_use/agents/contextor/contextor.md +55 -0
- minitap/mobile_use/agents/contextor/contextor.py +175 -0
- minitap/mobile_use/agents/contextor/types.py +36 -0
- minitap/mobile_use/agents/cortex/cortex.md +135 -0
- minitap/mobile_use/agents/cortex/cortex.py +152 -0
- minitap/mobile_use/agents/cortex/types.py +15 -0
- minitap/mobile_use/agents/executor/executor.md +42 -0
- minitap/mobile_use/agents/executor/executor.py +87 -0
- minitap/mobile_use/agents/executor/tool_node.py +152 -0
- minitap/mobile_use/agents/hopper/hopper.md +15 -0
- minitap/mobile_use/agents/hopper/hopper.py +44 -0
- minitap/mobile_use/agents/orchestrator/human.md +12 -0
- minitap/mobile_use/agents/orchestrator/orchestrator.md +21 -0
- minitap/mobile_use/agents/orchestrator/orchestrator.py +134 -0
- minitap/mobile_use/agents/orchestrator/types.py +11 -0
- minitap/mobile_use/agents/outputter/human.md +25 -0
- minitap/mobile_use/agents/outputter/outputter.py +85 -0
- minitap/mobile_use/agents/outputter/test_outputter.py +167 -0
- minitap/mobile_use/agents/planner/human.md +14 -0
- minitap/mobile_use/agents/planner/planner.md +126 -0
- minitap/mobile_use/agents/planner/planner.py +101 -0
- minitap/mobile_use/agents/planner/types.py +51 -0
- minitap/mobile_use/agents/planner/utils.py +70 -0
- minitap/mobile_use/agents/summarizer/summarizer.py +35 -0
- minitap/mobile_use/agents/video_analyzer/__init__.py +5 -0
- minitap/mobile_use/agents/video_analyzer/human.md +5 -0
- minitap/mobile_use/agents/video_analyzer/video_analyzer.md +37 -0
- minitap/mobile_use/agents/video_analyzer/video_analyzer.py +111 -0
- minitap/mobile_use/clients/browserstack_client.py +477 -0
- minitap/mobile_use/clients/idb_client.py +429 -0
- minitap/mobile_use/clients/ios_client.py +332 -0
- minitap/mobile_use/clients/ios_client_config.py +141 -0
- minitap/mobile_use/clients/ui_automator_client.py +330 -0
- minitap/mobile_use/clients/wda_client.py +526 -0
- minitap/mobile_use/clients/wda_lifecycle.py +367 -0
- minitap/mobile_use/config.py +413 -0
- minitap/mobile_use/constants.py +3 -0
- minitap/mobile_use/context.py +106 -0
- minitap/mobile_use/controllers/__init__.py +0 -0
- minitap/mobile_use/controllers/android_controller.py +524 -0
- minitap/mobile_use/controllers/controller_factory.py +46 -0
- minitap/mobile_use/controllers/device_controller.py +182 -0
- minitap/mobile_use/controllers/ios_controller.py +436 -0
- minitap/mobile_use/controllers/platform_specific_commands_controller.py +199 -0
- minitap/mobile_use/controllers/types.py +106 -0
- minitap/mobile_use/controllers/unified_controller.py +193 -0
- minitap/mobile_use/graph/graph.py +160 -0
- minitap/mobile_use/graph/state.py +115 -0
- minitap/mobile_use/main.py +309 -0
- minitap/mobile_use/sdk/__init__.py +12 -0
- minitap/mobile_use/sdk/agent.py +1294 -0
- minitap/mobile_use/sdk/builders/__init__.py +10 -0
- minitap/mobile_use/sdk/builders/agent_config_builder.py +307 -0
- minitap/mobile_use/sdk/builders/index.py +15 -0
- minitap/mobile_use/sdk/builders/task_request_builder.py +236 -0
- minitap/mobile_use/sdk/constants.py +1 -0
- minitap/mobile_use/sdk/examples/README.md +83 -0
- minitap/mobile_use/sdk/examples/__init__.py +1 -0
- minitap/mobile_use/sdk/examples/app_lock_messaging.py +54 -0
- minitap/mobile_use/sdk/examples/platform_manual_task_example.py +67 -0
- minitap/mobile_use/sdk/examples/platform_minimal_example.py +48 -0
- minitap/mobile_use/sdk/examples/simple_photo_organizer.py +76 -0
- minitap/mobile_use/sdk/examples/smart_notification_assistant.py +225 -0
- minitap/mobile_use/sdk/examples/video_transcription_example.py +117 -0
- minitap/mobile_use/sdk/services/cloud_mobile.py +656 -0
- minitap/mobile_use/sdk/services/platform.py +434 -0
- minitap/mobile_use/sdk/types/__init__.py +51 -0
- minitap/mobile_use/sdk/types/agent.py +84 -0
- minitap/mobile_use/sdk/types/exceptions.py +138 -0
- minitap/mobile_use/sdk/types/platform.py +183 -0
- minitap/mobile_use/sdk/types/task.py +269 -0
- minitap/mobile_use/sdk/utils.py +29 -0
- minitap/mobile_use/services/accessibility.py +100 -0
- minitap/mobile_use/services/llm.py +247 -0
- minitap/mobile_use/services/telemetry.py +421 -0
- minitap/mobile_use/tools/index.py +67 -0
- minitap/mobile_use/tools/mobile/back.py +52 -0
- minitap/mobile_use/tools/mobile/erase_one_char.py +56 -0
- minitap/mobile_use/tools/mobile/focus_and_clear_text.py +317 -0
- minitap/mobile_use/tools/mobile/focus_and_input_text.py +153 -0
- minitap/mobile_use/tools/mobile/launch_app.py +86 -0
- minitap/mobile_use/tools/mobile/long_press_on.py +169 -0
- minitap/mobile_use/tools/mobile/open_link.py +62 -0
- minitap/mobile_use/tools/mobile/press_key.py +83 -0
- minitap/mobile_use/tools/mobile/stop_app.py +62 -0
- minitap/mobile_use/tools/mobile/swipe.py +156 -0
- minitap/mobile_use/tools/mobile/tap.py +154 -0
- minitap/mobile_use/tools/mobile/video_recording.py +177 -0
- minitap/mobile_use/tools/mobile/wait_for_delay.py +81 -0
- minitap/mobile_use/tools/scratchpad.py +147 -0
- minitap/mobile_use/tools/test_utils.py +413 -0
- minitap/mobile_use/tools/tool_wrapper.py +16 -0
- minitap/mobile_use/tools/types.py +35 -0
- minitap/mobile_use/tools/utils.py +336 -0
- minitap/mobile_use/utils/app_launch_utils.py +173 -0
- minitap/mobile_use/utils/cli_helpers.py +37 -0
- minitap/mobile_use/utils/cli_selection.py +143 -0
- minitap/mobile_use/utils/conversations.py +31 -0
- minitap/mobile_use/utils/decorators.py +124 -0
- minitap/mobile_use/utils/errors.py +6 -0
- minitap/mobile_use/utils/file.py +13 -0
- minitap/mobile_use/utils/logger.py +183 -0
- minitap/mobile_use/utils/media.py +186 -0
- minitap/mobile_use/utils/recorder.py +52 -0
- minitap/mobile_use/utils/requests_utils.py +37 -0
- minitap/mobile_use/utils/shell_utils.py +20 -0
- minitap/mobile_use/utils/test_ui_hierarchy.py +178 -0
- minitap/mobile_use/utils/time.py +6 -0
- minitap/mobile_use/utils/ui_hierarchy.py +132 -0
- minitap/mobile_use/utils/video.py +281 -0
- minitap_mobile_use-3.3.0.dist-info/METADATA +329 -0
- minitap_mobile_use-3.3.0.dist-info/RECORD +115 -0
- minitap_mobile_use-3.3.0.dist-info/WHEEL +4 -0
- minitap_mobile_use-3.3.0.dist-info/entry_points.txt +3 -0
|
File without changes
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
## You are the **Contextor Agent**
|
|
2
|
+
|
|
3
|
+
Verify app lock compliance. Decide: **relaunch locked app** or **allow deviation**.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## Context
|
|
8
|
+
|
|
9
|
+
- **Locked app:** `{{ locked_app_package }}`
|
|
10
|
+
- **Current app:** `{{ current_app_package }}` ← Different from the locked app?
|
|
11
|
+
|
|
12
|
+
**Default: RELAUNCH.** Only allow deviation with clear justification.
|
|
13
|
+
|
|
14
|
+
---
|
|
15
|
+
|
|
16
|
+
## Allow Deviation ONLY If
|
|
17
|
+
|
|
18
|
+
All conditions met:
|
|
19
|
+
1. **Intentional** - Agent thoughts show explicit plan to use current app
|
|
20
|
+
2. **Necessary** - Current app required for task (not just convenient)
|
|
21
|
+
3. **Valid pattern**: OAuth/login flow, payment, system permissions, SMS/email verification, deep link
|
|
22
|
+
|
|
23
|
+
## Relaunch If ANY True
|
|
24
|
+
|
|
25
|
+
- Current app unrelated to task
|
|
26
|
+
- Deviation looks accidental (no intent in agent thoughts)
|
|
27
|
+
- Current app cannot help complete task
|
|
28
|
+
- When in doubt → **RELAUNCH**
|
|
29
|
+
|
|
30
|
+
---
|
|
31
|
+
|
|
32
|
+
## Output
|
|
33
|
+
|
|
34
|
+
```json
|
|
35
|
+
{
|
|
36
|
+
"should_relaunch_app": true/false,
|
|
37
|
+
"reasoning": "2-4 sentences explaining decision"
|
|
38
|
+
}
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
---
|
|
42
|
+
|
|
43
|
+
## Input
|
|
44
|
+
|
|
45
|
+
**Task Goal:** {{ task_goal }}
|
|
46
|
+
|
|
47
|
+
**Subgoal Plan:** {{ subgoal_plan }}
|
|
48
|
+
|
|
49
|
+
**Locked App:** {{ locked_app_package }}
|
|
50
|
+
|
|
51
|
+
**Current App:** {{ current_app_package }}
|
|
52
|
+
|
|
53
|
+
**Agent Thoughts:**
|
|
54
|
+
{% for thought in agents_thoughts %}- {{ thought }}
|
|
55
|
+
{% endfor %}
|
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
from jinja2 import Template
|
|
4
|
+
from langchain_core.messages import HumanMessage, SystemMessage
|
|
5
|
+
|
|
6
|
+
from minitap.mobile_use.agents.contextor.types import AppLockVerificationOutput, ContextorOutput
|
|
7
|
+
from minitap.mobile_use.agents.planner.types import Subgoal
|
|
8
|
+
from minitap.mobile_use.context import MobileUseContext
|
|
9
|
+
from minitap.mobile_use.controllers.controller_factory import create_device_controller
|
|
10
|
+
from minitap.mobile_use.controllers.platform_specific_commands_controller import (
|
|
11
|
+
get_current_foreground_package,
|
|
12
|
+
get_device_date,
|
|
13
|
+
)
|
|
14
|
+
from minitap.mobile_use.graph.state import State
|
|
15
|
+
from minitap.mobile_use.services.llm import get_llm, invoke_llm_with_timeout_message, with_fallback
|
|
16
|
+
from minitap.mobile_use.utils.app_launch_utils import launch_app_with_retries
|
|
17
|
+
from minitap.mobile_use.utils.decorators import wrap_with_callbacks
|
|
18
|
+
from minitap.mobile_use.utils.logger import get_logger
|
|
19
|
+
|
|
20
|
+
logger = get_logger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class ContextorNode:
    """Graph node that refreshes the device context and enforces the optional app lock.

    Each call captures the screen data, the foreground package and the device
    date. When an app lock is configured and its initial launch succeeded, it
    also checks that the locked app is still in the foreground and, if not,
    asks the contextor LLM whether to relaunch it or allow the deviation.
    """

    def __init__(self, ctx: MobileUseContext):
        self.ctx = ctx

    @wrap_with_callbacks(
        before=lambda: logger.info("Starting Contextor Agent"),
        on_success=lambda _: logger.success("Contextor Agent"),
        on_failure=lambda _: logger.error("Contextor Agent"),
    )
    async def __call__(self, state: State):
        """Collect the current device context and run the app-lock check if configured.

        Args:
            state: Current graph state.

        Returns:
            The result of ``state.asanitize_update`` carrying the fresh UI
            hierarchy, screenshot, focused app, screen size, device date and
            the app-lock outcome message (if any).
        """
        device_controller = create_device_controller(self.ctx)
        device_data = await device_controller.get_screen_data()
        current_app_package = get_current_foreground_package(self.ctx)
        device_date = get_device_date(self.ctx)
        agent_outcome: str | None = None

        if self.ctx.execution_setup and self.ctx.execution_setup.app_lock_status:
            locked_app_package = self.ctx.execution_setup.app_lock_status.locked_app_package
            should_verify_app_lock = (
                self.ctx.execution_setup.app_lock_status.locked_app_initial_launch_success
            )
            if should_verify_app_lock:
                if current_app_package:
                    try:
                        verification: AppLockVerificationOutput = (
                            await self._handle_app_lock_verification(
                                state=state,
                                current_app_package=current_app_package,
                                locked_app_package=locked_app_package,
                            )
                        )
                        agent_outcome = verification.to_optional_message()
                    except Exception as e:
                        # Best effort: a failed verification must not abort the
                        # context refresh, so it is logged and the node continues.
                        logger.error(f"Failed to verify app lock: {e}")
                else:
                    logger.warning(
                        f"App lock feature is setup for {locked_app_package}, "
                        "but could not determine current app, skipping"
                    )
            else:
                logger.warning(
                    f"App lock feature is setup for {locked_app_package}, "
                    "but initial launch was not successful, skipping"
                )

        return await state.asanitize_update(
            ctx=self.ctx,
            update={
                "latest_ui_hierarchy": device_data.elements,
                "latest_screenshot": device_data.base64,
                "focused_app_info": current_app_package,
                "screen_size": (device_data.width, device_data.height),
                "device_date": device_date,
                # NOTE(review): agent_outcome is None when no app-lock message was
                # produced; presumably asanitize_update tolerates a None entry — confirm.
                "agents_thoughts": [agent_outcome],
            },
            agent="contextor",
        )

    async def _handle_app_lock_verification(
        self, state: State, current_app_package: str, locked_app_package: str
    ) -> AppLockVerificationOutput:
        """Verify app lock compliance and decide whether to relaunch the locked app.

        Args:
            state: Current graph state (goal, subgoal plan and agent thoughts
                are forwarded to the LLM).
            current_app_package: Package currently in the foreground.
            locked_app_package: Package the session is locked to; replaced by
                the value from the execution setup once that is known to exist.

        Returns:
            An ``AppLockVerificationOutput`` whose status is one of
            ``already_in_foreground``, ``relaunched``, ``allowed_deviation``
            or ``error``.
        """
        if not self.ctx.execution_setup or not self.ctx.execution_setup.app_lock_status:
            return AppLockVerificationOutput(
                package_name=locked_app_package,
                reasoning="App lock feature is not setup",
                status="error",
            )

        app_lock_status = self.ctx.execution_setup.app_lock_status
        locked_app_package = app_lock_status.locked_app_package

        if current_app_package == locked_app_package:
            logger.info(f"App lock verified: current app matches locked app ({locked_app_package})")
            return AppLockVerificationOutput(
                package_name=locked_app_package,
                status="already_in_foreground",
            )

        logger.warning(
            f"App lock violation detected: expected '{locked_app_package}', "
            f"but current app is '{current_app_package}'"
        )

        decision: ContextorOutput = await self._invoke_contextor_llm(
            initial_goal=state.initial_goal,
            subgoal_plan=state.subgoal_plan,
            agents_thoughts=state.agents_thoughts,
            locked_app_package=locked_app_package,
            current_app_package=current_app_package,
        )

        if decision.should_relaunch_app:
            logger.info(f"Relaunching locked app: {locked_app_package}")
            success, error = await launch_app_with_retries(self.ctx, app_package=locked_app_package)
            if not success:
                logger.error(f"Failed to relaunch {locked_app_package}: {error}")
                return AppLockVerificationOutput(
                    package_name=locked_app_package,
                    reasoning=f"Failed to relaunch app: {error}",
                    status="error",
                )
            return AppLockVerificationOutput(
                package_name=locked_app_package,
                reasoning=decision.reasoning,
                status="relaunched",
            )

        logger.info(f"Allowing app deviation to: {current_app_package}")
        return AppLockVerificationOutput(
            package_name=locked_app_package,
            reasoning=decision.reasoning,
            status="allowed_deviation",
        )

    async def _invoke_contextor_llm(
        self,
        initial_goal: str,
        subgoal_plan: list[Subgoal],
        agents_thoughts: list[str],
        locked_app_package: str,
        current_app_package: str,
    ) -> ContextorOutput:
        """Invoke the LLM to decide whether to relaunch the locked app.

        Renders ``contextor.md`` (located next to this module) as the system
        prompt and calls the "contextor" LLM with structured output, using the
        configured fallback model if the main call fails.
        """

        MAX_AGENTS_THOUGHTS = 25

        system_message = Template(
            Path(__file__).parent.joinpath("contextor.md").read_text(encoding="utf-8")
        ).render(
            task_goal=initial_goal,
            subgoal_plan="\n".join([str(subgoal) for subgoal in subgoal_plan]),
            locked_app_package=locked_app_package,
            current_app_package=current_app_package,
            # Keep the most recent thoughts: the deviation just happened, so only
            # the latest entries can show whether leaving the app was intentional.
            agents_thoughts=agents_thoughts[-MAX_AGENTS_THOUGHTS:],
        )

        messages = [
            SystemMessage(content=system_message),
            HumanMessage(content="Please make your decision."),
        ]

        llm = get_llm(ctx=self.ctx, name="contextor").with_structured_output(ContextorOutput)
        llm_fallback = get_llm(
            ctx=self.ctx, name="contextor", use_fallback=True
        ).with_structured_output(ContextorOutput)

        response: ContextorOutput = await with_fallback(
            main_call=lambda: invoke_llm_with_timeout_message(llm.ainvoke(messages)),
            fallback_call=lambda: invoke_llm_with_timeout_message(llm_fallback.ainvoke(messages)),
        )  # type: ignore

        return response
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
from typing import Literal
|
|
2
|
+
|
|
3
|
+
from pydantic import BaseModel, Field
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class ContextorOutput(BaseModel):
    """Output schema for the Contextor agent decision."""

    # This model is passed to `with_structured_output`, so the docstring, field
    # names and descriptions are part of what the LLM receives — keep them stable.

    # True when the locked app must be brought back to the foreground.
    should_relaunch_app: bool = Field(..., description="Whether to relaunch the locked app")
    # Free-text justification returned by the LLM alongside the decision.
    reasoning: str = Field(
        ..., description="Explanation of why we should or should not relaunch the app"
    )
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class AppLockVerificationOutput(BaseModel):
    # Result of a single app-lock verification pass: which package was checked,
    # the optional reasoning behind the decision and the final status.
    package_name: str = Field(..., description="Package name of the app that was verified")
    reasoning: str | None = Field(default=None, description="Reasoning for the decision")
    status: Literal["already_in_foreground", "relaunched", "allowed_deviation", "error"] = Field(
        ..., description="Status of the decision"
    )

    def to_optional_message(self) -> str | None:
        """Return a human-readable summary of the outcome, or None if nothing happened.

        ``already_in_foreground`` yields no message. ``error`` yields a fixed
        message that ignores ``reasoning``. The other statuses yield
        "<reasoning> App <package> <suffix>", with the reasoning omitted when
        it is empty.
        """
        if self.status == "already_in_foreground":
            return None
        if self.status == "error":
            return f"Could not verify app lock for {self.package_name}."

        label = f"App {self.package_name}"
        prefix = f"{self.reasoning} " if self.reasoning else ""
        if self.status == "relaunched":
            return f"{prefix}{label} was successfully relaunched ✅"
        if self.status == "allowed_deviation":
            return f"{prefix}{label} was allowed deviation ⚠️"
        # Any other status falls through with the bare message.
        return f"{prefix}{label}"
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
## You are the **Cortex**
|
|
2
|
+
|
|
3
|
+
You analyze the {{ platform }} mobile device state and produce structured decisions to achieve subgoals. You are the brain giving instructions to the Executor (your hands).
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## 🚨 CRITICAL RULES (Read First)
|
|
8
|
+
|
|
9
|
+
### 1. Analyze Agent Thoughts Before Acting
|
|
10
|
+
Before ANY decision, review agent thoughts history to:
|
|
11
|
+
- Detect **repeated failures** → change strategy, don't retry blindly
|
|
12
|
+
- Spot **contradictions** between plan and reality
|
|
13
|
+
- Learn from what worked/failed
|
|
14
|
+
|
|
15
|
+
### 2. Never Repeat Failed Actions
|
|
16
|
+
If something failed, understand WHY before trying again. Ask: "How would a human solve this differently?"
|
|
17
|
+
|
|
18
|
+
### 3. Unpredictable Actions = Isolate Them
|
|
19
|
+
These actions change the screen unpredictably: `back`, `launch_app`, `stop_app`, `open_link`, navigation taps.
|
|
20
|
+
**Rule:** If your decision includes one of these, it MUST be the ONLY action in that turn. Wait to see the new screen before deciding next steps.
|
|
21
|
+
|
|
22
|
+
### 4. Complete Goals Only on OBSERVED Evidence
|
|
23
|
+
Never mark a goal complete "in advance". Only complete based on executor feedback confirming success.
|
|
24
|
+
|
|
25
|
+
### 5. Data Fidelity Over "Helpfulness"
|
|
26
|
+
For any data-related task: transcribe content **exactly as-is** unless explicitly told otherwise.
|
|
27
|
+
|
|
28
|
+
---
|
|
29
|
+
|
|
30
|
+
## 📱 Perception
|
|
31
|
+
|
|
32
|
+
You have 2 senses:
|
|
33
|
+
|
|
34
|
+
| Sense | Use For | Limitation |
|
|
35
|
+
|-------|---------|------------|
|
|
36
|
+
| **UI Hierarchy** | Find elements by resource-id, text, bounds | No visual info (colors, images, obscured elements) |
|
|
37
|
+
| **Screenshot** | Visual context, verify elements are visible, visual cues (badges, colors, icons) | Can't reliably extract precise element coordinates from pixels |
|
|
38
|
+
|
|
39
|
+
You must combine your 2 senses to cancel out the limitations of each.
|
|
40
|
+
|
|
41
|
+
---
|
|
42
|
+
|
|
43
|
+
## 🎯 Element Targeting (MANDATORY)
|
|
44
|
+
|
|
45
|
+
When targeting ANY element (tap, input, clear...), provide ALL available info:
|
|
46
|
+
|
|
47
|
+
```json
|
|
48
|
+
{
|
|
49
|
+
"target": {
|
|
50
|
+
"resource_id": "com.app:id/button",
|
|
51
|
+
"resource_id_index": 0,
|
|
52
|
+
"bounds": {"x": 100, "y": 200, "width": 50, "height": 50},
|
|
53
|
+
"text": "Submit",
|
|
54
|
+
"text_index": 0
|
|
55
|
+
}
|
|
56
|
+
}
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
- `resource_id_index` = index among elements with same resource_id
|
|
60
|
+
- `text_index` = index among elements with same text
|
|
61
|
+
- This enables **fallback**: if ID fails → tries bounds → tries text
|
|
62
|
+
|
|
63
|
+
**On tap failure:** "Out of bounds" = stale bounds. "No element found" = screen changed. Adapt, don't retry blindly.
|
|
64
|
+
|
|
65
|
+
---
|
|
66
|
+
|
|
67
|
+
## 🔧 Tools & Actions
|
|
68
|
+
|
|
69
|
+
Available tools: {{ executor_tools_list }}
|
|
70
|
+
|
|
71
|
+
| Action | Tool | Notes |
|
|
72
|
+
|--------|------|-------|
|
|
73
|
+
| **Open app** | `launch_app` | **ALWAYS use first** with app name (e.g., "WhatsApp"). Only try app drawer manually if launch_app fails. |
|
|
74
|
+
| Open URL | `open_link` | Handles deep links correctly |
|
|
75
|
+
| Type text | `focus_and_input_text` | Focuses + types. Verify if feedback shows empty. To create a blank line between paragraphs, use \n\n. |
|
|
76
|
+
| Clear text | `focus_and_clear_text` | If fails, try: long press → select all → `erase_one_char` |
|
|
77
|
+
|
|
78
|
+
### Swipe Physics
|
|
79
|
+
Swipe direction "pushes" the screen: **swipe RIGHT → reveals LEFT page** (and vice versa).
|
|
80
|
+
Default to **percentage-based** swipes. Use coordinates only for precise controls (sliders).
|
|
81
|
+
Memory aid: Swipe RIGHT (low→high x) to see LEFT page. Swipe LEFT (high→low x) to see RIGHT page.
|
|
82
|
+
|
|
83
|
+
### Form Filling
|
|
84
|
+
Before concluding a field is missing, **scroll through the entire form** to verify all fields. If you observed a field earlier but can't find it now, scroll back - don't assume it's gone.
|
|
85
|
+
**Rule:** Never input data into the wrong field if the correct field was previously observed.
|
|
86
|
+
|
|
87
|
+
{% if locked_app_package %}
|
|
88
|
+
---
|
|
89
|
+
|
|
90
|
+
## 🔒 App Lock Mode
|
|
91
|
+
|
|
92
|
+
Session locked to: **{{ locked_app_package }}**
|
|
93
|
+
- Stay within this app
|
|
94
|
+
- Avoid navigating away unless necessary (e.g., OAuth)
|
|
95
|
+
- The Contextor agent will relaunch the locked app if you leave it accidentally
|
|
96
|
+
{% endif %}
|
|
97
|
+
|
|
98
|
+
---
|
|
99
|
+
|
|
100
|
+
## 📤 Output Format
|
|
101
|
+
|
|
102
|
+
| Field | Required | Description |
|
|
103
|
+
|-------|----------|-------------|
|
|
104
|
+
| **complete_subgoals_by_ids** | Optional | IDs of subgoals to mark complete (based on OBSERVED evidence) |
|
|
105
|
+
| **Structured Decisions** | Optional | Valid JSON string of actions to execute |
|
|
106
|
+
| **Decisions Reason** | Required | 2-4 sentences: analyze agent thoughts → explain decision → note strategy changes |
|
|
107
|
+
| **Goals Completion Reason** | Required | Why completing these goals, or "None" |
|
|
108
|
+
|
|
109
|
+
---
|
|
110
|
+
|
|
111
|
+
## 📝 Example
|
|
112
|
+
|
|
113
|
+
**Subgoal:** "Send 'Hello!' to Alice on WhatsApp"
|
|
114
|
+
|
|
115
|
+
**Context:** Agent thoughts show previous turn typed "Hello!" successfully. UI shows message in field + send button visible.
|
|
116
|
+
|
|
117
|
+
**Output:**
|
|
118
|
+
```
|
|
119
|
+
complete_subgoals_by_ids: ["subgoal-4-type-message"]
|
|
120
|
+
Structured Decisions: "[{\"action\": \"tap\", \"target\": {\"resource_id\": \"com.whatsapp:id/send\", \"resource_id_index\": 0, \"bounds\": {\"x\": 950, \"y\": 1800, \"width\": 100, \"height\": 100}}}]"
|
|
121
|
+
Decisions Reason: Agent thoughts confirm typing succeeded. Completing typing subgoal based on observed evidence. Now tapping send with full target info.
|
|
122
|
+
Goals Completion Reason: Executor feedback confirmed "Hello!" was entered successfully.
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
---
|
|
126
|
+
|
|
127
|
+
## Input
|
|
128
|
+
|
|
129
|
+
**Initial Goal:** {{ initial_goal }}
|
|
130
|
+
|
|
131
|
+
**Subgoal Plan:** {{ subgoal_plan }}
|
|
132
|
+
|
|
133
|
+
**Current Subgoal:** {{ current_subgoal }}
|
|
134
|
+
|
|
135
|
+
**Executor Feedback:** {{ executor_feedback }}
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
from jinja2 import Template
|
|
5
|
+
from langchain_core.messages import (
|
|
6
|
+
AIMessage,
|
|
7
|
+
HumanMessage,
|
|
8
|
+
RemoveMessage,
|
|
9
|
+
SystemMessage,
|
|
10
|
+
ToolMessage,
|
|
11
|
+
)
|
|
12
|
+
from langgraph.graph.message import REMOVE_ALL_MESSAGES
|
|
13
|
+
|
|
14
|
+
from minitap.mobile_use.agents.cortex.types import CortexOutput
|
|
15
|
+
from minitap.mobile_use.agents.planner.utils import get_current_subgoal
|
|
16
|
+
from minitap.mobile_use.constants import EXECUTOR_MESSAGES_KEY
|
|
17
|
+
from minitap.mobile_use.context import MobileUseContext
|
|
18
|
+
from minitap.mobile_use.controllers.controller_factory import create_device_controller
|
|
19
|
+
from minitap.mobile_use.graph.state import State
|
|
20
|
+
from minitap.mobile_use.services.llm import get_llm, invoke_llm_with_timeout_message, with_fallback
|
|
21
|
+
from minitap.mobile_use.services.telemetry import telemetry
|
|
22
|
+
from minitap.mobile_use.tools.index import (
|
|
23
|
+
EXECUTOR_WRAPPERS_TOOLS,
|
|
24
|
+
VIDEO_RECORDING_WRAPPERS,
|
|
25
|
+
format_tools_list,
|
|
26
|
+
)
|
|
27
|
+
from minitap.mobile_use.utils.conversations import get_screenshot_message_for_llm
|
|
28
|
+
from minitap.mobile_use.utils.decorators import wrap_with_callbacks
|
|
29
|
+
from minitap.mobile_use.utils.logger import get_logger
|
|
30
|
+
|
|
31
|
+
logger = get_logger(__name__)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class CortexNode:
    """Graph node that asks the Cortex LLM for the next structured decisions.

    It renders ``cortex.md`` as the system prompt, attaches device info, the
    agent-thought history, the latest UI hierarchy and screenshot, then stores
    the LLM's decisions and reasoning back into the graph state.
    """

    def __init__(self, ctx: MobileUseContext):
        self.ctx = ctx

    @wrap_with_callbacks(
        before=lambda: logger.info("Starting Cortex Agent..."),
        on_success=lambda _: logger.success("Cortex Agent"),
        on_failure=lambda _: logger.error("Cortex Agent"),
    )
    async def __call__(self, state: State):
        """Build the Cortex prompt, invoke the LLM and return the state update.

        Args:
            state: Current graph state.

        Returns:
            The result of ``state.asanitize_update`` carrying the new agent
            thought, the structured decisions and the subgoal IDs to complete;
            the consumed device snapshot fields are reset to ``None`` and the
            executor messages are cleared.
        """
        executor_feedback = get_executor_agent_feedback(state)

        current_locked_app_package = (
            self.ctx.execution_setup.get_locked_app_package() if self.ctx.execution_setup else None
        )

        executor_wrappers = list(EXECUTOR_WRAPPERS_TOOLS)
        if self.ctx.video_recording_enabled:
            executor_wrappers.extend(VIDEO_RECORDING_WRAPPERS)

        system_message = Template(
            Path(__file__).parent.joinpath("cortex.md").read_text(encoding="utf-8")
        ).render(
            platform=self.ctx.device.mobile_platform.value,
            initial_goal=state.initial_goal,
            subgoal_plan=state.subgoal_plan,
            current_subgoal=get_current_subgoal(state.subgoal_plan),
            executor_feedback=executor_feedback,
            executor_tools_list=format_tools_list(ctx=self.ctx, wrappers=executor_wrappers),
            locked_app_package=current_locked_app_package,
        )

        # Build the device-info message from independent parts. A single chained
        # `a + b if x else "" + c if y else ""` expression would make the whole
        # concatenation conditional, because the conditional operator binds
        # looser than `+`.
        device_info_parts = ["Here are my device info:\n", self.ctx.device.to_str()]
        if state.device_date:
            device_info_parts.append(f"Device date: {state.device_date}\n")
        if state.focused_app_info:
            device_info_parts.append(f"Focused app info: {state.focused_app_info}\n")

        messages = [
            SystemMessage(content=system_message),
            HumanMessage(content="".join(device_info_parts)),
        ]
        for thought in state.agents_thoughts:
            messages.append(AIMessage(content=thought))

        if state.latest_ui_hierarchy:
            ui_hierarchy_dict: list[dict] = state.latest_ui_hierarchy
            ui_hierarchy_str = json.dumps(ui_hierarchy_dict, indent=2, ensure_ascii=False)
            messages.append(HumanMessage(content="Here is the UI hierarchy:\n" + ui_hierarchy_str))

        if state.latest_screenshot:
            controller = create_device_controller(self.ctx)
            compressed_image_base64 = controller.get_compressed_b64_screenshot(
                state.latest_screenshot
            )
            messages.append(get_screenshot_message_for_llm(compressed_image_base64))

        llm = get_llm(ctx=self.ctx, name="cortex", temperature=1).with_structured_output(
            CortexOutput
        )
        llm_fallback = get_llm(
            ctx=self.ctx, name="cortex", use_fallback=True, temperature=1
        ).with_structured_output(CortexOutput)
        response: CortexOutput = await with_fallback(
            main_call=lambda: invoke_llm_with_timeout_message(llm.ainvoke(messages)),
            fallback_call=lambda: invoke_llm_with_timeout_message(llm_fallback.ainvoke(messages)),
        )  # type: ignore

        # Strings the LLM may emit to mean "nothing"; normalised to None below.
        EMPTY_STRING_TOKENS = ["{}", "[]", "null", "", "None"]

        if response.decisions in EMPTY_STRING_TOKENS:
            response.decisions = None
        if response.goals_completion_reason in EMPTY_STRING_TOKENS:
            response.goals_completion_reason = None

        thought_parts = []
        if response.decisions_reason:
            thought_parts.append(f"Decisions reason: {response.decisions_reason}")
        if response.goals_completion_reason:
            thought_parts.append(f"Goals completion reason: {response.goals_completion_reason}")

        agent_thought = "\n\n".join(thought_parts)

        # Capture cortex decision telemetry (only non-sensitive flags)
        telemetry.capture_cortex_decision(
            task_id=self.ctx.trace_id,
            has_decisions=response.decisions is not None,
            has_goals_completion=response.goals_completion_reason is not None,
            completed_subgoals_count=len(response.complete_subgoals_by_ids or []),
        )

        return await state.asanitize_update(
            ctx=self.ctx,
            update={
                "agents_thoughts": [agent_thought],
                "structured_decisions": response.decisions,
                "complete_subgoals_by_ids": response.complete_subgoals_by_ids,
                "latest_ui_hierarchy": None,
                "latest_screenshot": None,
                "focused_app_info": None,
                "device_date": None,
                # Executor related fields
                EXECUTOR_MESSAGES_KEY: [RemoveMessage(id=REMOVE_ALL_MESSAGES)],
                "cortex_last_thought": agent_thought,
            },
            agent="cortex",
        )
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def get_executor_agent_feedback(state: State) -> str:
    """Summarise the last Cortex decisions and the executor's tool results.

    Returns the literal string "None." when the state holds no structured
    decisions; otherwise returns the decisions followed by the ``ToolMessage``
    entries found in ``state.executor_messages``.
    """
    decisions = state.structured_decisions
    if decisions is None:
        return "None."

    # Only tool results count as feedback; other executor messages are skipped.
    tool_messages = [msg for msg in state.executor_messages if isinstance(msg, ToolMessage)]
    return "\n\n".join(
        (
            f"Latest UI decisions:\n{decisions}",
            f"Executor feedback:\n{tool_messages}",
        )
    )
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from pydantic import BaseModel, Field
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class CortexOutput(BaseModel):
    # Structured output schema requested from the Cortex LLM. It is passed to
    # `with_structured_output`, so field names and descriptions are part of
    # what the model receives — keep them stable.

    # Stringified JSON of the actions to execute; None when there is nothing to do.
    decisions: str | None = Field(
        default=None, description="The decisions to be made. A stringified JSON object"
    )
    # Explanation accompanying `decisions`.
    decisions_reason: str | None = Field(default=None, description="The reason for the decisions")
    # Explanation accompanying `complete_subgoals_by_ids`.
    goals_completion_reason: str | None = Field(
        default=None,
        description="The reason for the goals completion, if there are any goals to be completed.",
    )
    # Defaults to an empty list when the LLM completes no subgoal.
    complete_subgoals_by_ids: list[str] = Field(
        default_factory=list, description="List of subgoal IDs to complete"
    )
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
## You are the **Executor**
|
|
2
|
+
|
|
3
|
+
Interpret Cortex decisions and execute tools on {{ platform }} mobile device. You are the hands, Cortex is the brain.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## Your Job
|
|
8
|
+
|
|
9
|
+
1. **Parse** structured decisions from Cortex
|
|
10
|
+
2. **Call tools** in the specified order
|
|
11
|
+
3. **Always include `agent_thought`** for each tool - explains WHY (for debugging/tracing)
|
|
12
|
+
|
|
13
|
+
---
|
|
14
|
+
|
|
15
|
+
## Example
|
|
16
|
+
|
|
17
|
+
**Cortex decision:**
|
|
18
|
+
```json
|
|
19
|
+
"[{\"action\": \"tap\", \"target\": {\"resource_id\": \"com.whatsapp:id/chat\", \"text\": \"Alice\", \"bounds\": {\"x\": 100, \"y\": 350, \"width\": 50, \"height\": 50}}}]"
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
**You execute:**
|
|
23
|
+
```
|
|
24
|
+
tap(target={resource_id: "com.whatsapp:id/chat", text: "Alice", ...}, agent_thought: "Tapping Alice's chat to open conversation")
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
---
|
|
28
|
+
|
|
29
|
+
## Tool Notes
|
|
30
|
+
|
|
31
|
+
| Tool | Notes |
|
|
32
|
+
|------|-------|
|
|
33
|
+
| `focus_and_input_text` | Provide full target info. Auto-focuses + moves cursor to end. Special characters such as newlines (use `\n`, not `\\n`) and non-ASCII UTF-8 characters (e.g. `行`) are supported. |
|
|
34
|
+
| `focus_and_clear_text` | Clears entire field. If fails: long press → select all → `erase_one_char` |
|
|
35
|
+
|
|
36
|
+
---
|
|
37
|
+
|
|
38
|
+
## Rules
|
|
39
|
+
|
|
40
|
+
- **Don't reason about strategy** - just execute what Cortex decided
|
|
41
|
+
- **`agent_thought` must be specific** - not generic/vague
|
|
42
|
+
- **Order matters** - tools execute in the order you return them
|