minitap-mobile-use 3.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115) hide show
  1. minitap/mobile_use/__init__.py +0 -0
  2. minitap/mobile_use/agents/contextor/contextor.md +55 -0
  3. minitap/mobile_use/agents/contextor/contextor.py +175 -0
  4. minitap/mobile_use/agents/contextor/types.py +36 -0
  5. minitap/mobile_use/agents/cortex/cortex.md +135 -0
  6. minitap/mobile_use/agents/cortex/cortex.py +152 -0
  7. minitap/mobile_use/agents/cortex/types.py +15 -0
  8. minitap/mobile_use/agents/executor/executor.md +42 -0
  9. minitap/mobile_use/agents/executor/executor.py +87 -0
  10. minitap/mobile_use/agents/executor/tool_node.py +152 -0
  11. minitap/mobile_use/agents/hopper/hopper.md +15 -0
  12. minitap/mobile_use/agents/hopper/hopper.py +44 -0
  13. minitap/mobile_use/agents/orchestrator/human.md +12 -0
  14. minitap/mobile_use/agents/orchestrator/orchestrator.md +21 -0
  15. minitap/mobile_use/agents/orchestrator/orchestrator.py +134 -0
  16. minitap/mobile_use/agents/orchestrator/types.py +11 -0
  17. minitap/mobile_use/agents/outputter/human.md +25 -0
  18. minitap/mobile_use/agents/outputter/outputter.py +85 -0
  19. minitap/mobile_use/agents/outputter/test_outputter.py +167 -0
  20. minitap/mobile_use/agents/planner/human.md +14 -0
  21. minitap/mobile_use/agents/planner/planner.md +126 -0
  22. minitap/mobile_use/agents/planner/planner.py +101 -0
  23. minitap/mobile_use/agents/planner/types.py +51 -0
  24. minitap/mobile_use/agents/planner/utils.py +70 -0
  25. minitap/mobile_use/agents/summarizer/summarizer.py +35 -0
  26. minitap/mobile_use/agents/video_analyzer/__init__.py +5 -0
  27. minitap/mobile_use/agents/video_analyzer/human.md +5 -0
  28. minitap/mobile_use/agents/video_analyzer/video_analyzer.md +37 -0
  29. minitap/mobile_use/agents/video_analyzer/video_analyzer.py +111 -0
  30. minitap/mobile_use/clients/browserstack_client.py +477 -0
  31. minitap/mobile_use/clients/idb_client.py +429 -0
  32. minitap/mobile_use/clients/ios_client.py +332 -0
  33. minitap/mobile_use/clients/ios_client_config.py +141 -0
  34. minitap/mobile_use/clients/ui_automator_client.py +330 -0
  35. minitap/mobile_use/clients/wda_client.py +526 -0
  36. minitap/mobile_use/clients/wda_lifecycle.py +367 -0
  37. minitap/mobile_use/config.py +413 -0
  38. minitap/mobile_use/constants.py +3 -0
  39. minitap/mobile_use/context.py +106 -0
  40. minitap/mobile_use/controllers/__init__.py +0 -0
  41. minitap/mobile_use/controllers/android_controller.py +524 -0
  42. minitap/mobile_use/controllers/controller_factory.py +46 -0
  43. minitap/mobile_use/controllers/device_controller.py +182 -0
  44. minitap/mobile_use/controllers/ios_controller.py +436 -0
  45. minitap/mobile_use/controllers/platform_specific_commands_controller.py +199 -0
  46. minitap/mobile_use/controllers/types.py +106 -0
  47. minitap/mobile_use/controllers/unified_controller.py +193 -0
  48. minitap/mobile_use/graph/graph.py +160 -0
  49. minitap/mobile_use/graph/state.py +115 -0
  50. minitap/mobile_use/main.py +309 -0
  51. minitap/mobile_use/sdk/__init__.py +12 -0
  52. minitap/mobile_use/sdk/agent.py +1294 -0
  53. minitap/mobile_use/sdk/builders/__init__.py +10 -0
  54. minitap/mobile_use/sdk/builders/agent_config_builder.py +307 -0
  55. minitap/mobile_use/sdk/builders/index.py +15 -0
  56. minitap/mobile_use/sdk/builders/task_request_builder.py +236 -0
  57. minitap/mobile_use/sdk/constants.py +1 -0
  58. minitap/mobile_use/sdk/examples/README.md +83 -0
  59. minitap/mobile_use/sdk/examples/__init__.py +1 -0
  60. minitap/mobile_use/sdk/examples/app_lock_messaging.py +54 -0
  61. minitap/mobile_use/sdk/examples/platform_manual_task_example.py +67 -0
  62. minitap/mobile_use/sdk/examples/platform_minimal_example.py +48 -0
  63. minitap/mobile_use/sdk/examples/simple_photo_organizer.py +76 -0
  64. minitap/mobile_use/sdk/examples/smart_notification_assistant.py +225 -0
  65. minitap/mobile_use/sdk/examples/video_transcription_example.py +117 -0
  66. minitap/mobile_use/sdk/services/cloud_mobile.py +656 -0
  67. minitap/mobile_use/sdk/services/platform.py +434 -0
  68. minitap/mobile_use/sdk/types/__init__.py +51 -0
  69. minitap/mobile_use/sdk/types/agent.py +84 -0
  70. minitap/mobile_use/sdk/types/exceptions.py +138 -0
  71. minitap/mobile_use/sdk/types/platform.py +183 -0
  72. minitap/mobile_use/sdk/types/task.py +269 -0
  73. minitap/mobile_use/sdk/utils.py +29 -0
  74. minitap/mobile_use/services/accessibility.py +100 -0
  75. minitap/mobile_use/services/llm.py +247 -0
  76. minitap/mobile_use/services/telemetry.py +421 -0
  77. minitap/mobile_use/tools/index.py +67 -0
  78. minitap/mobile_use/tools/mobile/back.py +52 -0
  79. minitap/mobile_use/tools/mobile/erase_one_char.py +56 -0
  80. minitap/mobile_use/tools/mobile/focus_and_clear_text.py +317 -0
  81. minitap/mobile_use/tools/mobile/focus_and_input_text.py +153 -0
  82. minitap/mobile_use/tools/mobile/launch_app.py +86 -0
  83. minitap/mobile_use/tools/mobile/long_press_on.py +169 -0
  84. minitap/mobile_use/tools/mobile/open_link.py +62 -0
  85. minitap/mobile_use/tools/mobile/press_key.py +83 -0
  86. minitap/mobile_use/tools/mobile/stop_app.py +62 -0
  87. minitap/mobile_use/tools/mobile/swipe.py +156 -0
  88. minitap/mobile_use/tools/mobile/tap.py +154 -0
  89. minitap/mobile_use/tools/mobile/video_recording.py +177 -0
  90. minitap/mobile_use/tools/mobile/wait_for_delay.py +81 -0
  91. minitap/mobile_use/tools/scratchpad.py +147 -0
  92. minitap/mobile_use/tools/test_utils.py +413 -0
  93. minitap/mobile_use/tools/tool_wrapper.py +16 -0
  94. minitap/mobile_use/tools/types.py +35 -0
  95. minitap/mobile_use/tools/utils.py +336 -0
  96. minitap/mobile_use/utils/app_launch_utils.py +173 -0
  97. minitap/mobile_use/utils/cli_helpers.py +37 -0
  98. minitap/mobile_use/utils/cli_selection.py +143 -0
  99. minitap/mobile_use/utils/conversations.py +31 -0
  100. minitap/mobile_use/utils/decorators.py +124 -0
  101. minitap/mobile_use/utils/errors.py +6 -0
  102. minitap/mobile_use/utils/file.py +13 -0
  103. minitap/mobile_use/utils/logger.py +183 -0
  104. minitap/mobile_use/utils/media.py +186 -0
  105. minitap/mobile_use/utils/recorder.py +52 -0
  106. minitap/mobile_use/utils/requests_utils.py +37 -0
  107. minitap/mobile_use/utils/shell_utils.py +20 -0
  108. minitap/mobile_use/utils/test_ui_hierarchy.py +178 -0
  109. minitap/mobile_use/utils/time.py +6 -0
  110. minitap/mobile_use/utils/ui_hierarchy.py +132 -0
  111. minitap/mobile_use/utils/video.py +281 -0
  112. minitap_mobile_use-3.3.0.dist-info/METADATA +329 -0
  113. minitap_mobile_use-3.3.0.dist-info/RECORD +115 -0
  114. minitap_mobile_use-3.3.0.dist-info/WHEEL +4 -0
  115. minitap_mobile_use-3.3.0.dist-info/entry_points.txt +3 -0
File without changes
@@ -0,0 +1,55 @@
1
+ ## You are the **Contextor Agent**
2
+
3
+ Verify app lock compliance. Decide: **relaunch locked app** or **allow deviation**.
4
+
5
+ ---
6
+
7
+ ## Context
8
+
9
+ - **Locked app:** `{{ locked_app_package }}`
10
+ - **Current app:** `{{ current_app_package }}` ← differs from the locked app
11
+
12
+ **Default: RELAUNCH.** Only allow deviation with clear justification.
13
+
14
+ ---
15
+
16
+ ## Allow Deviation ONLY If
17
+
18
+ All of the following conditions must be met:
19
+ 1. **Intentional** - Agent thoughts show explicit plan to use current app
20
+ 2. **Necessary** - Current app required for task (not just convenient)
21
+ 3. **Valid pattern**: OAuth/login flow, payment, system permissions, SMS/email verification, deep link
22
+
23
+ ## Relaunch If ANY True
24
+
25
+ - Current app unrelated to task
26
+ - Deviation looks accidental (no intent in agent thoughts)
27
+ - Current app cannot help complete task
28
+ - When in doubt → **RELAUNCH**
29
+
30
+ ---
31
+
32
+ ## Output
33
+
34
+ ```json
35
+ {
36
+ "should_relaunch_app": true/false,
37
+ "reasoning": "2-4 sentences explaining decision"
38
+ }
39
+ ```
40
+
41
+ ---
42
+
43
+ ## Input
44
+
45
+ **Task Goal:** {{ task_goal }}
46
+
47
+ **Subgoal Plan:** {{ subgoal_plan }}
48
+
49
+ **Locked App:** {{ locked_app_package }}
50
+
51
+ **Current App:** {{ current_app_package }}
52
+
53
+ **Agent Thoughts:**
54
+ {% for thought in agents_thoughts %}- {{ thought }}
55
+ {% endfor %}
@@ -0,0 +1,175 @@
1
+ from pathlib import Path
2
+
3
+ from jinja2 import Template
4
+ from langchain_core.messages import HumanMessage, SystemMessage
5
+
6
+ from minitap.mobile_use.agents.contextor.types import AppLockVerificationOutput, ContextorOutput
7
+ from minitap.mobile_use.agents.planner.types import Subgoal
8
+ from minitap.mobile_use.context import MobileUseContext
9
+ from minitap.mobile_use.controllers.controller_factory import create_device_controller
10
+ from minitap.mobile_use.controllers.platform_specific_commands_controller import (
11
+ get_current_foreground_package,
12
+ get_device_date,
13
+ )
14
+ from minitap.mobile_use.graph.state import State
15
+ from minitap.mobile_use.services.llm import get_llm, invoke_llm_with_timeout_message, with_fallback
16
+ from minitap.mobile_use.utils.app_launch_utils import launch_app_with_retries
17
+ from minitap.mobile_use.utils.decorators import wrap_with_callbacks
18
+ from minitap.mobile_use.utils.logger import get_logger
19
+
20
+ logger = get_logger(__name__)
21
+
22
+
23
class ContextorNode:
    """Graph node that snapshots device state and enforces the optional app lock.

    Each call captures the current screen data, the foreground package and the
    device date. When an app lock is configured and its initial launch
    succeeded, it also checks that the locked app is still in the foreground
    and, if not, asks the contextor LLM whether to relaunch it.
    """

    def __init__(self, ctx: MobileUseContext):
        self.ctx = ctx

    @wrap_with_callbacks(
        before=lambda: logger.info("Starting Contextor Agent"),
        on_success=lambda _: logger.success("Contextor Agent"),
        on_failure=lambda _: logger.error("Contextor Agent"),
    )
    async def __call__(self, state: State):
        """Collect fresh device context and run the app lock check when it applies.

        Args:
            state: Current graph state; read for the goal, plan and agent thoughts.

        Returns:
            The result of ``state.asanitize_update`` carrying the refreshed UI
            hierarchy, screenshot, focused app, screen size, device date and the
            app lock outcome message.
        """
        device_controller = create_device_controller(self.ctx)
        device_data = await device_controller.get_screen_data()
        current_app_package = get_current_foreground_package(self.ctx)
        device_date = get_device_date(self.ctx)
        agent_outcome: str | None = None

        execution_setup = self.ctx.execution_setup
        if execution_setup and execution_setup.app_lock_status:
            app_lock_status = execution_setup.app_lock_status
            locked_app_package = app_lock_status.locked_app_package

            if not app_lock_status.locked_app_initial_launch_success:
                # The lock never took effect, so there is nothing to enforce.
                logger.warning(
                    f"App lock feature is setup for {locked_app_package}, "
                    "but initial launch was not successful, skipping"
                )
            elif not current_app_package:
                logger.warning(
                    f"App lock feature is setup for {locked_app_package}, "
                    "but could not determine current app, skipping"
                )
            else:
                # Best effort: a failed verification must not abort the whole turn.
                try:
                    verification: AppLockVerificationOutput = (
                        await self._handle_app_lock_verification(
                            state=state,
                            current_app_package=current_app_package,
                            locked_app_package=locked_app_package,
                        )
                    )
                    agent_outcome = verification.to_optional_message()
                except Exception as e:
                    logger.error(f"Failed to verify app lock: {e}")

        # NOTE(review): agent_outcome may still be None here, so the update can
        # carry [None]; presumably asanitize_update tolerates that - confirm.
        return await state.asanitize_update(
            ctx=self.ctx,
            update={
                "latest_ui_hierarchy": device_data.elements,
                "latest_screenshot": device_data.base64,
                "focused_app_info": current_app_package,
                "screen_size": (device_data.width, device_data.height),
                "device_date": device_date,
                "agents_thoughts": [agent_outcome],
            },
            agent="contextor",
        )

    async def _handle_app_lock_verification(
        self, state: State, current_app_package: str, locked_app_package: str
    ) -> AppLockVerificationOutput:
        """Verify app lock compliance and decide whether to relaunch the locked app.

        Args:
            state: Current graph state, used to give the LLM the goal, plan and thoughts.
            current_app_package: Package currently in the foreground.
            locked_app_package: Package the session is locked to. It is only used
                for the "not setup" error result; otherwise the value from the
                context's app lock status takes precedence.

        Returns:
            An ``AppLockVerificationOutput`` whose status is one of
            ``already_in_foreground``, ``relaunched``, ``allowed_deviation`` or ``error``.
        """
        if not self.ctx.execution_setup or not self.ctx.execution_setup.app_lock_status:
            return AppLockVerificationOutput(
                package_name=locked_app_package,
                reasoning="App lock feature is not setup",
                status="error",
            )

        app_lock_status = self.ctx.execution_setup.app_lock_status
        locked_app_package = app_lock_status.locked_app_package

        if current_app_package == locked_app_package:
            logger.info(f"App lock verified: current app matches locked app ({locked_app_package})")
            return AppLockVerificationOutput(
                package_name=locked_app_package,
                status="already_in_foreground",
            )

        logger.warning(
            f"App lock violation detected: expected '{locked_app_package}', "
            f"but current app is '{current_app_package}'"
        )

        decision: ContextorOutput = await self._invoke_contextor_llm(
            initial_goal=state.initial_goal,
            subgoal_plan=state.subgoal_plan,
            agents_thoughts=state.agents_thoughts,
            locked_app_package=locked_app_package,
            current_app_package=current_app_package,
        )

        if not decision.should_relaunch_app:
            logger.info(f"Allowing app deviation to: {current_app_package}")
            return AppLockVerificationOutput(
                package_name=locked_app_package,
                reasoning=decision.reasoning,
                status="allowed_deviation",
            )

        logger.info(f"Relaunching locked app: {locked_app_package}")
        success, error = await launch_app_with_retries(self.ctx, app_package=locked_app_package)
        if not success:
            logger.error(f"Failed to relaunch {locked_app_package}: {error}")
            return AppLockVerificationOutput(
                package_name=locked_app_package,
                reasoning=f"Failed to relaunch app: {error}",
                status="error",
            )
        return AppLockVerificationOutput(
            package_name=locked_app_package,
            reasoning=decision.reasoning,
            status="relaunched",
        )

    async def _invoke_contextor_llm(
        self,
        initial_goal: str,
        subgoal_plan: list[Subgoal],
        agents_thoughts: list[str],
        locked_app_package: str,
        current_app_package: str,
    ) -> ContextorOutput:
        """Invoke the LLM to decide whether to relaunch the locked app.

        The system prompt is rendered from ``contextor.md`` next to this module.
        The main model is tried first, then the fallback model.
        """

        MAX_AGENTS_THOUGHTS = 25

        system_message = Template(
            Path(__file__).parent.joinpath("contextor.md").read_text(encoding="utf-8")
        ).render(
            task_goal=initial_goal,
            subgoal_plan="\n".join([str(subgoal) for subgoal in subgoal_plan]),
            locked_app_package=locked_app_package,
            current_app_package=current_app_package,
            # Keep the most recent thoughts: whether the deviation was intentional
            # can only be judged from what the agents decided just before it.
            agents_thoughts=agents_thoughts[-MAX_AGENTS_THOUGHTS:],
        )

        messages = [
            SystemMessage(content=system_message),
            HumanMessage(content="Please make your decision."),
        ]

        llm = get_llm(ctx=self.ctx, name="contextor").with_structured_output(ContextorOutput)
        llm_fallback = get_llm(
            ctx=self.ctx, name="contextor", use_fallback=True
        ).with_structured_output(ContextorOutput)

        response: ContextorOutput = await with_fallback(
            main_call=lambda: invoke_llm_with_timeout_message(llm.ainvoke(messages)),
            fallback_call=lambda: invoke_llm_with_timeout_message(llm_fallback.ainvoke(messages)),
        )  # type: ignore

        return response
@@ -0,0 +1,36 @@
1
+ from typing import Literal
2
+
3
+ from pydantic import BaseModel, Field
4
+
5
+
6
class ContextorOutput(BaseModel):
    """Output schema for the Contextor agent decision."""

    # The docstring and field descriptions are part of the schema handed to the
    # LLM via with_structured_output, so their wording is kept unchanged.

    # True when the locked app must be brought back to the foreground.
    should_relaunch_app: bool = Field(
        ...,
        description="Whether to relaunch the locked app",
    )
    # Justification for the decision above.
    reasoning: str = Field(
        ...,
        description="Explanation of why we should or should not relaunch the app",
    )
13
+
14
+
15
class AppLockVerificationOutput(BaseModel):
    # Result of one app lock check: which package was checked, what happened,
    # and (optionally) why.
    package_name: str = Field(
        ...,
        description="Package name of the app that was verified",
    )
    reasoning: str | None = Field(
        default=None,
        description="Reasoning for the decision",
    )
    status: Literal["already_in_foreground", "relaunched", "allowed_deviation", "error"] = Field(
        ...,
        description="Status of the decision",
    )

    def to_optional_message(self) -> str | None:
        """Render the outcome as a short message, or ``None`` when nothing happened.

        Returns:
            ``None`` if the locked app was already in the foreground, a fixed
            failure sentence for ``error``, and otherwise "App <package>"
            (prefixed by the reasoning when present) followed by the outcome.
        """
        if self.status == "already_in_foreground":
            return None
        if self.status == "error":
            # The reasoning is deliberately not included in the error message.
            return f"Could not verify app lock for {self.package_name}."

        subject = f"App {self.package_name}"
        if self.reasoning:
            subject = f"{self.reasoning} {subject}"

        outcome_suffixes = {
            "relaunched": " was successfully relaunched ✅",
            "allowed_deviation": " was allowed deviation ⚠️",
        }
        return subject + outcome_suffixes.get(self.status, "")
@@ -0,0 +1,135 @@
1
+ ## You are the **Cortex**
2
+
3
+ You analyze the {{ platform }} mobile device state and produce structured decisions to achieve subgoals. You are the brain giving instructions to the Executor (your hands).
4
+
5
+ ---
6
+
7
+ ## 🚨 CRITICAL RULES (Read First)
8
+
9
+ ### 1. Analyze Agent Thoughts Before Acting
10
+ Before ANY decision, review agent thoughts history to:
11
+ - Detect **repeated failures** → change strategy, don't retry blindly
12
+ - Spot **contradictions** between plan and reality
13
+ - Learn from what worked/failed
14
+
15
+ ### 2. Never Repeat Failed Actions
16
+ If something failed, understand WHY before trying again. Ask: "How would a human solve this differently?"
17
+
18
+ ### 3. Unpredictable Actions = Isolate Them
19
+ These actions change the screen unpredictably: `back`, `launch_app`, `stop_app`, `open_link`, navigation taps.
20
+ **Rule:** If your decision includes one of these, it MUST be the ONLY action in that turn. Wait to see the new screen before deciding next steps.
21
+
22
+ ### 4. Complete Goals Only on OBSERVED Evidence
23
+ Never mark a goal complete "in advance". Only complete based on executor feedback confirming success.
24
+
25
+ ### 5. Data Fidelity Over "Helpfulness"
26
+ For any data-related task: transcribe content **exactly as-is** unless explicitly told otherwise.
27
+
28
+ ---
29
+
30
+ ## 📱 Perception
31
+
32
+ You have 2 senses:
33
+
34
+ | Sense | Use For | Limitation |
35
+ |-------|---------|------------|
36
+ | **UI Hierarchy** | Find elements by resource-id, text, bounds | No visual info (colors, images, obscured elements) |
37
+ | **Screenshot** | Visual context, verify elements are visible, visual cues (badges, colors, icons) | Can't reliably extract precise element coordinates from pixels |
38
+
39
+ You must combine your 2 senses to cancel out the limitations of each.
40
+
41
+ ---
42
+
43
+ ## 🎯 Element Targeting (MANDATORY)
44
+
45
+ When targeting ANY element (tap, input, clear...), provide ALL available info:
46
+
47
+ ```json
48
+ {
49
+ "target": {
50
+ "resource_id": "com.app:id/button",
51
+ "resource_id_index": 0,
52
+ "bounds": {"x": 100, "y": 200, "width": 50, "height": 50},
53
+ "text": "Submit",
54
+ "text_index": 0
55
+ }
56
+ }
57
+ ```
58
+
59
+ - `resource_id_index` = index among elements with same resource_id
60
+ - `text_index` = index among elements with same text
61
+ - This enables **fallback**: if ID fails → tries bounds → tries text
62
+
63
+ **On tap failure:** "Out of bounds" = stale bounds. "No element found" = screen changed. Adapt, don't retry blindly.
64
+
65
+ ---
66
+
67
+ ## 🔧 Tools & Actions
68
+
69
+ Available tools: {{ executor_tools_list }}
70
+
71
+ | Action | Tool | Notes |
72
+ |--------|------|-------|
73
+ | **Open app** | `launch_app` | **ALWAYS use first** with app name (e.g., "WhatsApp"). Only try app drawer manually if launch_app fails. |
74
+ | Open URL | `open_link` | Handles deep links correctly |
75
+ | Type text | `focus_and_input_text` | Focuses + types. Verify if feedback shows empty. To create a blank line between paragraphs, use \n\n. |
76
+ | Clear text | `focus_and_clear_text` | If fails, try: long press → select all → `erase_one_char` |
77
+
78
+ ### Swipe Physics
79
+ Swipe direction "pushes" the screen: **swipe RIGHT → reveals LEFT page** (and vice versa).
80
+ Default to **percentage-based** swipes. Use coordinates only for precise controls (sliders).
81
+ Memory aid: Swipe RIGHT (low→high x) to see LEFT page. Swipe LEFT (high→low x) to see RIGHT page.
82
+
83
+ ### Form Filling
84
+ Before concluding a field is missing, **scroll through the entire form** to verify all fields. If you observed a field earlier but can't find it now, scroll back - don't assume it's gone.
85
+ **Rule:** Never input data into the wrong field if the correct field was previously observed.
86
+
87
+ {% if locked_app_package %}
88
+ ---
89
+
90
+ ## 🔒 App Lock Mode
91
+
92
+ Session locked to: **{{ locked_app_package }}**
93
+ - Stay within this app
94
+ - Avoid navigating away unless necessary (e.g., OAuth)
95
+ - Contextor agent will relaunch if you leave accidentally
96
+ {% endif %}
97
+
98
+ ---
99
+
100
+ ## 📤 Output Format
101
+
102
+ | Field | Required | Description |
103
+ |-------|----------|-------------|
104
+ | **complete_subgoals_by_ids** | Optional | IDs of subgoals to mark complete (based on OBSERVED evidence) |
105
+ | **Structured Decisions** | Optional | Valid JSON string of actions to execute |
106
+ | **Decisions Reason** | Required | 2-4 sentences: analyze agent thoughts → explain decision → note strategy changes |
107
+ | **Goals Completion Reason** | Required | Why completing these goals, or "None" |
108
+
109
+ ---
110
+
111
+ ## 📝 Example
112
+
113
+ **Subgoal:** "Send 'Hello!' to Alice on WhatsApp"
114
+
115
+ **Context:** Agent thoughts show previous turn typed "Hello!" successfully. UI shows message in field + send button visible.
116
+
117
+ **Output:**
118
+ ```
119
+ complete_subgoals_by_ids: ["subgoal-4-type-message"]
120
+ Structured Decisions: "[{\"action\": \"tap\", \"target\": {\"resource_id\": \"com.whatsapp:id/send\", \"resource_id_index\": 0, \"bounds\": {\"x\": 950, \"y\": 1800, \"width\": 100, \"height\": 100}}}]"
121
+ Decisions Reason: Agent thoughts confirm typing succeeded. Completing typing subgoal based on observed evidence. Now tapping send with full target info.
122
+ Goals Completion Reason: Executor feedback confirmed "Hello!" was entered successfully.
123
+ ```
124
+
125
+ ---
126
+
127
+ ## Input
128
+
129
+ **Initial Goal:** {{ initial_goal }}
130
+
131
+ **Subgoal Plan:** {{ subgoal_plan }}
132
+
133
+ **Current Subgoal:** {{ current_subgoal }}
134
+
135
+ **Executor Feedback:** {{ executor_feedback }}
@@ -0,0 +1,152 @@
1
+ import json
2
+ from pathlib import Path
3
+
4
+ from jinja2 import Template
5
+ from langchain_core.messages import (
6
+ AIMessage,
7
+ HumanMessage,
8
+ RemoveMessage,
9
+ SystemMessage,
10
+ ToolMessage,
11
+ )
12
+ from langgraph.graph.message import REMOVE_ALL_MESSAGES
13
+
14
+ from minitap.mobile_use.agents.cortex.types import CortexOutput
15
+ from minitap.mobile_use.agents.planner.utils import get_current_subgoal
16
+ from minitap.mobile_use.constants import EXECUTOR_MESSAGES_KEY
17
+ from minitap.mobile_use.context import MobileUseContext
18
+ from minitap.mobile_use.controllers.controller_factory import create_device_controller
19
+ from minitap.mobile_use.graph.state import State
20
+ from minitap.mobile_use.services.llm import get_llm, invoke_llm_with_timeout_message, with_fallback
21
+ from minitap.mobile_use.services.telemetry import telemetry
22
+ from minitap.mobile_use.tools.index import (
23
+ EXECUTOR_WRAPPERS_TOOLS,
24
+ VIDEO_RECORDING_WRAPPERS,
25
+ format_tools_list,
26
+ )
27
+ from minitap.mobile_use.utils.conversations import get_screenshot_message_for_llm
28
+ from minitap.mobile_use.utils.decorators import wrap_with_callbacks
29
+ from minitap.mobile_use.utils.logger import get_logger
30
+
31
+ logger = get_logger(__name__)
32
+
33
+
34
class CortexNode:
    """Graph node that asks the Cortex LLM for the next structured decisions.

    It renders the ``cortex.md`` system prompt, attaches the device info, the
    agents' thoughts, the latest UI hierarchy and screenshot, and turns the
    structured LLM answer into a state update.
    """

    def __init__(self, ctx: MobileUseContext):
        self.ctx = ctx

    @wrap_with_callbacks(
        before=lambda: logger.info("Starting Cortex Agent..."),
        on_success=lambda _: logger.success("Cortex Agent"),
        on_failure=lambda _: logger.error("Cortex Agent"),
    )
    async def __call__(self, state: State):
        """Run one Cortex turn.

        Args:
            state: Current graph state (goal, plan, thoughts, latest screen data).

        Returns:
            The result of ``state.asanitize_update`` carrying the new decisions
            and thought, the subgoals to complete, cleared screen-related fields
            and a removal of all executor messages.
        """
        executor_feedback = get_executor_agent_feedback(state)

        current_locked_app_package = (
            self.ctx.execution_setup.get_locked_app_package() if self.ctx.execution_setup else None
        )

        executor_wrappers = list(EXECUTOR_WRAPPERS_TOOLS)
        if self.ctx.video_recording_enabled:
            executor_wrappers.extend(VIDEO_RECORDING_WRAPPERS)

        system_message = Template(
            Path(__file__).parent.joinpath("cortex.md").read_text(encoding="utf-8")
        ).render(
            platform=self.ctx.device.mobile_platform.value,
            initial_goal=state.initial_goal,
            subgoal_plan=state.subgoal_plan,
            current_subgoal=get_current_subgoal(state.subgoal_plan),
            executor_feedback=executor_feedback,
            executor_tools_list=format_tools_list(ctx=self.ctx, wrappers=executor_wrappers),
            locked_app_package=current_locked_app_package,
        )

        # Each optional line is guarded on its own. The previous unparenthesised
        # chain of conditional expressions dropped the header and device info
        # when there was no device date, and dropped the focused app info when
        # there was one.
        device_info = "Here are my device info:\n" + self.ctx.device.to_str()
        if state.device_date:
            device_info += f"Device date: {state.device_date}\n"
        if state.focused_app_info:
            device_info += f"Focused app info: {state.focused_app_info}\n"

        messages = [
            SystemMessage(content=system_message),
            HumanMessage(content=device_info),
        ]
        for thought in state.agents_thoughts:
            messages.append(AIMessage(content=thought))

        if state.latest_ui_hierarchy:
            ui_hierarchy_dict: list[dict] = state.latest_ui_hierarchy
            ui_hierarchy_str = json.dumps(ui_hierarchy_dict, indent=2, ensure_ascii=False)
            messages.append(HumanMessage(content="Here is the UI hierarchy:\n" + ui_hierarchy_str))

        if state.latest_screenshot:
            controller = create_device_controller(self.ctx)
            compressed_image_base64 = controller.get_compressed_b64_screenshot(
                state.latest_screenshot
            )
            messages.append(get_screenshot_message_for_llm(compressed_image_base64))

        llm = get_llm(ctx=self.ctx, name="cortex", temperature=1).with_structured_output(
            CortexOutput
        )
        llm_fallback = get_llm(
            ctx=self.ctx, name="cortex", use_fallback=True, temperature=1
        ).with_structured_output(CortexOutput)
        response: CortexOutput = await with_fallback(
            main_call=lambda: invoke_llm_with_timeout_message(llm.ainvoke(messages)),
            fallback_call=lambda: invoke_llm_with_timeout_message(llm_fallback.ainvoke(messages)),
        )  # type: ignore

        # The model sometimes spells "nothing" as a literal token; normalise to None.
        EMPTY_STRING_TOKENS = ["{}", "[]", "null", "", "None"]

        if response.decisions in EMPTY_STRING_TOKENS:
            response.decisions = None
        if response.goals_completion_reason in EMPTY_STRING_TOKENS:
            response.goals_completion_reason = None

        thought_parts = []
        if response.decisions_reason:
            thought_parts.append(f"Decisions reason: {response.decisions_reason}")
        if response.goals_completion_reason:
            thought_parts.append(f"Goals completion reason: {response.goals_completion_reason}")

        agent_thought = "\n\n".join(thought_parts)

        # Capture cortex decision telemetry (only non-sensitive flags)
        telemetry.capture_cortex_decision(
            task_id=self.ctx.trace_id,
            has_decisions=response.decisions is not None,
            has_goals_completion=response.goals_completion_reason is not None,
            completed_subgoals_count=len(response.complete_subgoals_by_ids or []),
        )

        return await state.asanitize_update(
            ctx=self.ctx,
            update={
                "agents_thoughts": [agent_thought],
                "structured_decisions": response.decisions,
                "complete_subgoals_by_ids": response.complete_subgoals_by_ids,
                # Screen-related fields are reset once the Cortex has consumed them.
                "latest_ui_hierarchy": None,
                "latest_screenshot": None,
                "focused_app_info": None,
                "device_date": None,
                # Executor related fields
                EXECUTOR_MESSAGES_KEY: [RemoveMessage(id=REMOVE_ALL_MESSAGES)],
                "cortex_last_thought": agent_thought,
            },
            agent="cortex",
        )
142
+
143
+
144
def get_executor_agent_feedback(state: State) -> str:
    """Summarise the last decisions and the executor's tool results for the Cortex prompt.

    Returns ``"None."`` when no structured decisions are recorded on the state.
    Otherwise returns the decisions followed by the ``ToolMessage`` entries
    found in ``state.executor_messages``.
    """
    decisions = state.structured_decisions
    if decisions is None:
        return "None."

    # Only tool results count as feedback; other executor messages are ignored.
    tool_feedback = [msg for msg in state.executor_messages if isinstance(msg, ToolMessage)]
    sections = (
        f"Latest UI decisions:\n{decisions}",
        f"Executor feedback:\n{tool_feedback}",
    )
    return "\n\n".join(sections)
@@ -0,0 +1,15 @@
1
+ from pydantic import BaseModel, Field
2
+
3
+
4
class CortexOutput(BaseModel):
    # Structured answer expected from the Cortex LLM. No class docstring is
    # added on purpose: it would become part of the schema description handed
    # to the model through with_structured_output.

    # Actions for the Executor, serialised as a JSON string.
    decisions: str | None = Field(
        default=None,
        description="The decisions to be made. A stringified JSON object",
    )
    # Why those decisions were taken.
    decisions_reason: str | None = Field(
        default=None,
        description="The reason for the decisions",
    )
    # Why the listed subgoals are considered done.
    goals_completion_reason: str | None = Field(
        default=None,
        description="The reason for the goals completion, if there are any goals to be completed.",
    )
    # IDs of the subgoals to mark as complete; empty when none.
    complete_subgoals_by_ids: list[str] = Field(
        default_factory=list,
        description="List of subgoal IDs to complete",
    )
@@ -0,0 +1,42 @@
1
+ ## You are the **Executor**
2
+
3
+ Interpret Cortex decisions and execute tools on {{ platform }} mobile device. You are the hands, Cortex is the brain.
4
+
5
+ ---
6
+
7
+ ## Your Job
8
+
9
+ 1. **Parse** structured decisions from Cortex
10
+ 2. **Call tools** in the specified order
11
+ 3. **Always include `agent_thought`** for each tool - explains WHY (for debugging/tracing)
12
+
13
+ ---
14
+
15
+ ## Example
16
+
17
+ **Cortex decision:**
18
+ ```json
19
+ "[{\"action\": \"tap\", \"target\": {\"resource_id\": \"com.whatsapp:id/chat\", \"text\": \"Alice\", \"bounds\": {\"x\": 100, \"y\": 350, \"width\": 50, \"height\": 50}}}]"
20
+ ```
21
+
22
+ **You execute:**
23
+ ```
24
+ tap(target={resource_id: "com.whatsapp:id/chat", text: "Alice", ...}, agent_thought: "Tapping Alice's chat to open conversation")
25
+ ```
26
+
27
+ ---
28
+
29
+ ## Tool Notes
30
+
31
+ | Tool | Notes |
32
+ |------|-------|
33
+ | `focus_and_input_text` | Provide full target info. Auto-focuses + moves cursor to end. Special characters are supported, including newlines (use `\n`, not `\\n`) and non-ASCII UTF-8 characters such as `行`. |
34
+ | `focus_and_clear_text` | Clears entire field. If fails: long press → select all → `erase_one_char` |
35
+
36
+ ---
37
+
38
+ ## Rules
39
+
40
+ - **Don't reason about strategy** - just execute what Cortex decided
41
+ - **`agent_thought` must be specific** - not generic/vague
42
+ - **Order matters** - tools execute in the order you return them