minitap-mobile-use 3.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- minitap/mobile_use/__init__.py +0 -0
- minitap/mobile_use/agents/contextor/contextor.md +55 -0
- minitap/mobile_use/agents/contextor/contextor.py +175 -0
- minitap/mobile_use/agents/contextor/types.py +36 -0
- minitap/mobile_use/agents/cortex/cortex.md +135 -0
- minitap/mobile_use/agents/cortex/cortex.py +152 -0
- minitap/mobile_use/agents/cortex/types.py +15 -0
- minitap/mobile_use/agents/executor/executor.md +42 -0
- minitap/mobile_use/agents/executor/executor.py +87 -0
- minitap/mobile_use/agents/executor/tool_node.py +152 -0
- minitap/mobile_use/agents/hopper/hopper.md +15 -0
- minitap/mobile_use/agents/hopper/hopper.py +44 -0
- minitap/mobile_use/agents/orchestrator/human.md +12 -0
- minitap/mobile_use/agents/orchestrator/orchestrator.md +21 -0
- minitap/mobile_use/agents/orchestrator/orchestrator.py +134 -0
- minitap/mobile_use/agents/orchestrator/types.py +11 -0
- minitap/mobile_use/agents/outputter/human.md +25 -0
- minitap/mobile_use/agents/outputter/outputter.py +85 -0
- minitap/mobile_use/agents/outputter/test_outputter.py +167 -0
- minitap/mobile_use/agents/planner/human.md +14 -0
- minitap/mobile_use/agents/planner/planner.md +126 -0
- minitap/mobile_use/agents/planner/planner.py +101 -0
- minitap/mobile_use/agents/planner/types.py +51 -0
- minitap/mobile_use/agents/planner/utils.py +70 -0
- minitap/mobile_use/agents/summarizer/summarizer.py +35 -0
- minitap/mobile_use/agents/video_analyzer/__init__.py +5 -0
- minitap/mobile_use/agents/video_analyzer/human.md +5 -0
- minitap/mobile_use/agents/video_analyzer/video_analyzer.md +37 -0
- minitap/mobile_use/agents/video_analyzer/video_analyzer.py +111 -0
- minitap/mobile_use/clients/browserstack_client.py +477 -0
- minitap/mobile_use/clients/idb_client.py +429 -0
- minitap/mobile_use/clients/ios_client.py +332 -0
- minitap/mobile_use/clients/ios_client_config.py +141 -0
- minitap/mobile_use/clients/ui_automator_client.py +330 -0
- minitap/mobile_use/clients/wda_client.py +526 -0
- minitap/mobile_use/clients/wda_lifecycle.py +367 -0
- minitap/mobile_use/config.py +413 -0
- minitap/mobile_use/constants.py +3 -0
- minitap/mobile_use/context.py +106 -0
- minitap/mobile_use/controllers/__init__.py +0 -0
- minitap/mobile_use/controllers/android_controller.py +524 -0
- minitap/mobile_use/controllers/controller_factory.py +46 -0
- minitap/mobile_use/controllers/device_controller.py +182 -0
- minitap/mobile_use/controllers/ios_controller.py +436 -0
- minitap/mobile_use/controllers/platform_specific_commands_controller.py +199 -0
- minitap/mobile_use/controllers/types.py +106 -0
- minitap/mobile_use/controllers/unified_controller.py +193 -0
- minitap/mobile_use/graph/graph.py +160 -0
- minitap/mobile_use/graph/state.py +115 -0
- minitap/mobile_use/main.py +309 -0
- minitap/mobile_use/sdk/__init__.py +12 -0
- minitap/mobile_use/sdk/agent.py +1294 -0
- minitap/mobile_use/sdk/builders/__init__.py +10 -0
- minitap/mobile_use/sdk/builders/agent_config_builder.py +307 -0
- minitap/mobile_use/sdk/builders/index.py +15 -0
- minitap/mobile_use/sdk/builders/task_request_builder.py +236 -0
- minitap/mobile_use/sdk/constants.py +1 -0
- minitap/mobile_use/sdk/examples/README.md +83 -0
- minitap/mobile_use/sdk/examples/__init__.py +1 -0
- minitap/mobile_use/sdk/examples/app_lock_messaging.py +54 -0
- minitap/mobile_use/sdk/examples/platform_manual_task_example.py +67 -0
- minitap/mobile_use/sdk/examples/platform_minimal_example.py +48 -0
- minitap/mobile_use/sdk/examples/simple_photo_organizer.py +76 -0
- minitap/mobile_use/sdk/examples/smart_notification_assistant.py +225 -0
- minitap/mobile_use/sdk/examples/video_transcription_example.py +117 -0
- minitap/mobile_use/sdk/services/cloud_mobile.py +656 -0
- minitap/mobile_use/sdk/services/platform.py +434 -0
- minitap/mobile_use/sdk/types/__init__.py +51 -0
- minitap/mobile_use/sdk/types/agent.py +84 -0
- minitap/mobile_use/sdk/types/exceptions.py +138 -0
- minitap/mobile_use/sdk/types/platform.py +183 -0
- minitap/mobile_use/sdk/types/task.py +269 -0
- minitap/mobile_use/sdk/utils.py +29 -0
- minitap/mobile_use/services/accessibility.py +100 -0
- minitap/mobile_use/services/llm.py +247 -0
- minitap/mobile_use/services/telemetry.py +421 -0
- minitap/mobile_use/tools/index.py +67 -0
- minitap/mobile_use/tools/mobile/back.py +52 -0
- minitap/mobile_use/tools/mobile/erase_one_char.py +56 -0
- minitap/mobile_use/tools/mobile/focus_and_clear_text.py +317 -0
- minitap/mobile_use/tools/mobile/focus_and_input_text.py +153 -0
- minitap/mobile_use/tools/mobile/launch_app.py +86 -0
- minitap/mobile_use/tools/mobile/long_press_on.py +169 -0
- minitap/mobile_use/tools/mobile/open_link.py +62 -0
- minitap/mobile_use/tools/mobile/press_key.py +83 -0
- minitap/mobile_use/tools/mobile/stop_app.py +62 -0
- minitap/mobile_use/tools/mobile/swipe.py +156 -0
- minitap/mobile_use/tools/mobile/tap.py +154 -0
- minitap/mobile_use/tools/mobile/video_recording.py +177 -0
- minitap/mobile_use/tools/mobile/wait_for_delay.py +81 -0
- minitap/mobile_use/tools/scratchpad.py +147 -0
- minitap/mobile_use/tools/test_utils.py +413 -0
- minitap/mobile_use/tools/tool_wrapper.py +16 -0
- minitap/mobile_use/tools/types.py +35 -0
- minitap/mobile_use/tools/utils.py +336 -0
- minitap/mobile_use/utils/app_launch_utils.py +173 -0
- minitap/mobile_use/utils/cli_helpers.py +37 -0
- minitap/mobile_use/utils/cli_selection.py +143 -0
- minitap/mobile_use/utils/conversations.py +31 -0
- minitap/mobile_use/utils/decorators.py +124 -0
- minitap/mobile_use/utils/errors.py +6 -0
- minitap/mobile_use/utils/file.py +13 -0
- minitap/mobile_use/utils/logger.py +183 -0
- minitap/mobile_use/utils/media.py +186 -0
- minitap/mobile_use/utils/recorder.py +52 -0
- minitap/mobile_use/utils/requests_utils.py +37 -0
- minitap/mobile_use/utils/shell_utils.py +20 -0
- minitap/mobile_use/utils/test_ui_hierarchy.py +178 -0
- minitap/mobile_use/utils/time.py +6 -0
- minitap/mobile_use/utils/ui_hierarchy.py +132 -0
- minitap/mobile_use/utils/video.py +281 -0
- minitap_mobile_use-3.3.0.dist-info/METADATA +329 -0
- minitap_mobile_use-3.3.0.dist-info/RECORD +115 -0
- minitap_mobile_use-3.3.0.dist-info/WHEEL +4 -0
- minitap_mobile_use-3.3.0.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
from unittest.mock import AsyncMock, Mock, patch
|
|
3
|
+
|
|
4
|
+
import pytest
|
|
5
|
+
from pydantic import BaseModel
|
|
6
|
+
|
|
7
|
+
sys.modules["langgraph.prebuilt.chat_agent_executor"] = Mock()
|
|
8
|
+
sys.modules["minitap.mobile_use.graph.state"] = Mock()
|
|
9
|
+
sys.modules["langchain_google_vertexai"] = Mock()
|
|
10
|
+
sys.modules["langchain_google_genai"] = Mock()
|
|
11
|
+
sys.modules["langchain_openai"] = Mock()
|
|
12
|
+
sys.modules["langchain_cerebras"] = Mock()
|
|
13
|
+
|
|
14
|
+
from minitap.mobile_use.agents.outputter.outputter import outputter # noqa: E402
|
|
15
|
+
from minitap.mobile_use.config import LLM, OutputConfig # noqa: E402
|
|
16
|
+
from minitap.mobile_use.context import MobileUseContext # noqa: E402
|
|
17
|
+
from minitap.mobile_use.utils.logger import get_logger # noqa: E402
|
|
18
|
+
|
|
19
|
+
logger = get_logger(__name__)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
# Pydantic schema handed to OutputConfig(structured_output=...) in the tests below.
# Documented with comments (not a docstring) so the generated model schema is untouched.
class MockPydanticSchema(BaseModel):
    color: str  # product colour, e.g. "green"
    price: float  # numeric price
    currency_symbol: str  # e.g. "$"
    website_url: str  # URL of the site the product was found on
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# Plain-dict counterpart of MockPydanticSchema; used as `structured_output` in the dict test.
mock_dict = dict(
    color="green",
    price=20,
    currency_symbol="$",
    website_url="http://superwebsite.fr",
)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class DummyState:
    """Bare attribute holder passed to the outputter as `graph_output` in these tests."""

    def __init__(self, messages, initial_goal, agents_thoughts):
        # Store the three values exactly as given; no copying or validation.
        self.messages, self.initial_goal, self.agents_thoughts = (
            messages,
            initial_goal,
            agents_thoughts,
        )
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
# Module-level state instance carrying the same data as the `mock_state` fixture.
# NOTE(review): not referenced by any test in this file; kept for backward compatibility.
mocked_state = DummyState(
    [],
    "Find a green product on my website",
    [
        "Going on http://superwebsite.fr",
        "Searching for products",
        "Filtering by color",
        "Color 'green' found for a 20 dollars product",
    ],
)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@pytest.fixture
def mock_context():
    """Return a `MobileUseContext`-spec'd mock with an LLM entry per agent and a mock device."""
    agent_names = ("executor", "cortex", "planner", "orchestrator")
    context = Mock(spec=MobileUseContext)
    # Every agent gets its own LLM config object with identical settings.
    context.llm_config = {
        agent: LLM(provider="openai", model="gpt-5-nano") for agent in agent_names
    }
    context.device = Mock()
    return context
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
@pytest.fixture
def mock_state():
    """Return a fresh `DummyState` describing a finished "find a green product" run."""
    thoughts = [
        "Going on http://superwebsite.fr",
        "Searching for products",
        "Filtering by color",
        "Color 'green' found for a 20 dollars product",
    ]
    goal = "Find a green product on my website"
    return DummyState(messages=[], initial_goal=goal, agents_thoughts=thoughts)
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
@patch("minitap.mobile_use.agents.outputter.outputter.get_llm")
@pytest.mark.asyncio
async def test_outputter_with_pydantic_model(mock_get_llm, mock_context, mock_state):
    """The outputter yields a dict when `structured_output` is a Pydantic model class."""
    llm_answer = MockPydanticSchema(
        color="green", price=20, currency_symbol="$", website_url="http://superwebsite.fr"
    )

    # Structured LLM: awaiting `ainvoke` resolves to the schema instance above.
    structured_llm = AsyncMock()
    structured_llm.ainvoke.return_value = llm_answer

    # Base LLM returned by the patched `get_llm`; it hands out the structured LLM.
    base_llm = Mock()
    base_llm.with_structured_output.return_value = structured_llm
    mock_get_llm.return_value = base_llm

    cfg = OutputConfig(structured_output=MockPydanticSchema, output_description=None)

    result = await outputter(ctx=mock_context, output_config=cfg, graph_output=mock_state)

    assert isinstance(result, dict)
    assert result.get("color") == "green"
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
@patch("minitap.mobile_use.agents.outputter.outputter.get_llm")
@pytest.mark.asyncio
async def test_outputter_with_dict(mock_get_llm, mock_context, mock_state):
    """The outputter yields the expected dict when `structured_output` is a plain dict."""
    expected = {
        "color": "green",
        "price": 20,
        "currency_symbol": "$",
        "website_url": "http://superwebsite.fr",
    }

    # Structured LLM: awaiting `ainvoke` resolves directly to the expected dict.
    structured_llm = AsyncMock()
    structured_llm.ainvoke.return_value = expected

    # Base LLM returned by the patched `get_llm`; it hands out the structured LLM.
    base_llm = Mock()
    base_llm.with_structured_output.return_value = structured_llm
    mock_get_llm.return_value = base_llm

    cfg = OutputConfig(structured_output=mock_dict, output_description=None)

    result = await outputter(ctx=mock_context, output_config=cfg, graph_output=mock_state)

    assert isinstance(result, dict)
    # Same four checks as before, in the same order: colour, price, currency, URL.
    for key in ("color", "price", "currency_symbol", "website_url"):
        assert result.get(key) == expected[key]
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
@patch("minitap.mobile_use.agents.outputter.outputter.get_llm")
@pytest.mark.asyncio
async def test_outputter_with_natural_language_output(mock_get_llm, mock_context, mock_state):
    """The outputter yields a dict when only a free-text `output_description` is given."""
    # No structured output here: the LLM reply is a message whose `content` is JSON text.
    expected_json = (
        '{"color": "green", "price": 20, "currency_symbol": "$", '
        '"website_url": "http://superwebsite.fr"}'
    )
    plain_llm = AsyncMock()
    plain_llm.ainvoke.return_value = Mock(content=expected_json)
    mock_get_llm.return_value = plain_llm

    description = "A JSON object with a color, a price, a currency_symbol and a website_url key"
    cfg = OutputConfig(structured_output=None, output_description=description)

    result = await outputter(ctx=mock_context, output_config=cfg, graph_output=mock_state)

    assert isinstance(result, dict)
    expected_pairs = (
        ("color", "green"),
        ("price", 20),
        ("currency_symbol", "$"),
        ("website_url", "http://superwebsite.fr"),
    )
    for key, value in expected_pairs:
        assert result.get(key) == value
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
Here is your input.
|
|
2
|
+
|
|
3
|
+
---
|
|
4
|
+
|
|
5
|
+
**Action (plan or replan)**: {{ action }}
|
|
6
|
+
|
|
7
|
+
**Initial Goal**: {{ initial_goal }}
|
|
8
|
+
|
|
9
|
+
{% if action == "replan" %}
|
|
10
|
+
Relevant only if action is replan:
|
|
11
|
+
|
|
12
|
+
**Previous Plan**: {{ previous_plan }}
|
|
13
|
+
**Agent Thoughts**: {{ agent_thoughts }}
|
|
14
|
+
{% endif %}
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
## You are the **Planner**
|
|
2
|
+
|
|
3
|
+
Break down user goals into **sequential subgoals** for {{ platform }} mobile execution.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## 🚨 Critical Rules
|
|
8
|
+
|
|
9
|
+
{% if current_foreground_app %}
|
|
10
|
+
### App Already Open: `{{ current_foreground_app }}`
|
|
11
|
+
**NEVER** create "Open {{ current_foreground_app }}" subgoal. Start with first action INSIDE the app.
|
|
12
|
+
{% endif %}
|
|
13
|
+
{% if locked_app_package %}
|
|
14
|
+
### App Lock: `{{ locked_app_package }}`
|
|
15
|
+
All actions must stay within this app (except OAuth flows).
|
|
16
|
+
{% endif %}
|
|
17
|
+
|
|
18
|
+
---
|
|
19
|
+
|
|
20
|
+
## Planning Guidelines
|
|
21
|
+
|
|
22
|
+
**Subgoals should be:**
|
|
23
|
+
- **Purpose-driven**: "Open conversation with Alice to send message" not just "Tap chat"
|
|
24
|
+
- **Sequential**: Each step prepares the next
|
|
25
|
+
- **Not too granular**: High-level milestones, not button-by-button
|
|
26
|
+
- **No loops**: Instead of "repeat 3 times", write 3 separate subgoals
|
|
27
|
+
- **Self-correcting**: If the goal has specific formatting constraints (e.g., "add a new line", "sorted list"), include a final subgoal to **verify the result and fix it** if necessary.
|
|
28
|
+
|
|
29
|
+
**Shortcuts**: Always prefer `launch_app` to open apps (instead of navigating the app drawer manually) and `open_link` to open URLs.
|
|
30
|
+
{% if video_recording_enabled %}
|
|
31
|
+
### 🎥 Video Recording Pattern
|
|
32
|
+
|
|
33
|
+
When a goal involves capturing, transcribing, or analyzing video content, follow this execution order:
|
|
34
|
+
|
|
35
|
+
1. **Start recording BEFORE interacting with the video** — The recording must be active before any click that triggers playback
|
|
36
|
+
2. **Perform video interactions** — Open/play the video and wait for it to complete
|
|
37
|
+
3. **Stop recording AFTER the video ends** — Only stop once playback is fully finished
|
|
38
|
+
|
|
39
|
+
⚠️ **Critical:** Each recording action (start/stop) must be its own dedicated subgoal. This prevents truncation due to short videos or inference latency.
|
|
40
|
+
|
|
41
|
+
**Example subgoals for "transcribe video X":**
|
|
42
|
+
1. Navigate to video X location
|
|
43
|
+
2. Start screen recording
|
|
44
|
+
3. Play video X and wait for completion
|
|
45
|
+
4. Stop recording and extract transcription
|
|
46
|
+
{% endif %}
|
|
47
|
+
|
|
48
|
+
Available tools: {{ executor_tools_list }}
|
|
49
|
+
|
|
50
|
+
---
|
|
51
|
+
|
|
52
|
+
## Replanning
|
|
53
|
+
|
|
54
|
+
When revising a failed plan:
|
|
55
|
+
1. **Keep completed subgoals** - don't restart from scratch
|
|
56
|
+
2. **Use agent thoughts** as source of truth for what happened
|
|
57
|
+
3. **Pivot strategy** based on observations (e.g., use search if scrolling failed)
|
|
58
|
+
4. **Continue from current state**, not from beginning
|
|
59
|
+
|
|
60
|
+
---
|
|
61
|
+
|
|
62
|
+
## Output Format
|
|
63
|
+
|
|
64
|
+
```json
|
|
65
|
+
{
|
|
66
|
+
"subgoals": [
|
|
67
|
+
{"description": "First subgoal description"},
|
|
68
|
+
{"description": "Second subgoal description"}
|
|
69
|
+
]
|
|
70
|
+
}
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
---
|
|
74
|
+
|
|
75
|
+
## Examples
|
|
76
|
+
|
|
77
|
+
**Goal:** "Send 'I'm running late' to Alice on WhatsApp"
|
|
78
|
+
|
|
79
|
+
❌ **Bad subgoals (overlapping/vague):**
|
|
80
|
+
```
|
|
81
|
+
- Open WhatsApp to find Alice ← What does "find" mean?
|
|
82
|
+
- Open conversation with Alice ← Might already be done if "find" included tapping
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
✅ **Good subgoals (atomic, non-overlapping):**
|
|
86
|
+
```
|
|
87
|
+
- Open WhatsApp
|
|
88
|
+
- Navigate to Alice's conversation
|
|
89
|
+
- Send the message "I'm running late"
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
**Key principle:** Each subgoal = one clear checkpoint. The Cortex decides HOW, the Planner defines WHAT milestone to reach.
|
|
93
|
+
|
|
94
|
+
---
|
|
95
|
+
|
|
96
|
+
**Replanning after failure:**
|
|
97
|
+
```
|
|
98
|
+
Original: "Navigate to Alice's conversation" (FAILED)
|
|
99
|
+
Agent thoughts: Alice not in visible chats, search bar available
|
|
100
|
+
|
|
101
|
+
New plan:
|
|
102
|
+
- Search for "Alice" using search bar
|
|
103
|
+
- Open conversation from search results
|
|
104
|
+
- Send message
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
**Cross-app data transfer (using note tools):**
|
|
108
|
+
```
|
|
109
|
+
Goal: "Copy the recipe ingredients from RecipeApp and add them to my shopping list in ShoppingApp"
|
|
110
|
+
|
|
111
|
+
✅ Correct subgoals:
|
|
112
|
+
- Open RecipeApp and navigate to the recipe
|
|
113
|
+
- Save the ingredients list using the `save_note` tool
|
|
114
|
+
- Open ShoppingApp
|
|
115
|
+
- Read the saved note using the `read_note` tool and add items to shopping list
|
|
116
|
+
```
|
|
117
|
+
{% if current_foreground_app %}
|
|
118
|
+
|
|
119
|
+
**Foreground app already open (`{{ current_foreground_app }}`):**
|
|
120
|
+
```
|
|
121
|
+
Goal: "Send message to Bob"
|
|
122
|
+
|
|
123
|
+
✅ Correct: Navigate to Bob's chat → Send message
|
|
124
|
+
❌ Wrong: Open {{ current_foreground_app }} → ... (app already open!)
|
|
125
|
+
```
|
|
126
|
+
{% endif %}
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
from jinja2 import Template
|
|
4
|
+
from langchain_core.messages import HumanMessage, SystemMessage
|
|
5
|
+
|
|
6
|
+
from minitap.mobile_use.agents.planner.types import PlannerOutput, Subgoal, SubgoalStatus
|
|
7
|
+
from minitap.mobile_use.agents.planner.utils import generate_id, one_of_them_is_failure
|
|
8
|
+
from minitap.mobile_use.context import MobileUseContext
|
|
9
|
+
from minitap.mobile_use.controllers.platform_specific_commands_controller import (
|
|
10
|
+
get_current_foreground_package,
|
|
11
|
+
)
|
|
12
|
+
from minitap.mobile_use.graph.state import State
|
|
13
|
+
from minitap.mobile_use.services.llm import get_llm, invoke_llm_with_timeout_message, with_fallback
|
|
14
|
+
from minitap.mobile_use.tools.index import (
|
|
15
|
+
EXECUTOR_WRAPPERS_TOOLS,
|
|
16
|
+
VIDEO_RECORDING_WRAPPERS,
|
|
17
|
+
format_tools_list,
|
|
18
|
+
)
|
|
19
|
+
from minitap.mobile_use.utils.decorators import wrap_with_callbacks
|
|
20
|
+
from minitap.mobile_use.utils.logger import get_logger
|
|
21
|
+
|
|
22
|
+
logger = get_logger(__name__)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class PlannerNode:
    """Graph node that asks the "planner" LLM to break the initial goal into subgoals."""

    def __init__(self, ctx: MobileUseContext):
        self.ctx = ctx

    @wrap_with_callbacks(
        before=lambda: logger.info("Starting Planner Agent..."),
        on_success=lambda _: logger.success("Planner Agent"),
        on_failure=lambda _: logger.error("Planner Agent"),
    )
    async def __call__(self, state: State):
        """Produce a new subgoal plan and return it as a state update.

        The call is a "replan" when at least one subgoal of `state.subgoal_plan` has
        failed, otherwise a first "plan". The system prompt is rendered from `planner.md`
        and the human prompt from `human.md`, both located next to this module.

        Returns:
            The result of `state.asanitize_update(...)` with `subgoal_plan` set to the
            newly generated subgoals, each starting as NOT_STARTED.
        """
        ctx = self.ctx
        prompts_dir = Path(__file__).parent
        is_replan = one_of_them_is_failure(state.subgoal_plan)

        # Gather the facts about the device/app that the system prompt is conditioned on.
        locked_app_package = None
        if ctx.execution_setup:
            locked_app_package = ctx.execution_setup.get_locked_app_package()
        foreground_app = get_current_foreground_package(ctx)

        # Copy first so the shared tool list is never mutated when recording tools are added.
        tool_wrappers = [*EXECUTOR_WRAPPERS_TOOLS]
        if ctx.video_recording_enabled:
            tool_wrappers += VIDEO_RECORDING_WRAPPERS

        system_template = Template((prompts_dir / "planner.md").read_text(encoding="utf-8"))
        system_prompt = system_template.render(
            platform=ctx.device.mobile_platform.value,
            executor_tools_list=format_tools_list(ctx=ctx, wrappers=tool_wrappers),
            locked_app_package=locked_app_package,
            current_foreground_app=foreground_app,
            video_recording_enabled=ctx.video_recording_enabled,
        )

        human_template = Template((prompts_dir / "human.md").read_text(encoding="utf-8"))
        human_prompt = human_template.render(
            action="replan" if is_replan else "plan",
            initial_goal=state.initial_goal,
            previous_plan="\n".join(str(s) for s in state.subgoal_plan),
            agent_thoughts="\n".join(state.agents_thoughts),
        )

        prompt_messages = [
            SystemMessage(content=system_prompt),
            HumanMessage(content=human_prompt),
        ]

        planner_llm = get_llm(ctx=ctx, name="planner").with_structured_output(PlannerOutput)
        fallback_llm = get_llm(ctx=ctx, name="planner", use_fallback=True).with_structured_output(
            PlannerOutput
        )

        def call_main():
            return invoke_llm_with_timeout_message(planner_llm.ainvoke(prompt_messages))

        def call_fallback():
            return invoke_llm_with_timeout_message(fallback_llm.ainvoke(prompt_messages))

        response: PlannerOutput = await with_fallback(
            main_call=call_main,
            fallback_call=call_fallback,
        )  # type: ignore

        # Wrap every LLM-proposed description in a tracked Subgoal with a fresh random id.
        new_plan = []
        for proposed in response.subgoals:
            new_plan.append(
                Subgoal(
                    id=generate_id(),
                    description=proposed.description,
                    status=SubgoalStatus.NOT_STARTED,
                    completion_reason=None,
                )
            )

        logger.info("📜 Generated plan:")
        logger.info("\n".join(str(s) for s in new_plan))

        # Optional user hook: told about the new plan and whether it is a replan.
        if ctx.on_plan_changes:
            await ctx.on_plan_changes(new_plan, is_replan)

        return await state.asanitize_update(
            ctx=ctx,
            update={"subgoal_plan": new_plan},
            agent="planner",
        )
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
from datetime import datetime
|
|
2
|
+
from enum import Enum
|
|
3
|
+
from typing import Annotated
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
# One subgoal as produced by the planner LLM: only its textual description.
# Documented with comments (not a docstring) so the generated model schema is untouched.
class PlannerSubgoalOutput(BaseModel):
    description: str
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
# Structured output of the planner LLM: the ordered list of proposed subgoals.
# Documented with comments (not a docstring) so the generated model schema is untouched.
class PlannerOutput(BaseModel):
    subgoals: list[PlannerSubgoalOutput]
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# Lifecycle states of a subgoal. Each member's value is its own name as a string.
class SubgoalStatus(Enum):
    NOT_STARTED = "NOT_STARTED"  # planned, not begun yet
    PENDING = "PENDING"  # started and not yet ended
    SUCCESS = "SUCCESS"  # ended successfully
    FAILURE = "FAILURE"  # ended unsuccessfully
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# A tracked step of the plan: identity, description, lifecycle status and timestamps.
# Documented with comments (not a docstring) so the generated model schema is untouched.
class Subgoal(BaseModel):
    id: Annotated[str, "Unique identifier of the subgoal"]
    description: Annotated[str, "Description of the subgoal"]
    completion_reason: Annotated[
        str | None, "Reason why the subgoal was completed (failure or success)"
    ] = None
    status: SubgoalStatus
    started_at: Annotated[datetime | None, "When the subgoal started"] = None
    ended_at: Annotated[datetime | None, "When the subgoal ended"] = None

    def __str__(self):
        # Marker shown for each status; "❓" is the fallback for anything not listed.
        markers = {
            SubgoalStatus.SUCCESS: "✅",
            SubgoalStatus.FAILURE: "❌",
            SubgoalStatus.PENDING: "⏳",
            SubgoalStatus.NOT_STARTED: "(not started yet)",
        }
        marker = markers.get(self.status, "❓")

        line = f"- [ID:{self.id}]: {self.description} : {marker}."
        if not self.completion_reason:
            return line
        return line + f" Completion reason: {self.completion_reason}"

    def __repr__(self):
        # repr deliberately mirrors the human-readable plan line.
        return str(self)
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
import random
|
|
2
|
+
import string
|
|
3
|
+
|
|
4
|
+
from minitap.mobile_use.agents.planner.types import Subgoal, SubgoalStatus
|
|
5
|
+
from datetime import datetime, UTC
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def get_current_subgoal(subgoals: list[Subgoal]) -> Subgoal | None:
    """Return the first subgoal whose status is PENDING, or None if there is none."""
    for candidate in subgoals:
        if candidate.status == SubgoalStatus.PENDING:
            return candidate
    return None
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def get_subgoals_by_ids(subgoals: list[Subgoal], ids: list[str]) -> list[Subgoal]:
    """Return the subgoals whose `id` appears in `ids`, in their original plan order.

    Args:
        subgoals: Subgoals to filter.
        ids: Identifiers to keep. Duplicates are harmless; any iterable of strings works,
            including a one-shot iterator.

    Returns:
        A new list holding the matching subgoals (possibly empty).
    """
    # Materialise once: O(1) lookups, and a one-shot iterable is not drained mid-filter.
    wanted = set(ids)
    return [subgoal for subgoal in subgoals if subgoal.id in wanted]
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def get_next_subgoal(subgoals: list[Subgoal]) -> Subgoal | None:
    """Return the first subgoal whose status is NOT_STARTED, or None if there is none."""
    for candidate in subgoals:
        if candidate.status == SubgoalStatus.NOT_STARTED:
            return candidate
    return None
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def nothing_started(subgoals: list[Subgoal]) -> bool:
    """Return True when every subgoal is NOT_STARTED (also True for an empty list)."""
    for subgoal in subgoals:
        if subgoal.status == SubgoalStatus.NOT_STARTED:
            continue
        return False
    return True
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def complete_current_subgoal(subgoals: list[Subgoal]) -> list[Subgoal]:
    """Mark the current (PENDING) subgoal as SUCCESS and stamp its end time in UTC.

    The subgoal is modified in place; the same `subgoals` list is returned. Nothing
    changes when there is no current subgoal.
    """
    current = get_current_subgoal(subgoals)
    if current:
        current.status = SubgoalStatus.SUCCESS
        current.ended_at = datetime.now(UTC)
    return subgoals
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def complete_subgoals_by_ids(subgoals: list[Subgoal], ids: list[str]) -> list[Subgoal]:
    """Mark every subgoal whose `id` appears in `ids` as SUCCESS and stamp its end time.

    Args:
        subgoals: Plan to update; matching subgoals are modified in place.
        ids: Identifiers of the subgoals to complete. Any iterable of strings works,
            including a one-shot iterator.

    Returns:
        The same `subgoals` list that was passed in.
    """
    # Materialise once: O(1) lookups, and a one-shot iterable is not drained mid-loop.
    wanted = set(ids)
    for subgoal in subgoals:
        if subgoal.id in wanted:
            subgoal.status = SubgoalStatus.SUCCESS
            subgoal.ended_at = datetime.now(UTC)
    return subgoals
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def fail_current_subgoal(subgoals: list[Subgoal]) -> list[Subgoal]:
    """Mark the current (PENDING) subgoal as FAILURE and stamp its end time in UTC.

    The subgoal is modified in place; the same `subgoals` list is returned. Nothing
    changes when there is no current subgoal.
    """
    current = get_current_subgoal(subgoals)
    if current:
        current.status = SubgoalStatus.FAILURE
        current.ended_at = datetime.now(UTC)
    return subgoals
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def all_completed(subgoals: list[Subgoal]) -> bool:
    """Return True when every subgoal is SUCCESS (also True for an empty list)."""
    for subgoal in subgoals:
        if subgoal.status == SubgoalStatus.SUCCESS:
            continue
        return False
    return True
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def one_of_them_is_failure(subgoals: list[Subgoal]) -> bool:
    """Return True as soon as any subgoal is FAILURE; False otherwise (incl. empty list)."""
    for subgoal in subgoals:
        if subgoal.status == SubgoalStatus.FAILURE:
            return True
    return False
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def start_next_subgoal(subgoals: list[Subgoal]) -> list[Subgoal]:
    """Move the first NOT_STARTED subgoal to PENDING and stamp its start time in UTC.

    The subgoal is modified in place; the same `subgoals` list is returned. Nothing
    changes when no subgoal is left to start.
    """
    upcoming = get_next_subgoal(subgoals)
    if upcoming:
        upcoming.status = SubgoalStatus.PENDING
        upcoming.started_at = datetime.now(UTC)
    return subgoals
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def generate_id(length: int = 6) -> str:
    """Return a random ID of `length` characters (lowercase ASCII letters and digits).

    Uniqueness is not enforced; a non-positive `length` yields an empty string.
    """
    alphabet = string.ascii_lowercase + string.digits
    picked = []
    for _ in range(length):
        picked.append(random.choice(alphabet))
    return "".join(picked)
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
from langchain_core.messages import (
|
|
2
|
+
HumanMessage,
|
|
3
|
+
RemoveMessage,
|
|
4
|
+
ToolMessage,
|
|
5
|
+
)
|
|
6
|
+
|
|
7
|
+
from minitap.mobile_use.constants import MAX_MESSAGES_IN_HISTORY
|
|
8
|
+
from minitap.mobile_use.context import MobileUseContext
|
|
9
|
+
from minitap.mobile_use.graph.state import State
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class SummarizerNode:
    """Graph node that trims the oldest messages once the history grows too long."""

    def __init__(self, ctx: MobileUseContext):
        self.ctx = ctx

    async def __call__(self, state: State):
        """Emit `RemoveMessage` entries for old messages beyond MAX_MESSAGES_IN_HISTORY.

        Returns an empty update when the history is within the limit. Otherwise the
        oldest surplus messages are scanned from newest to oldest, and removal starts
        only at the first ToolMessage or HumanMessage met; surplus messages newer than
        that one are kept. Messages without an `id` are never scheduled for removal.
        """
        surplus = len(state.messages) - MAX_MESSAGES_IN_HISTORY
        if surplus <= 0:
            return {}

        removals = []
        removing = False
        # NOTE(review): presumably the Tool/Human boundary avoids cutting a tool call
        # away from its result - confirm.
        for message in reversed(state.messages[:surplus]):
            removing = removing or isinstance(message, (ToolMessage, HumanMessage))
            if removing and message.id:
                removals.append(RemoveMessage(id=message.id))

        return await state.asanitize_update(
            ctx=self.ctx,
            update={"messages": removals},
        )
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
## You are a **Video Analysis Assistant**
|
|
2
|
+
|
|
3
|
+
You analyze video recordings of mobile device screens and provide accurate, detailed responses based on what you observe.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## Your Focus Areas
|
|
8
|
+
|
|
9
|
+
When analyzing videos, pay attention to:
|
|
10
|
+
|
|
11
|
+
- **UI elements** and their states (buttons, text fields, toggles, etc.)
|
|
12
|
+
- **Text content** displayed on screen
|
|
13
|
+
- **Actions that occur** (taps, scrolls, transitions, animations)
|
|
14
|
+
- **Notifications or dialogs** that appear
|
|
15
|
+
- **Changes in the interface** over time
|
|
16
|
+
- **Audio content** if present (transcribe speech, describe sounds)
|
|
17
|
+
|
|
18
|
+
---
|
|
19
|
+
|
|
20
|
+
## Guidelines
|
|
21
|
+
|
|
22
|
+
- **Be precise and factual** - Only describe what you can actually see or hear
|
|
23
|
+
- **Note uncertainty** - If you cannot clearly see or determine something, say so
|
|
24
|
+
- **Be thorough** - Capture all relevant details that relate to the user's question
|
|
25
|
+
- **Use timestamps** when describing sequences of events (e.g., "At 0:05, the user taps...")
|
|
26
|
+
- **Structure your response** clearly when there's a lot of information
|
|
27
|
+
|
|
28
|
+
---
|
|
29
|
+
|
|
30
|
+
## Response Format
|
|
31
|
+
|
|
32
|
+
Adapt your response format to the user's request:
|
|
33
|
+
|
|
34
|
+
- For **transcription requests**: Provide clean, readable text of what was spoken or displayed
|
|
35
|
+
- For **description requests**: Give a chronological narrative of events
|
|
36
|
+
- For **specific questions**: Answer directly and concisely
|
|
37
|
+
- For **extraction requests**: List items clearly (e.g., notifications, text content)
|