minitap-mobile-use 2.0.0__py3-none-any.whl → 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of minitap-mobile-use might be problematic. Click here for more details.
- minitap/mobile_use/agents/cortex/cortex.md +19 -10
- minitap/mobile_use/agents/cortex/cortex.py +15 -2
- minitap/mobile_use/agents/cortex/types.py +2 -4
- minitap/mobile_use/agents/executor/executor.md +20 -15
- minitap/mobile_use/agents/executor/executor.py +6 -18
- minitap/mobile_use/agents/executor/tool_node.py +105 -0
- minitap/mobile_use/agents/hopper/hopper.md +2 -10
- minitap/mobile_use/agents/hopper/hopper.py +4 -9
- minitap/mobile_use/agents/orchestrator/human.md +3 -4
- minitap/mobile_use/agents/orchestrator/orchestrator.md +25 -7
- minitap/mobile_use/agents/orchestrator/orchestrator.py +56 -56
- minitap/mobile_use/agents/orchestrator/types.py +5 -8
- minitap/mobile_use/agents/outputter/outputter.py +1 -2
- minitap/mobile_use/agents/planner/planner.md +25 -15
- minitap/mobile_use/agents/planner/planner.py +7 -1
- minitap/mobile_use/agents/planner/types.py +10 -5
- minitap/mobile_use/agents/planner/utils.py +11 -0
- minitap/mobile_use/agents/summarizer/summarizer.py +2 -1
- minitap/mobile_use/clients/device_hardware_client.py +3 -0
- minitap/mobile_use/config.py +16 -14
- minitap/mobile_use/constants.py +1 -0
- minitap/mobile_use/context.py +3 -4
- minitap/mobile_use/controllers/mobile_command_controller.py +37 -26
- minitap/mobile_use/controllers/platform_specific_commands_controller.py +3 -4
- minitap/mobile_use/graph/graph.py +10 -31
- minitap/mobile_use/graph/state.py +34 -14
- minitap/mobile_use/main.py +11 -8
- minitap/mobile_use/sdk/agent.py +78 -63
- minitap/mobile_use/sdk/builders/agent_config_builder.py +23 -11
- minitap/mobile_use/sdk/builders/task_request_builder.py +9 -9
- minitap/mobile_use/sdk/examples/smart_notification_assistant.py +1 -2
- minitap/mobile_use/sdk/types/agent.py +10 -5
- minitap/mobile_use/sdk/types/task.py +19 -18
- minitap/mobile_use/sdk/utils.py +1 -1
- minitap/mobile_use/servers/config.py +1 -2
- minitap/mobile_use/servers/device_hardware_bridge.py +3 -4
- minitap/mobile_use/servers/start_servers.py +4 -4
- minitap/mobile_use/servers/stop_servers.py +12 -18
- minitap/mobile_use/services/llm.py +4 -2
- minitap/mobile_use/tools/index.py +11 -7
- minitap/mobile_use/tools/mobile/back.py +8 -12
- minitap/mobile_use/tools/mobile/clear_text.py +277 -0
- minitap/mobile_use/tools/mobile/copy_text_from.py +8 -12
- minitap/mobile_use/tools/mobile/erase_one_char.py +56 -0
- minitap/mobile_use/tools/mobile/find_packages.py +69 -0
- minitap/mobile_use/tools/mobile/input_text.py +55 -32
- minitap/mobile_use/tools/mobile/launch_app.py +8 -12
- minitap/mobile_use/tools/mobile/long_press_on.py +9 -13
- minitap/mobile_use/tools/mobile/open_link.py +8 -12
- minitap/mobile_use/tools/mobile/paste_text.py +8 -12
- minitap/mobile_use/tools/mobile/press_key.py +8 -12
- minitap/mobile_use/tools/mobile/stop_app.py +9 -13
- minitap/mobile_use/tools/mobile/swipe.py +8 -12
- minitap/mobile_use/tools/mobile/take_screenshot.py +8 -12
- minitap/mobile_use/tools/mobile/tap.py +9 -13
- minitap/mobile_use/tools/mobile/wait_for_animation_to_end.py +9 -13
- minitap/mobile_use/tools/tool_wrapper.py +1 -23
- minitap/mobile_use/tools/utils.py +86 -0
- minitap/mobile_use/utils/cli_helpers.py +1 -2
- minitap/mobile_use/utils/cli_selection.py +5 -6
- minitap/mobile_use/utils/decorators.py +21 -20
- minitap/mobile_use/utils/logger.py +3 -4
- minitap/mobile_use/utils/media.py +1 -1
- minitap/mobile_use/utils/recorder.py +11 -10
- minitap/mobile_use/utils/ui_hierarchy.py +98 -3
- {minitap_mobile_use-2.0.0.dist-info → minitap_mobile_use-2.1.0.dist-info}/METADATA +12 -2
- minitap_mobile_use-2.1.0.dist-info/RECORD +96 -0
- minitap/mobile_use/agents/executor/executor_context_cleaner.py +0 -27
- minitap/mobile_use/tools/mobile/erase_text.py +0 -124
- minitap/mobile_use/tools/mobile/list_packages.py +0 -78
- minitap/mobile_use/tools/mobile/run_flow.py +0 -57
- minitap_mobile_use-2.0.0.dist-info/RECORD +0 -95
- {minitap_mobile_use-2.0.0.dist-info → minitap_mobile_use-2.1.0.dist-info}/WHEEL +0 -0
- {minitap_mobile_use-2.0.0.dist-info → minitap_mobile_use-2.1.0.dist-info}/entry_points.txt +0 -0
|
@@ -1,14 +1,11 @@
|
|
|
1
|
-
from
|
|
1
|
+
from typing import Annotated
|
|
2
2
|
|
|
3
3
|
from pydantic import BaseModel
|
|
4
4
|
|
|
5
5
|
|
|
6
|
-
class OrchestratorStatus(Enum):
|
|
7
|
-
CONTINUE = "continue"
|
|
8
|
-
RESUME = "resume"
|
|
9
|
-
REPLAN = "replan"
|
|
10
|
-
|
|
11
|
-
|
|
12
6
|
class OrchestratorOutput(BaseModel):
|
|
13
|
-
|
|
7
|
+
completed_subgoal_ids: Annotated[
|
|
8
|
+
list[str], "IDs of subgoals that can now be marked as complete"
|
|
9
|
+
] = []
|
|
10
|
+
needs_replaning: Annotated[bool, "Whether the orchestrator needs to replan the subgoal plan"]
|
|
14
11
|
reason: str
|
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
import json
|
|
2
2
|
from pathlib import Path
|
|
3
|
-
from typing import Dict, Type, Union
|
|
4
3
|
|
|
5
4
|
from jinja2 import Template
|
|
6
5
|
from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage
|
|
@@ -49,7 +48,7 @@ async def outputter(
|
|
|
49
48
|
structured_llm = llm
|
|
50
49
|
|
|
51
50
|
if output_config.structured_output:
|
|
52
|
-
schema:
|
|
51
|
+
schema: dict | type[BaseModel] | None = None
|
|
53
52
|
so = output_config.structured_output
|
|
54
53
|
|
|
55
54
|
if isinstance(so, dict):
|
|
@@ -12,7 +12,9 @@ You work like an agile tech lead: defining the key milestones without locking in
|
|
|
12
12
|
- Subgoals should reflect real interactions with mobile UIs (e.g. "Open app", "Tap search bar", "Scroll to item", "Send message to Bob", etc).
|
|
13
13
|
- Don't assume the full UI is visible yet. Plan based on how most mobile apps work, and keep flexibility.
|
|
14
14
|
- The list of agent thoughts is empty, which is expected since this is the first plan.
|
|
15
|
-
-
|
|
15
|
+
- Avoid overly granular, UI-action-based tasks (e.g. "tap", "swipe", "copy", "paste") unless explicitly required.
|
|
16
|
+
- The executor has the following available tools: **{{ executor_tools_list }}**.
|
|
17
|
+
When one of these tools offers a direct shortcut (e.g. `openLink` instead of manually launching a browser and typing a URL), prefer it over decomposed manual steps.
|
|
16
18
|
|
|
17
19
|
2. **Replanning**
|
|
18
20
|
If you're asked to **revise a previous plan**, you'll also receive:
|
|
@@ -25,7 +27,7 @@ You work like an agile tech lead: defining the key milestones without locking in
|
|
|
25
27
|
|
|
26
28
|
### Output
|
|
27
29
|
|
|
28
|
-
You must output a **list of
|
|
30
|
+
You must output a **list of subgoals (description + optional subgoal ID)**, each representing a clear subgoal.
|
|
29
31
|
Each subgoal should be:
|
|
30
32
|
|
|
31
33
|
- Focused on **realistic mobile interactions**
|
|
@@ -33,32 +35,40 @@ Each subgoal should be:
|
|
|
33
35
|
- Sequential (later steps may depend on earlier ones)
|
|
34
36
|
- Don't use loop-like formulation unless necessary (e.g. don't say "repeat this X times", instead reuse the same steps X times as subgoals)
|
|
35
37
|
|
|
38
|
+
If you're replanning and need to keep a previous subgoal, you **must keep the same subgoal ID**.
|
|
39
|
+
|
|
36
40
|
### Examples
|
|
37
41
|
|
|
38
42
|
#### **Initial Goal**: "Open WhatsApp and send 'I’m running late' to Alice"
|
|
39
43
|
|
|
40
44
|
**Plan**:
|
|
41
45
|
|
|
42
|
-
- Open the WhatsApp app
|
|
43
|
-
- Locate or search for Alice
|
|
44
|
-
- Open the conversation with Alice
|
|
45
|
-
- Type the message "I’m running late"
|
|
46
|
-
- Send the message
|
|
46
|
+
- Open the WhatsApp app (ID: None -> will be generated as a UUID like bc3c362d-f498-4f1a-991e-4a2d1f8c1226)
|
|
47
|
+
- Locate or search for Alice (ID: None)
|
|
48
|
+
- Open the conversation with Alice (ID: None)
|
|
49
|
+
- Type the message "I’m running late" (ID: None)
|
|
50
|
+
- Send the message (ID: None)
|
|
51
|
+
|
|
52
|
+
#### **Initial Goal**: "Go on https://tesla.com, and tell me what is the first car being displayed"
|
|
53
|
+
|
|
54
|
+
**Plan**:
|
|
55
|
+
|
|
56
|
+
- Open the link https://tesla.com (ID: None)
|
|
57
|
+
- Find the first car displayed on the home page (ID: None)
|
|
47
58
|
|
|
48
59
|
#### **Replanning Example**
|
|
49
60
|
|
|
50
|
-
**Original Plan**: same as above
|
|
61
|
+
**Original Plan**: same as above with IDs set
|
|
51
62
|
**Agent Thoughts**:
|
|
52
63
|
|
|
53
|
-
- Couldn
|
|
64
|
+
- Couldn't find Alice in recent chats
|
|
54
65
|
- Search bar was present on top of the chat screen
|
|
55
66
|
- Keyboard appeared after tapping search
|
|
56
67
|
|
|
57
68
|
**New Plan**:
|
|
58
69
|
|
|
59
|
-
-
|
|
60
|
-
-
|
|
61
|
-
-
|
|
62
|
-
-
|
|
63
|
-
-
|
|
64
|
-
- Type and send "I’m running late"
|
|
70
|
+
- Open WhatsApp (ID: bc3c362d-f498-4f1a-991e-4a2d1f8c1226)
|
|
71
|
+
- Tap the search bar (ID: None)
|
|
72
|
+
- Search for "Alice" (ID: None)
|
|
73
|
+
- Select the correct chat (ID: None)
|
|
74
|
+
- Type and send "I’m running late" (ID: None)
|
|
@@ -1,12 +1,15 @@
|
|
|
1
|
+
import uuid
|
|
1
2
|
from pathlib import Path
|
|
2
3
|
|
|
3
4
|
from jinja2 import Template
|
|
4
5
|
from langchain_core.messages import HumanMessage, SystemMessage
|
|
6
|
+
|
|
5
7
|
from minitap.mobile_use.agents.planner.types import PlannerOutput, Subgoal, SubgoalStatus
|
|
6
8
|
from minitap.mobile_use.agents.planner.utils import one_of_them_is_failure
|
|
7
9
|
from minitap.mobile_use.context import MobileUseContext
|
|
8
10
|
from minitap.mobile_use.graph.state import State
|
|
9
11
|
from minitap.mobile_use.services.llm import get_llm
|
|
12
|
+
from minitap.mobile_use.tools.index import EXECUTOR_WRAPPERS_TOOLS, format_tools_list
|
|
10
13
|
from minitap.mobile_use.utils.decorators import wrap_with_callbacks
|
|
11
14
|
from minitap.mobile_use.utils.logger import get_logger
|
|
12
15
|
|
|
@@ -35,6 +38,7 @@ class PlannerNode:
|
|
|
35
38
|
initial_goal=state.initial_goal,
|
|
36
39
|
previous_plan="\n".join(str(s) for s in state.subgoal_plan),
|
|
37
40
|
agent_thoughts="\n".join(state.agents_thoughts),
|
|
41
|
+
executor_tools_list=format_tools_list(self.ctx, EXECUTOR_WRAPPERS_TOOLS),
|
|
38
42
|
)
|
|
39
43
|
messages = [
|
|
40
44
|
SystemMessage(content=system_message),
|
|
@@ -47,7 +51,8 @@ class PlannerNode:
|
|
|
47
51
|
|
|
48
52
|
subgoals_plan = [
|
|
49
53
|
Subgoal(
|
|
50
|
-
|
|
54
|
+
id=subgoal.id or str(uuid.uuid4()),
|
|
55
|
+
description=subgoal.description,
|
|
51
56
|
status=SubgoalStatus.NOT_STARTED,
|
|
52
57
|
completion_reason=None,
|
|
53
58
|
)
|
|
@@ -61,4 +66,5 @@ class PlannerNode:
|
|
|
61
66
|
update={
|
|
62
67
|
"subgoal_plan": subgoals_plan,
|
|
63
68
|
},
|
|
69
|
+
agent="planner",
|
|
64
70
|
)
|
|
@@ -1,12 +1,16 @@
|
|
|
1
1
|
from enum import Enum
|
|
2
|
-
from typing import Optional
|
|
3
2
|
|
|
4
3
|
from pydantic import BaseModel
|
|
5
|
-
from
|
|
4
|
+
from typing import Annotated
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class PlannerSubgoalOutput(BaseModel):
|
|
8
|
+
id: Annotated[str | None, "If not provided, it will be generated"] = None
|
|
9
|
+
description: str
|
|
6
10
|
|
|
7
11
|
|
|
8
12
|
class PlannerOutput(BaseModel):
|
|
9
|
-
subgoals: list[
|
|
13
|
+
subgoals: list[PlannerSubgoalOutput]
|
|
10
14
|
|
|
11
15
|
|
|
12
16
|
class SubgoalStatus(Enum):
|
|
@@ -17,9 +21,10 @@ class SubgoalStatus(Enum):
|
|
|
17
21
|
|
|
18
22
|
|
|
19
23
|
class Subgoal(BaseModel):
|
|
24
|
+
id: Annotated[str, "Unique identifier of the subgoal"]
|
|
20
25
|
description: Annotated[str, "Description of the subgoal"]
|
|
21
26
|
completion_reason: Annotated[
|
|
22
|
-
|
|
27
|
+
str | None, "Reason why the subgoal was completed (failure or success)"
|
|
23
28
|
] = None
|
|
24
29
|
status: SubgoalStatus
|
|
25
30
|
|
|
@@ -35,7 +40,7 @@ class Subgoal(BaseModel):
|
|
|
35
40
|
case SubgoalStatus.NOT_STARTED:
|
|
36
41
|
status_emoji = "(not started yet)"
|
|
37
42
|
|
|
38
|
-
output = f"- {self.description} : {status_emoji}."
|
|
43
|
+
output = f"- [ID:{self.id}]: {self.description} : {status_emoji}."
|
|
39
44
|
if self.completion_reason:
|
|
40
45
|
output += f" Completion reason: {self.completion_reason}"
|
|
41
46
|
return output
|
|
@@ -5,6 +5,10 @@ def get_current_subgoal(subgoals: list[Subgoal]) -> Subgoal | None:
|
|
|
5
5
|
return next((s for s in subgoals if s.status == SubgoalStatus.PENDING), None)
|
|
6
6
|
|
|
7
7
|
|
|
8
|
+
def get_subgoals_by_ids(subgoals: list[Subgoal], ids: list[str]) -> list[Subgoal]:
|
|
9
|
+
return [s for s in subgoals if s.id in ids]
|
|
10
|
+
|
|
11
|
+
|
|
8
12
|
def get_next_subgoal(subgoals: list[Subgoal]) -> Subgoal | None:
|
|
9
13
|
return next((s for s in subgoals if s.status == SubgoalStatus.NOT_STARTED), None)
|
|
10
14
|
|
|
@@ -21,6 +25,13 @@ def complete_current_subgoal(subgoals: list[Subgoal]) -> list[Subgoal]:
|
|
|
21
25
|
return subgoals
|
|
22
26
|
|
|
23
27
|
|
|
28
|
+
def complete_subgoals_by_ids(subgoals: list[Subgoal], ids: list[str]) -> list[Subgoal]:
|
|
29
|
+
for subgoal in subgoals:
|
|
30
|
+
if subgoal.id in ids:
|
|
31
|
+
subgoal.status = SubgoalStatus.SUCCESS
|
|
32
|
+
return subgoals
|
|
33
|
+
|
|
34
|
+
|
|
24
35
|
def fail_current_subgoal(subgoals: list[Subgoal]) -> list[Subgoal]:
|
|
25
36
|
current_subgoal = get_current_subgoal(subgoals)
|
|
26
37
|
if not current_subgoal:
|
|
@@ -3,6 +3,7 @@ from langchain_core.messages import (
|
|
|
3
3
|
RemoveMessage,
|
|
4
4
|
ToolMessage,
|
|
5
5
|
)
|
|
6
|
+
|
|
6
7
|
from minitap.mobile_use.constants import MAX_MESSAGES_IN_HISTORY
|
|
7
8
|
from minitap.mobile_use.context import MobileUseContext
|
|
8
9
|
from minitap.mobile_use.graph.state import State
|
|
@@ -22,7 +23,7 @@ class SummarizerNode:
|
|
|
22
23
|
start_removal = False
|
|
23
24
|
|
|
24
25
|
for msg in reversed(state.messages[:nb_removal_candidates]):
|
|
25
|
-
if isinstance(msg,
|
|
26
|
+
if isinstance(msg, ToolMessage | HumanMessage):
|
|
26
27
|
start_removal = True
|
|
27
28
|
if start_removal and msg.id:
|
|
28
29
|
remove_messages.append(RemoveMessage(id=msg.id))
|
|
@@ -12,6 +12,9 @@ class DeviceHardwareClient:
|
|
|
12
12
|
url = urljoin(self.base_url, f"/api/{path.lstrip('/')}")
|
|
13
13
|
return self.session.get(url, **kwargs)
|
|
14
14
|
|
|
15
|
+
def get_rich_hierarchy(self) -> list[dict]:
|
|
16
|
+
return self.get("last-view-hierarchy").json().get("children", [])
|
|
17
|
+
|
|
15
18
|
def post(self, path: str, **kwargs):
|
|
16
19
|
url = urljoin(self.base_url, f"/api/{path.lstrip('/')}")
|
|
17
20
|
return self.session.post(url, **kwargs)
|
minitap/mobile_use/config.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import json
|
|
2
2
|
import os
|
|
3
3
|
from pathlib import Path
|
|
4
|
-
from typing import Annotated, Any, Literal
|
|
4
|
+
from typing import Annotated, Any, Literal
|
|
5
5
|
|
|
6
6
|
from dotenv import load_dotenv
|
|
7
7
|
from pydantic import BaseModel, Field, SecretStr, ValidationError, model_validator
|
|
@@ -17,15 +17,17 @@ logger = get_logger(__name__)
|
|
|
17
17
|
|
|
18
18
|
|
|
19
19
|
class Settings(BaseSettings):
|
|
20
|
-
OPENAI_API_KEY:
|
|
21
|
-
GOOGLE_API_KEY:
|
|
22
|
-
XAI_API_KEY:
|
|
23
|
-
OPEN_ROUTER_API_KEY:
|
|
20
|
+
OPENAI_API_KEY: SecretStr | None = None
|
|
21
|
+
GOOGLE_API_KEY: SecretStr | None = None
|
|
22
|
+
XAI_API_KEY: SecretStr | None = None
|
|
23
|
+
OPEN_ROUTER_API_KEY: SecretStr | None = None
|
|
24
24
|
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
25
|
+
OPENAI_BASE_URL: str | None = None
|
|
26
|
+
|
|
27
|
+
DEVICE_SCREEN_API_BASE_URL: str | None = None
|
|
28
|
+
DEVICE_HARDWARE_BRIDGE_BASE_URL: str | None = None
|
|
29
|
+
ADB_HOST: str | None = None
|
|
30
|
+
ADB_PORT: int | None = None
|
|
29
31
|
|
|
30
32
|
model_config = {"env_file": ".env", "extra": "ignore"}
|
|
31
33
|
|
|
@@ -69,7 +71,7 @@ def prepare_output_files() -> tuple[str | None, str | None]:
|
|
|
69
71
|
return validated_events_path, validated_results_path
|
|
70
72
|
|
|
71
73
|
|
|
72
|
-
def record_events(output_path: Path | None, events:
|
|
74
|
+
def record_events(output_path: Path | None, events: list[str] | BaseModel | Any):
|
|
73
75
|
if not output_path:
|
|
74
76
|
return
|
|
75
77
|
|
|
@@ -168,7 +170,7 @@ def get_default_llm_config() -> LLMConfig:
|
|
|
168
170
|
try:
|
|
169
171
|
if not os.path.exists(ROOT_DIR / DEFAULT_LLM_CONFIG_FILENAME):
|
|
170
172
|
raise Exception("Default llm config not found")
|
|
171
|
-
with open(ROOT_DIR / DEFAULT_LLM_CONFIG_FILENAME
|
|
173
|
+
with open(ROOT_DIR / DEFAULT_LLM_CONFIG_FILENAME) as f:
|
|
172
174
|
default_config_dict = load_jsonc(f)
|
|
173
175
|
return LLMConfig.model_validate(default_config_dict["default"])
|
|
174
176
|
except Exception as e:
|
|
@@ -209,7 +211,7 @@ def parse_llm_config() -> LLMConfig:
|
|
|
209
211
|
override_config_dict = {}
|
|
210
212
|
if os.path.exists(ROOT_DIR / OVERRIDE_LLM_CONFIG_FILENAME):
|
|
211
213
|
logger.info("Loading custom llm config...")
|
|
212
|
-
with open(ROOT_DIR / OVERRIDE_LLM_CONFIG_FILENAME
|
|
214
|
+
with open(ROOT_DIR / OVERRIDE_LLM_CONFIG_FILENAME) as f:
|
|
213
215
|
override_config_dict = load_jsonc(f)
|
|
214
216
|
else:
|
|
215
217
|
logger.warning("Custom llm config not found, loading default config")
|
|
@@ -235,7 +237,7 @@ def initialize_llm_config() -> LLMConfig:
|
|
|
235
237
|
|
|
236
238
|
class OutputConfig(BaseModel):
|
|
237
239
|
structured_output: Annotated[
|
|
238
|
-
|
|
240
|
+
type[BaseModel] | dict | None,
|
|
239
241
|
Field(
|
|
240
242
|
default=None,
|
|
241
243
|
description=(
|
|
@@ -245,7 +247,7 @@ class OutputConfig(BaseModel):
|
|
|
245
247
|
),
|
|
246
248
|
]
|
|
247
249
|
output_description: Annotated[
|
|
248
|
-
|
|
250
|
+
str | None,
|
|
249
251
|
Field(
|
|
250
252
|
default=None,
|
|
251
253
|
description=(
|
minitap/mobile_use/constants.py
CHANGED
minitap/mobile_use/context.py
CHANGED
|
@@ -6,12 +6,11 @@ Uses ContextVar to avoid prop drilling and maintain clean function signatures.
|
|
|
6
6
|
|
|
7
7
|
from enum import Enum
|
|
8
8
|
from pathlib import Path
|
|
9
|
-
from typing import Optional
|
|
10
9
|
|
|
11
10
|
from adbutils import AdbClient
|
|
12
11
|
from openai import BaseModel
|
|
13
12
|
from pydantic import ConfigDict
|
|
14
|
-
from
|
|
13
|
+
from typing import Literal
|
|
15
14
|
|
|
16
15
|
from minitap.mobile_use.clients.device_hardware_client import DeviceHardwareClient
|
|
17
16
|
from minitap.mobile_use.clients.screen_api_client import ScreenApiClient
|
|
@@ -56,8 +55,8 @@ class MobileUseContext(BaseModel):
|
|
|
56
55
|
hw_bridge_client: DeviceHardwareClient
|
|
57
56
|
screen_api_client: ScreenApiClient
|
|
58
57
|
llm_config: LLMConfig
|
|
59
|
-
adb_client:
|
|
60
|
-
execution_setup:
|
|
58
|
+
adb_client: AdbClient | None = None
|
|
59
|
+
execution_setup: ExecutionSetup | None = None
|
|
61
60
|
|
|
62
61
|
def get_adb_client(self) -> AdbClient:
|
|
63
62
|
if self.adb_client is None:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import uuid
|
|
2
2
|
from enum import Enum
|
|
3
|
-
from typing import Annotated, Literal
|
|
3
|
+
from typing import Annotated, Literal
|
|
4
4
|
|
|
5
5
|
import yaml
|
|
6
6
|
from langgraph.types import Command
|
|
@@ -9,6 +9,7 @@ from requests import JSONDecodeError
|
|
|
9
9
|
|
|
10
10
|
from minitap.mobile_use.clients.device_hardware_client import DeviceHardwareClient
|
|
11
11
|
from minitap.mobile_use.clients.screen_api_client import ScreenApiClient
|
|
12
|
+
from minitap.mobile_use.config import initialize_llm_config
|
|
12
13
|
from minitap.mobile_use.context import DeviceContext, DevicePlatform, MobileUseContext
|
|
13
14
|
from minitap.mobile_use.utils.errors import ControllerErrors
|
|
14
15
|
from minitap.mobile_use.utils.logger import get_logger
|
|
@@ -42,7 +43,7 @@ class RunFlowRequest(BaseModel):
|
|
|
42
43
|
dry_run: bool = Field(default=False, alias="dryRun")
|
|
43
44
|
|
|
44
45
|
|
|
45
|
-
def run_flow(ctx: MobileUseContext, flow_steps: list, dry_run: bool = False) ->
|
|
46
|
+
def run_flow(ctx: MobileUseContext, flow_steps: list, dry_run: bool = False) -> dict | None:
|
|
46
47
|
"""
|
|
47
48
|
Run a flow i.e, a sequence of commands.
|
|
48
49
|
Returns None on success, or the response body of the failed command.
|
|
@@ -136,20 +137,20 @@ class SelectorRequestWithPercentages(BaseModel):
|
|
|
136
137
|
return {"point": self.percentages.to_str()}
|
|
137
138
|
|
|
138
139
|
|
|
139
|
-
SelectorRequest =
|
|
140
|
-
IdSelectorRequest
|
|
141
|
-
SelectorRequestWithCoordinates
|
|
142
|
-
SelectorRequestWithPercentages
|
|
143
|
-
TextSelectorRequest
|
|
144
|
-
IdWithTextSelectorRequest
|
|
145
|
-
|
|
140
|
+
SelectorRequest = (
|
|
141
|
+
IdSelectorRequest
|
|
142
|
+
| SelectorRequestWithCoordinates
|
|
143
|
+
| SelectorRequestWithPercentages
|
|
144
|
+
| TextSelectorRequest
|
|
145
|
+
| IdWithTextSelectorRequest
|
|
146
|
+
)
|
|
146
147
|
|
|
147
148
|
|
|
148
149
|
def tap(
|
|
149
150
|
ctx: MobileUseContext,
|
|
150
151
|
selector_request: SelectorRequest,
|
|
151
152
|
dry_run: bool = False,
|
|
152
|
-
index:
|
|
153
|
+
index: int | None = None,
|
|
153
154
|
):
|
|
154
155
|
"""
|
|
155
156
|
Tap on a selector.
|
|
@@ -170,7 +171,7 @@ def long_press_on(
|
|
|
170
171
|
ctx: MobileUseContext,
|
|
171
172
|
selector_request: SelectorRequest,
|
|
172
173
|
dry_run: bool = False,
|
|
173
|
-
index:
|
|
174
|
+
index: int | None = None,
|
|
174
175
|
):
|
|
175
176
|
long_press_on_body = selector_request.to_dict()
|
|
176
177
|
if not long_press_on_body:
|
|
@@ -210,7 +211,7 @@ SwipeDirection = Annotated[
|
|
|
210
211
|
class SwipeRequest(BaseModel):
|
|
211
212
|
model_config = ConfigDict(extra="forbid")
|
|
212
213
|
swipe_mode: SwipeStartEndCoordinatesRequest | SwipeStartEndPercentagesRequest | SwipeDirection
|
|
213
|
-
duration:
|
|
214
|
+
duration: int | None = None # in ms, default is 400ms
|
|
214
215
|
|
|
215
216
|
def to_dict(self):
|
|
216
217
|
res = {}
|
|
@@ -256,7 +257,7 @@ def paste_text(ctx: MobileUseContext, dry_run: bool = False):
|
|
|
256
257
|
return run_flow(ctx, ["pasteText"], dry_run=dry_run)
|
|
257
258
|
|
|
258
259
|
|
|
259
|
-
def erase_text(ctx: MobileUseContext, nb_chars:
|
|
260
|
+
def erase_text(ctx: MobileUseContext, nb_chars: int | None = None, dry_run: bool = False):
|
|
260
261
|
"""
|
|
261
262
|
Removes characters from the currently selected textfield (if any)
|
|
262
263
|
Removes 50 characters if nb_chars is not specified.
|
|
@@ -274,7 +275,7 @@ def launch_app(ctx: MobileUseContext, package_name: str, dry_run: bool = False):
|
|
|
274
275
|
return run_flow_with_wait_for_animation_to_end(ctx, flow_input, dry_run=dry_run)
|
|
275
276
|
|
|
276
277
|
|
|
277
|
-
def stop_app(ctx: MobileUseContext, package_name:
|
|
278
|
+
def stop_app(ctx: MobileUseContext, package_name: str | None = None, dry_run: bool = False):
|
|
278
279
|
if package_name is None:
|
|
279
280
|
flow_input = ["stopApp"]
|
|
280
281
|
else:
|
|
@@ -316,7 +317,7 @@ class WaitTimeout(Enum):
|
|
|
316
317
|
|
|
317
318
|
|
|
318
319
|
def wait_for_animation_to_end(
|
|
319
|
-
ctx: MobileUseContext, timeout:
|
|
320
|
+
ctx: MobileUseContext, timeout: WaitTimeout | None = None, dry_run: bool = False
|
|
320
321
|
):
|
|
321
322
|
if timeout is None:
|
|
322
323
|
return run_flow(ctx, ["waitForAnimationToEnd"], dry_run=dry_run)
|
|
@@ -331,12 +332,10 @@ def run_flow_with_wait_for_animation_to_end(
|
|
|
331
332
|
|
|
332
333
|
|
|
333
334
|
if __name__ == "__main__":
|
|
334
|
-
# long press, erase
|
|
335
|
-
# input_text(text="test")
|
|
336
|
-
# erase_text()
|
|
337
335
|
ctx = MobileUseContext(
|
|
336
|
+
llm_config=initialize_llm_config(),
|
|
338
337
|
device=DeviceContext(
|
|
339
|
-
host_platform="
|
|
338
|
+
host_platform="WINDOWS",
|
|
340
339
|
mobile_platform=DevicePlatform.ANDROID,
|
|
341
340
|
device_id="emulator-5554",
|
|
342
341
|
device_width=1080,
|
|
@@ -347,7 +346,6 @@ if __name__ == "__main__":
|
|
|
347
346
|
)
|
|
348
347
|
screen_data = get_screen_data(ctx.screen_api_client)
|
|
349
348
|
from minitap.mobile_use.graph.state import State
|
|
350
|
-
from minitap.mobile_use.tools.mobile.erase_text import get_erase_text_tool
|
|
351
349
|
|
|
352
350
|
dummy_state = State(
|
|
353
351
|
latest_ui_hierarchy=screen_data.elements,
|
|
@@ -358,20 +356,33 @@ if __name__ == "__main__":
|
|
|
358
356
|
focused_app_info=None,
|
|
359
357
|
device_date="",
|
|
360
358
|
structured_decisions=None,
|
|
361
|
-
|
|
362
|
-
executor_failed=False,
|
|
359
|
+
complete_subgoals_by_ids=[],
|
|
363
360
|
executor_messages=[],
|
|
364
361
|
cortex_last_thought="",
|
|
365
362
|
agents_thoughts=[],
|
|
366
363
|
)
|
|
367
364
|
|
|
368
|
-
#
|
|
369
|
-
|
|
370
|
-
|
|
365
|
+
# from minitap.mobile_use.tools.mobile.input_text import get_input_text_tool
|
|
366
|
+
|
|
367
|
+
# input_resource_id = "com.google.android.apps.nexuslauncher:id/search_container_hotseat"
|
|
368
|
+
# command_output: Command = get_input_text_tool(ctx=ctx).invoke(
|
|
369
|
+
# {
|
|
370
|
+
# "tool_call_id": uuid.uuid4().hex,
|
|
371
|
+
# "agent_thought": "",
|
|
372
|
+
# "text_input_resource_id": input_resource_id,
|
|
373
|
+
# "text": "Hello World",
|
|
374
|
+
# "state": dummy_state,
|
|
375
|
+
# "executor_metadata": None,
|
|
376
|
+
# }
|
|
377
|
+
# )
|
|
378
|
+
from minitap.mobile_use.tools.mobile.clear_text import get_clear_text_tool
|
|
379
|
+
|
|
380
|
+
input_resource_id = "com.google.android.apps.nexuslauncher:id/input"
|
|
381
|
+
command_output: Command = get_clear_text_tool(ctx=ctx).invoke(
|
|
371
382
|
{
|
|
372
383
|
"tool_call_id": uuid.uuid4().hex,
|
|
373
384
|
"agent_thought": "",
|
|
374
|
-
"
|
|
385
|
+
"text_input_resource_id": input_resource_id,
|
|
375
386
|
"state": dummy_state,
|
|
376
387
|
"executor_metadata": None,
|
|
377
388
|
}
|
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
from datetime import date
|
|
2
2
|
import json
|
|
3
|
-
from typing import Optional
|
|
4
3
|
|
|
5
4
|
from adbutils import AdbDevice
|
|
6
5
|
from minitap.mobile_use.utils.logger import MobileUseLogger
|
|
@@ -20,8 +19,8 @@ def get_adb_device(ctx: MobileUseContext) -> AdbDevice:
|
|
|
20
19
|
|
|
21
20
|
|
|
22
21
|
def get_first_device(
|
|
23
|
-
logger:
|
|
24
|
-
) -> tuple[
|
|
22
|
+
logger: MobileUseLogger | None = None,
|
|
23
|
+
) -> tuple[str | None, DevicePlatform | None]:
|
|
25
24
|
"""Gets the first available device."""
|
|
26
25
|
try:
|
|
27
26
|
android_output = run_shell_command_on_host("adb devices")
|
|
@@ -50,7 +49,7 @@ def get_first_device(
|
|
|
50
49
|
return None, None
|
|
51
50
|
|
|
52
51
|
|
|
53
|
-
def get_focused_app_info(ctx: MobileUseContext) ->
|
|
52
|
+
def get_focused_app_info(ctx: MobileUseContext) -> str | None:
|
|
54
53
|
if ctx.device.mobile_platform == DevicePlatform.IOS:
|
|
55
54
|
return None
|
|
56
55
|
device = get_adb_device(ctx)
|
|
@@ -6,13 +6,11 @@ from langchain_core.messages import (
|
|
|
6
6
|
from langgraph.constants import END, START
|
|
7
7
|
from langgraph.graph import StateGraph
|
|
8
8
|
from langgraph.graph.state import CompiledStateGraph
|
|
9
|
-
|
|
9
|
+
|
|
10
10
|
from minitap.mobile_use.agents.contextor.contextor import ContextorNode
|
|
11
11
|
from minitap.mobile_use.agents.cortex.cortex import CortexNode
|
|
12
12
|
from minitap.mobile_use.agents.executor.executor import ExecutorNode
|
|
13
|
-
from minitap.mobile_use.agents.executor.
|
|
14
|
-
executor_context_cleaner_node,
|
|
15
|
-
)
|
|
13
|
+
from minitap.mobile_use.agents.executor.tool_node import ExecutorToolNode
|
|
16
14
|
from minitap.mobile_use.agents.orchestrator.orchestrator import OrchestratorNode
|
|
17
15
|
from minitap.mobile_use.agents.planner.planner import PlannerNode
|
|
18
16
|
from minitap.mobile_use.agents.planner.utils import (
|
|
@@ -21,6 +19,7 @@ from minitap.mobile_use.agents.planner.utils import (
|
|
|
21
19
|
one_of_them_is_failure,
|
|
22
20
|
)
|
|
23
21
|
from minitap.mobile_use.agents.summarizer.summarizer import SummarizerNode
|
|
22
|
+
from minitap.mobile_use.constants import EXECUTOR_MESSAGES_KEY
|
|
24
23
|
from minitap.mobile_use.context import MobileUseContext
|
|
25
24
|
from minitap.mobile_use.graph.state import State
|
|
26
25
|
from minitap.mobile_use.tools.index import EXECUTOR_WRAPPERS_TOOLS, get_tools_from_wrappers
|
|
@@ -53,7 +52,7 @@ def post_cortex_gate(
|
|
|
53
52
|
state: State,
|
|
54
53
|
) -> Literal["continue", "end_subgoal"]:
|
|
55
54
|
logger.info("Starting post_cortex_gate")
|
|
56
|
-
if
|
|
55
|
+
if len(state.complete_subgoals_by_ids) > 0:
|
|
57
56
|
return "end_subgoal"
|
|
58
57
|
return "continue"
|
|
59
58
|
|
|
@@ -62,7 +61,7 @@ def post_executor_gate(
|
|
|
62
61
|
state: State,
|
|
63
62
|
) -> Literal["invoke_tools", "skip"]:
|
|
64
63
|
logger.info("Starting post_executor_gate")
|
|
65
|
-
messages = state.
|
|
64
|
+
messages = state.executor_messages
|
|
66
65
|
if not messages:
|
|
67
66
|
return "skip"
|
|
68
67
|
last_message = messages[-1]
|
|
@@ -77,17 +76,6 @@ def post_executor_gate(
|
|
|
77
76
|
return "skip"
|
|
78
77
|
|
|
79
78
|
|
|
80
|
-
def post_executor_tools_gate(
|
|
81
|
-
state: State,
|
|
82
|
-
) -> Literal["continue", "failed", "done"]:
|
|
83
|
-
logger.info("Starting post_executor_tools_gate")
|
|
84
|
-
if state.executor_failed:
|
|
85
|
-
return "failed"
|
|
86
|
-
if state.executor_retrigger:
|
|
87
|
-
return "continue"
|
|
88
|
-
return "done"
|
|
89
|
-
|
|
90
|
-
|
|
91
79
|
async def get_graph(ctx: MobileUseContext) -> CompiledStateGraph:
|
|
92
80
|
graph_builder = StateGraph(State)
|
|
93
81
|
|
|
@@ -100,12 +88,12 @@ async def get_graph(ctx: MobileUseContext) -> CompiledStateGraph:
|
|
|
100
88
|
graph_builder.add_node("cortex", CortexNode(ctx))
|
|
101
89
|
|
|
102
90
|
graph_builder.add_node("executor", ExecutorNode(ctx))
|
|
103
|
-
executor_tool_node =
|
|
104
|
-
get_tools_from_wrappers(ctx=ctx, wrappers=EXECUTOR_WRAPPERS_TOOLS)
|
|
91
|
+
executor_tool_node = ExecutorToolNode(
|
|
92
|
+
tools=get_tools_from_wrappers(ctx=ctx, wrappers=EXECUTOR_WRAPPERS_TOOLS),
|
|
93
|
+
messages_key=EXECUTOR_MESSAGES_KEY,
|
|
105
94
|
)
|
|
106
95
|
graph_builder.add_node("executor_tools", executor_tool_node)
|
|
107
96
|
|
|
108
|
-
graph_builder.add_node("executor_context_cleaner", executor_context_cleaner_node)
|
|
109
97
|
graph_builder.add_node("summarizer", SummarizerNode(ctx))
|
|
110
98
|
|
|
111
99
|
# Linking nodes
|
|
@@ -132,18 +120,9 @@ async def get_graph(ctx: MobileUseContext) -> CompiledStateGraph:
|
|
|
132
120
|
graph_builder.add_conditional_edges(
|
|
133
121
|
"executor",
|
|
134
122
|
post_executor_gate,
|
|
135
|
-
{"invoke_tools": "executor_tools", "skip": "
|
|
136
|
-
)
|
|
137
|
-
graph_builder.add_conditional_edges(
|
|
138
|
-
"executor_tools",
|
|
139
|
-
post_executor_tools_gate,
|
|
140
|
-
{
|
|
141
|
-
"continue": "executor",
|
|
142
|
-
"done": "executor_context_cleaner",
|
|
143
|
-
"failed": "executor_context_cleaner",
|
|
144
|
-
},
|
|
123
|
+
{"invoke_tools": "executor_tools", "skip": "summarizer"},
|
|
145
124
|
)
|
|
146
|
-
graph_builder.add_edge("
|
|
125
|
+
graph_builder.add_edge("executor_tools", "summarizer")
|
|
147
126
|
graph_builder.add_edge("summarizer", "contextor")
|
|
148
127
|
|
|
149
128
|
return graph_builder.compile()
|