minitap-mobile-use 2.0.1__py3-none-any.whl → 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of minitap-mobile-use might be problematic. Click here for more details.
- minitap/mobile_use/agents/cortex/cortex.md +7 -5
- minitap/mobile_use/agents/cortex/cortex.py +4 -1
- minitap/mobile_use/agents/cortex/types.py +1 -3
- minitap/mobile_use/agents/executor/executor.md +4 -5
- minitap/mobile_use/agents/executor/tool_node.py +6 -6
- minitap/mobile_use/agents/outputter/outputter.py +1 -2
- minitap/mobile_use/agents/planner/planner.md +11 -2
- minitap/mobile_use/agents/planner/planner.py +4 -1
- minitap/mobile_use/agents/planner/types.py +3 -4
- minitap/mobile_use/agents/summarizer/summarizer.py +2 -1
- minitap/mobile_use/config.py +15 -15
- minitap/mobile_use/context.py +3 -4
- minitap/mobile_use/controllers/mobile_command_controller.py +32 -20
- minitap/mobile_use/controllers/platform_specific_commands_controller.py +3 -4
- minitap/mobile_use/graph/graph.py +1 -0
- minitap/mobile_use/graph/state.py +9 -9
- minitap/mobile_use/main.py +5 -6
- minitap/mobile_use/sdk/agent.py +24 -24
- minitap/mobile_use/sdk/builders/agent_config_builder.py +7 -8
- minitap/mobile_use/sdk/builders/task_request_builder.py +9 -9
- minitap/mobile_use/sdk/examples/smart_notification_assistant.py +1 -2
- minitap/mobile_use/sdk/types/agent.py +5 -5
- minitap/mobile_use/sdk/types/task.py +19 -18
- minitap/mobile_use/sdk/utils.py +1 -1
- minitap/mobile_use/servers/config.py +1 -2
- minitap/mobile_use/servers/device_hardware_bridge.py +3 -4
- minitap/mobile_use/servers/start_servers.py +4 -4
- minitap/mobile_use/servers/stop_servers.py +2 -3
- minitap/mobile_use/services/llm.py +3 -2
- minitap/mobile_use/tools/index.py +10 -4
- minitap/mobile_use/tools/mobile/back.py +1 -1
- minitap/mobile_use/tools/mobile/clear_text.py +277 -0
- minitap/mobile_use/tools/mobile/copy_text_from.py +1 -1
- minitap/mobile_use/tools/mobile/erase_one_char.py +56 -0
- minitap/mobile_use/tools/mobile/find_packages.py +1 -1
- minitap/mobile_use/tools/mobile/input_text.py +4 -80
- minitap/mobile_use/tools/mobile/launch_app.py +1 -1
- minitap/mobile_use/tools/mobile/long_press_on.py +2 -4
- minitap/mobile_use/tools/mobile/open_link.py +1 -1
- minitap/mobile_use/tools/mobile/paste_text.py +1 -1
- minitap/mobile_use/tools/mobile/press_key.py +1 -1
- minitap/mobile_use/tools/mobile/stop_app.py +2 -4
- minitap/mobile_use/tools/mobile/swipe.py +1 -1
- minitap/mobile_use/tools/mobile/take_screenshot.py +1 -1
- minitap/mobile_use/tools/mobile/tap.py +2 -4
- minitap/mobile_use/tools/mobile/wait_for_animation_to_end.py +2 -4
- minitap/mobile_use/tools/tool_wrapper.py +1 -1
- minitap/mobile_use/tools/utils.py +86 -0
- minitap/mobile_use/utils/cli_helpers.py +1 -2
- minitap/mobile_use/utils/cli_selection.py +5 -6
- minitap/mobile_use/utils/decorators.py +21 -20
- minitap/mobile_use/utils/logger.py +3 -4
- minitap/mobile_use/utils/media.py +1 -1
- minitap/mobile_use/utils/ui_hierarchy.py +13 -5
- {minitap_mobile_use-2.0.1.dist-info → minitap_mobile_use-2.1.0.dist-info}/METADATA +11 -1
- minitap_mobile_use-2.1.0.dist-info/RECORD +96 -0
- minitap/mobile_use/tools/mobile/erase_text.py +0 -122
- minitap_mobile_use-2.0.1.dist-info/RECORD +0 -94
- {minitap_mobile_use-2.0.1.dist-info → minitap_mobile_use-2.1.0.dist-info}/WHEEL +0 -0
- {minitap_mobile_use-2.0.1.dist-info → minitap_mobile_use-2.1.0.dist-info}/entry_points.txt +0 -0
|
@@ -35,17 +35,19 @@ Focus on the **current PENDING subgoal and the next subgoals not yet started**.
|
|
|
35
35
|
- Past agent thoughts
|
|
36
36
|
- Recent tool effects
|
|
37
37
|
|
|
38
|
-
2.2. Otherwise, output a **stringified structured set of instructions** that an **Executor agent** can perform on a real mobile device:
|
|
38
|
+
2.2. Otherwise, output a **stringified structured set of instructions** that an **Executor agent** can perform on a real mobile device:
|
|
39
39
|
|
|
40
|
-
- These must be **concrete low-level actions
|
|
41
|
-
-
|
|
42
|
-
-
|
|
40
|
+
- These must be **concrete low-level actions**.
|
|
41
|
+
- The executor has the following available tools: **{{ executor_tools_list }}**.
|
|
42
|
+
- Your goal is to achieve subgoals **fast** - so you must put as many actions as possible in your instructions to complete all achievable subgoals (based on your observations) in one go.
|
|
43
|
+
- To open URLs/links directly, use the `open_link` tool - it will automatically handle opening in the appropriate browser. It also handles deep links.
|
|
44
|
+
- When you need to open an app, use the `find_packages` low-level action to try and get its name. Then, simply use the `launch_app` low-level action to launch it.
|
|
43
45
|
- If you refer to a UI element or coordinates, specify it clearly (e.g., `resource-id: com.whatsapp:id/search`, `text: "Alice"`, `x: 100, y: 200`).
|
|
44
46
|
- **The structure is up to you**, but it must be valid **JSON stringified output**. You will accompany this output with a **natural-language summary** of your reasoning and approach in your agent thought.
|
|
45
47
|
- **Never use a sequence of `tap` + `input_text` to type into a field. Always use a single `input_text` action** with the correct `resource_id` (this already ensures the element is focused and the cursor is moved to the end).
|
|
46
48
|
- When you want to launch/stop an app, prefer using its package name.
|
|
47
49
|
- **Only reference UI element IDs or visible texts that are explicitly present in the provided UI hierarchy or screenshot. Do not invent, infer, or guess any IDs or texts that are not directly observed**.
|
|
48
|
-
- **For text clearing**: When you need to completely clear text from an input field, always
|
|
50
|
+
- **For text clearing**: When you need to completely clear text from an input field, always call the `clear_text` tool with the correct resource_id. This tool automatically focuses the element, and ensures the field is emptied. If you notice this tool fails to clear the text, try to long press the input, select all, and call `erase_one_char`.
|
|
49
51
|
|
|
50
52
|
### Output
|
|
51
53
|
|
|
@@ -10,12 +10,14 @@ from langchain_core.messages import (
|
|
|
10
10
|
ToolMessage,
|
|
11
11
|
)
|
|
12
12
|
from langgraph.graph.message import REMOVE_ALL_MESSAGES
|
|
13
|
+
|
|
13
14
|
from minitap.mobile_use.agents.cortex.types import CortexOutput
|
|
14
15
|
from minitap.mobile_use.agents.planner.utils import get_current_subgoal
|
|
15
16
|
from minitap.mobile_use.constants import EXECUTOR_MESSAGES_KEY
|
|
16
17
|
from minitap.mobile_use.context import MobileUseContext
|
|
17
18
|
from minitap.mobile_use.graph.state import State
|
|
18
19
|
from minitap.mobile_use.services.llm import get_llm, with_fallback
|
|
20
|
+
from minitap.mobile_use.tools.index import EXECUTOR_WRAPPERS_TOOLS, format_tools_list
|
|
19
21
|
from minitap.mobile_use.utils.conversations import get_screenshot_message_for_llm
|
|
20
22
|
from minitap.mobile_use.utils.decorators import wrap_with_callbacks
|
|
21
23
|
from minitap.mobile_use.utils.logger import get_logger
|
|
@@ -44,6 +46,7 @@ class CortexNode:
|
|
|
44
46
|
current_subgoal=get_current_subgoal(state.subgoal_plan),
|
|
45
47
|
agents_thoughts=state.agents_thoughts,
|
|
46
48
|
executor_feedback=executor_feedback,
|
|
49
|
+
executor_tools_list=format_tools_list(self.ctx, EXECUTOR_WRAPPERS_TOOLS),
|
|
47
50
|
)
|
|
48
51
|
messages = [
|
|
49
52
|
SystemMessage(content=system_message),
|
|
@@ -83,7 +86,7 @@ class CortexNode:
|
|
|
83
86
|
is_subgoal_completed = (
|
|
84
87
|
response.complete_subgoals_by_ids is not None
|
|
85
88
|
and len(response.complete_subgoals_by_ids) > 0
|
|
86
|
-
and len(response.decisions) == 0
|
|
89
|
+
and (len(response.decisions) == 0 or response.decisions in ["{}", "[]", "null", ""])
|
|
87
90
|
)
|
|
88
91
|
if not is_subgoal_completed:
|
|
89
92
|
response.complete_subgoals_by_ids = []
|
|
@@ -1,11 +1,9 @@
|
|
|
1
|
-
from typing import Optional
|
|
2
|
-
|
|
3
1
|
from pydantic import BaseModel, Field
|
|
4
2
|
|
|
5
3
|
|
|
6
4
|
class CortexOutput(BaseModel):
|
|
7
5
|
decisions: str = Field(..., description="The decisions to be made. A stringified JSON object")
|
|
8
6
|
agent_thought: str = Field(..., description="The agent's thought")
|
|
9
|
-
complete_subgoals_by_ids:
|
|
7
|
+
complete_subgoals_by_ids: list[str] | None = Field(
|
|
10
8
|
[], description="List of subgoal IDs to complete"
|
|
11
9
|
)
|
|
@@ -64,14 +64,13 @@ When using the `input_text` tool:
|
|
|
64
64
|
|
|
65
65
|
#### 🔄 Text Clearing Best Practice
|
|
66
66
|
|
|
67
|
-
When you need to completely clear text from an input field,
|
|
67
|
+
When you need to completely clear text from an input field, always use the clear_text tool with the correct resource_id.
|
|
68
68
|
|
|
69
|
-
|
|
70
|
-
2. **Then use `erase_text`** to clear the selected content
|
|
69
|
+
This tool automatically takes care of focusing the element (if needed), and ensuring the field is fully emptied.
|
|
71
70
|
|
|
72
|
-
|
|
71
|
+
If, and only if, the clear_text tool fails to clear the text, try to long press the input, select all, and call erase_one_char.
|
|
73
72
|
|
|
74
|
-
|
|
73
|
+
#### 🔁 Final Notes
|
|
75
74
|
|
|
76
75
|
- **You do not need to reason or decide strategy** — that's the Cortex's job.
|
|
77
76
|
- You simply interpret and execute — like hands following the brain.
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
import asyncio
|
|
2
|
-
from typing import Any
|
|
2
|
+
from typing import Any
|
|
3
3
|
from langgraph.types import Command
|
|
4
4
|
from pydantic import BaseModel
|
|
5
|
-
from
|
|
5
|
+
from typing import override
|
|
6
6
|
from langchain_core.runnables import RunnableConfig
|
|
7
7
|
from langgraph.store.base import BaseStore
|
|
8
8
|
from langchain_core.messages import AnyMessage, ToolCall, ToolMessage
|
|
@@ -21,7 +21,7 @@ class ExecutorToolNode(ToolNode):
|
|
|
21
21
|
input: list[AnyMessage] | dict[str, Any] | BaseModel,
|
|
22
22
|
config: RunnableConfig,
|
|
23
23
|
*,
|
|
24
|
-
store:
|
|
24
|
+
store: BaseStore | None,
|
|
25
25
|
):
|
|
26
26
|
return await self.__func(is_async=True, input=input, config=config, store=store)
|
|
27
27
|
|
|
@@ -31,7 +31,7 @@ class ExecutorToolNode(ToolNode):
|
|
|
31
31
|
input: list[AnyMessage] | dict[str, Any] | BaseModel,
|
|
32
32
|
config: RunnableConfig,
|
|
33
33
|
*,
|
|
34
|
-
store:
|
|
34
|
+
store: BaseStore | None,
|
|
35
35
|
) -> Any:
|
|
36
36
|
loop = asyncio.get_event_loop()
|
|
37
37
|
return loop.run_until_complete(
|
|
@@ -44,7 +44,7 @@ class ExecutorToolNode(ToolNode):
|
|
|
44
44
|
input: list[AnyMessage] | dict[str, Any] | BaseModel,
|
|
45
45
|
config: RunnableConfig,
|
|
46
46
|
*,
|
|
47
|
-
store:
|
|
47
|
+
store: BaseStore | None,
|
|
48
48
|
) -> Any:
|
|
49
49
|
tool_calls, input_type = self._parse_input(input, store)
|
|
50
50
|
outputs: list[Command | ToolMessage] = []
|
|
@@ -74,7 +74,7 @@ class ExecutorToolNode(ToolNode):
|
|
|
74
74
|
self,
|
|
75
75
|
call: ToolCall,
|
|
76
76
|
output: ToolMessage | Command,
|
|
77
|
-
) ->
|
|
77
|
+
) -> bool | None:
|
|
78
78
|
if isinstance(output, ToolMessage):
|
|
79
79
|
return output.status == "error"
|
|
80
80
|
if isinstance(output, Command):
|
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
import json
|
|
2
2
|
from pathlib import Path
|
|
3
|
-
from typing import Dict, Type, Union
|
|
4
3
|
|
|
5
4
|
from jinja2 import Template
|
|
6
5
|
from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage
|
|
@@ -49,7 +48,7 @@ async def outputter(
|
|
|
49
48
|
structured_llm = llm
|
|
50
49
|
|
|
51
50
|
if output_config.structured_output:
|
|
52
|
-
schema:
|
|
51
|
+
schema: dict | type[BaseModel] | None = None
|
|
53
52
|
so = output_config.structured_output
|
|
54
53
|
|
|
55
54
|
if isinstance(so, dict):
|
|
@@ -12,7 +12,9 @@ You work like an agile tech lead: defining the key milestones without locking in
|
|
|
12
12
|
- Subgoals should reflect real interactions with mobile UIs (e.g. "Open app", "Tap search bar", "Scroll to item", "Send message to Bob", etc).
|
|
13
13
|
- Don't assume the full UI is visible yet. Plan based on how most mobile apps work, and keep flexibility.
|
|
14
14
|
- The list of agent thoughts is empty, which is expected since this is the first plan.
|
|
15
|
-
-
|
|
15
|
+
- Avoid overly granular tasks based on individual UI actions (e.g. "tap", "swipe", "copy", "paste") unless explicitly required.
|
|
16
|
+
- The executor has the following available tools: **{{ executor_tools_list }}**.
|
|
17
|
+
When one of these tools offers a direct shortcut (e.g. `open_link` instead of manually launching a browser and typing a URL), prefer it over decomposed manual steps.
|
|
16
18
|
|
|
17
19
|
2. **Replanning**
|
|
18
20
|
If you're asked to **revise a previous plan**, you'll also receive:
|
|
@@ -47,12 +49,19 @@ If you're replaning and need to keep a previous subgoal, you **must keep the sam
|
|
|
47
49
|
- Type the message "I’m running late" (ID: None)
|
|
48
50
|
- Send the message (ID: None)
|
|
49
51
|
|
|
52
|
+
#### **Initial Goal**: "Go on https://tesla.com, and tell me what is the first car being displayed"
|
|
53
|
+
|
|
54
|
+
**Plan**:
|
|
55
|
+
|
|
56
|
+
- Open the link https://tesla.com (ID: None)
|
|
57
|
+
- Find the first car displayed on the home page (ID: None)
|
|
58
|
+
|
|
50
59
|
#### **Replanning Example**
|
|
51
60
|
|
|
52
61
|
**Original Plan**: same as above with IDs set
|
|
53
62
|
**Agent Thoughts**:
|
|
54
63
|
|
|
55
|
-
- Couldn
|
|
64
|
+
- Couldn't find Alice in recent chats
|
|
56
65
|
- Search bar was present on top of the chat screen
|
|
57
66
|
- Keyboard appeared after tapping search
|
|
58
67
|
|
|
@@ -1,13 +1,15 @@
|
|
|
1
|
-
from pathlib import Path
|
|
2
1
|
import uuid
|
|
2
|
+
from pathlib import Path
|
|
3
3
|
|
|
4
4
|
from jinja2 import Template
|
|
5
5
|
from langchain_core.messages import HumanMessage, SystemMessage
|
|
6
|
+
|
|
6
7
|
from minitap.mobile_use.agents.planner.types import PlannerOutput, Subgoal, SubgoalStatus
|
|
7
8
|
from minitap.mobile_use.agents.planner.utils import one_of_them_is_failure
|
|
8
9
|
from minitap.mobile_use.context import MobileUseContext
|
|
9
10
|
from minitap.mobile_use.graph.state import State
|
|
10
11
|
from minitap.mobile_use.services.llm import get_llm
|
|
12
|
+
from minitap.mobile_use.tools.index import EXECUTOR_WRAPPERS_TOOLS, format_tools_list
|
|
11
13
|
from minitap.mobile_use.utils.decorators import wrap_with_callbacks
|
|
12
14
|
from minitap.mobile_use.utils.logger import get_logger
|
|
13
15
|
|
|
@@ -36,6 +38,7 @@ class PlannerNode:
|
|
|
36
38
|
initial_goal=state.initial_goal,
|
|
37
39
|
previous_plan="\n".join(str(s) for s in state.subgoal_plan),
|
|
38
40
|
agent_thoughts="\n".join(state.agents_thoughts),
|
|
41
|
+
executor_tools_list=format_tools_list(self.ctx, EXECUTOR_WRAPPERS_TOOLS),
|
|
39
42
|
)
|
|
40
43
|
messages = [
|
|
41
44
|
SystemMessage(content=system_message),
|
|
@@ -1,12 +1,11 @@
|
|
|
1
1
|
from enum import Enum
|
|
2
|
-
from typing import Optional
|
|
3
2
|
|
|
4
3
|
from pydantic import BaseModel
|
|
5
|
-
from
|
|
4
|
+
from typing import Annotated
|
|
6
5
|
|
|
7
6
|
|
|
8
7
|
class PlannerSubgoalOutput(BaseModel):
|
|
9
|
-
id: Annotated[
|
|
8
|
+
id: Annotated[str | None, "If not provided, it will be generated"] = None
|
|
10
9
|
description: str
|
|
11
10
|
|
|
12
11
|
|
|
@@ -25,7 +24,7 @@ class Subgoal(BaseModel):
|
|
|
25
24
|
id: Annotated[str, "Unique identifier of the subgoal"]
|
|
26
25
|
description: Annotated[str, "Description of the subgoal"]
|
|
27
26
|
completion_reason: Annotated[
|
|
28
|
-
|
|
27
|
+
str | None, "Reason why the subgoal was completed (failure or success)"
|
|
29
28
|
] = None
|
|
30
29
|
status: SubgoalStatus
|
|
31
30
|
|
|
@@ -3,6 +3,7 @@ from langchain_core.messages import (
|
|
|
3
3
|
RemoveMessage,
|
|
4
4
|
ToolMessage,
|
|
5
5
|
)
|
|
6
|
+
|
|
6
7
|
from minitap.mobile_use.constants import MAX_MESSAGES_IN_HISTORY
|
|
7
8
|
from minitap.mobile_use.context import MobileUseContext
|
|
8
9
|
from minitap.mobile_use.graph.state import State
|
|
@@ -22,7 +23,7 @@ class SummarizerNode:
|
|
|
22
23
|
start_removal = False
|
|
23
24
|
|
|
24
25
|
for msg in reversed(state.messages[:nb_removal_candidates]):
|
|
25
|
-
if isinstance(msg,
|
|
26
|
+
if isinstance(msg, ToolMessage | HumanMessage):
|
|
26
27
|
start_removal = True
|
|
27
28
|
if start_removal and msg.id:
|
|
28
29
|
remove_messages.append(RemoveMessage(id=msg.id))
|
minitap/mobile_use/config.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import json
|
|
2
2
|
import os
|
|
3
3
|
from pathlib import Path
|
|
4
|
-
from typing import Annotated, Any, Literal
|
|
4
|
+
from typing import Annotated, Any, Literal
|
|
5
5
|
|
|
6
6
|
from dotenv import load_dotenv
|
|
7
7
|
from pydantic import BaseModel, Field, SecretStr, ValidationError, model_validator
|
|
@@ -17,17 +17,17 @@ logger = get_logger(__name__)
|
|
|
17
17
|
|
|
18
18
|
|
|
19
19
|
class Settings(BaseSettings):
|
|
20
|
-
OPENAI_API_KEY:
|
|
21
|
-
GOOGLE_API_KEY:
|
|
22
|
-
XAI_API_KEY:
|
|
23
|
-
OPEN_ROUTER_API_KEY:
|
|
20
|
+
OPENAI_API_KEY: SecretStr | None = None
|
|
21
|
+
GOOGLE_API_KEY: SecretStr | None = None
|
|
22
|
+
XAI_API_KEY: SecretStr | None = None
|
|
23
|
+
OPEN_ROUTER_API_KEY: SecretStr | None = None
|
|
24
24
|
|
|
25
|
-
OPENAI_BASE_URL:
|
|
25
|
+
OPENAI_BASE_URL: str | None = None
|
|
26
26
|
|
|
27
|
-
DEVICE_SCREEN_API_BASE_URL:
|
|
28
|
-
DEVICE_HARDWARE_BRIDGE_BASE_URL:
|
|
29
|
-
ADB_HOST:
|
|
30
|
-
ADB_PORT:
|
|
27
|
+
DEVICE_SCREEN_API_BASE_URL: str | None = None
|
|
28
|
+
DEVICE_HARDWARE_BRIDGE_BASE_URL: str | None = None
|
|
29
|
+
ADB_HOST: str | None = None
|
|
30
|
+
ADB_PORT: int | None = None
|
|
31
31
|
|
|
32
32
|
model_config = {"env_file": ".env", "extra": "ignore"}
|
|
33
33
|
|
|
@@ -71,7 +71,7 @@ def prepare_output_files() -> tuple[str | None, str | None]:
|
|
|
71
71
|
return validated_events_path, validated_results_path
|
|
72
72
|
|
|
73
73
|
|
|
74
|
-
def record_events(output_path: Path | None, events:
|
|
74
|
+
def record_events(output_path: Path | None, events: list[str] | BaseModel | Any):
|
|
75
75
|
if not output_path:
|
|
76
76
|
return
|
|
77
77
|
|
|
@@ -170,7 +170,7 @@ def get_default_llm_config() -> LLMConfig:
|
|
|
170
170
|
try:
|
|
171
171
|
if not os.path.exists(ROOT_DIR / DEFAULT_LLM_CONFIG_FILENAME):
|
|
172
172
|
raise Exception("Default llm config not found")
|
|
173
|
-
with open(ROOT_DIR / DEFAULT_LLM_CONFIG_FILENAME
|
|
173
|
+
with open(ROOT_DIR / DEFAULT_LLM_CONFIG_FILENAME) as f:
|
|
174
174
|
default_config_dict = load_jsonc(f)
|
|
175
175
|
return LLMConfig.model_validate(default_config_dict["default"])
|
|
176
176
|
except Exception as e:
|
|
@@ -211,7 +211,7 @@ def parse_llm_config() -> LLMConfig:
|
|
|
211
211
|
override_config_dict = {}
|
|
212
212
|
if os.path.exists(ROOT_DIR / OVERRIDE_LLM_CONFIG_FILENAME):
|
|
213
213
|
logger.info("Loading custom llm config...")
|
|
214
|
-
with open(ROOT_DIR / OVERRIDE_LLM_CONFIG_FILENAME
|
|
214
|
+
with open(ROOT_DIR / OVERRIDE_LLM_CONFIG_FILENAME) as f:
|
|
215
215
|
override_config_dict = load_jsonc(f)
|
|
216
216
|
else:
|
|
217
217
|
logger.warning("Custom llm config not found, loading default config")
|
|
@@ -237,7 +237,7 @@ def initialize_llm_config() -> LLMConfig:
|
|
|
237
237
|
|
|
238
238
|
class OutputConfig(BaseModel):
|
|
239
239
|
structured_output: Annotated[
|
|
240
|
-
|
|
240
|
+
type[BaseModel] | dict | None,
|
|
241
241
|
Field(
|
|
242
242
|
default=None,
|
|
243
243
|
description=(
|
|
@@ -247,7 +247,7 @@ class OutputConfig(BaseModel):
|
|
|
247
247
|
),
|
|
248
248
|
]
|
|
249
249
|
output_description: Annotated[
|
|
250
|
-
|
|
250
|
+
str | None,
|
|
251
251
|
Field(
|
|
252
252
|
default=None,
|
|
253
253
|
description=(
|
minitap/mobile_use/context.py
CHANGED
|
@@ -6,12 +6,11 @@ Uses ContextVar to avoid prop drilling and maintain clean function signatures.
|
|
|
6
6
|
|
|
7
7
|
from enum import Enum
|
|
8
8
|
from pathlib import Path
|
|
9
|
-
from typing import Optional
|
|
10
9
|
|
|
11
10
|
from adbutils import AdbClient
|
|
12
11
|
from openai import BaseModel
|
|
13
12
|
from pydantic import ConfigDict
|
|
14
|
-
from
|
|
13
|
+
from typing import Literal
|
|
15
14
|
|
|
16
15
|
from minitap.mobile_use.clients.device_hardware_client import DeviceHardwareClient
|
|
17
16
|
from minitap.mobile_use.clients.screen_api_client import ScreenApiClient
|
|
@@ -56,8 +55,8 @@ class MobileUseContext(BaseModel):
|
|
|
56
55
|
hw_bridge_client: DeviceHardwareClient
|
|
57
56
|
screen_api_client: ScreenApiClient
|
|
58
57
|
llm_config: LLMConfig
|
|
59
|
-
adb_client:
|
|
60
|
-
execution_setup:
|
|
58
|
+
adb_client: AdbClient | None = None
|
|
59
|
+
execution_setup: ExecutionSetup | None = None
|
|
61
60
|
|
|
62
61
|
def get_adb_client(self) -> AdbClient:
|
|
63
62
|
if self.adb_client is None:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import uuid
|
|
2
2
|
from enum import Enum
|
|
3
|
-
from typing import Annotated, Literal
|
|
3
|
+
from typing import Annotated, Literal
|
|
4
4
|
|
|
5
5
|
import yaml
|
|
6
6
|
from langgraph.types import Command
|
|
@@ -43,7 +43,7 @@ class RunFlowRequest(BaseModel):
|
|
|
43
43
|
dry_run: bool = Field(default=False, alias="dryRun")
|
|
44
44
|
|
|
45
45
|
|
|
46
|
-
def run_flow(ctx: MobileUseContext, flow_steps: list, dry_run: bool = False) ->
|
|
46
|
+
def run_flow(ctx: MobileUseContext, flow_steps: list, dry_run: bool = False) -> dict | None:
|
|
47
47
|
"""
|
|
48
48
|
Run a flow i.e, a sequence of commands.
|
|
49
49
|
Returns None on success, or the response body of the failed command.
|
|
@@ -137,20 +137,20 @@ class SelectorRequestWithPercentages(BaseModel):
|
|
|
137
137
|
return {"point": self.percentages.to_str()}
|
|
138
138
|
|
|
139
139
|
|
|
140
|
-
SelectorRequest =
|
|
141
|
-
IdSelectorRequest
|
|
142
|
-
SelectorRequestWithCoordinates
|
|
143
|
-
SelectorRequestWithPercentages
|
|
144
|
-
TextSelectorRequest
|
|
145
|
-
IdWithTextSelectorRequest
|
|
146
|
-
|
|
140
|
+
SelectorRequest = (
|
|
141
|
+
IdSelectorRequest
|
|
142
|
+
| SelectorRequestWithCoordinates
|
|
143
|
+
| SelectorRequestWithPercentages
|
|
144
|
+
| TextSelectorRequest
|
|
145
|
+
| IdWithTextSelectorRequest
|
|
146
|
+
)
|
|
147
147
|
|
|
148
148
|
|
|
149
149
|
def tap(
|
|
150
150
|
ctx: MobileUseContext,
|
|
151
151
|
selector_request: SelectorRequest,
|
|
152
152
|
dry_run: bool = False,
|
|
153
|
-
index:
|
|
153
|
+
index: int | None = None,
|
|
154
154
|
):
|
|
155
155
|
"""
|
|
156
156
|
Tap on a selector.
|
|
@@ -171,7 +171,7 @@ def long_press_on(
|
|
|
171
171
|
ctx: MobileUseContext,
|
|
172
172
|
selector_request: SelectorRequest,
|
|
173
173
|
dry_run: bool = False,
|
|
174
|
-
index:
|
|
174
|
+
index: int | None = None,
|
|
175
175
|
):
|
|
176
176
|
long_press_on_body = selector_request.to_dict()
|
|
177
177
|
if not long_press_on_body:
|
|
@@ -211,7 +211,7 @@ SwipeDirection = Annotated[
|
|
|
211
211
|
class SwipeRequest(BaseModel):
|
|
212
212
|
model_config = ConfigDict(extra="forbid")
|
|
213
213
|
swipe_mode: SwipeStartEndCoordinatesRequest | SwipeStartEndPercentagesRequest | SwipeDirection
|
|
214
|
-
duration:
|
|
214
|
+
duration: int | None = None # in ms, default is 400ms
|
|
215
215
|
|
|
216
216
|
def to_dict(self):
|
|
217
217
|
res = {}
|
|
@@ -257,7 +257,7 @@ def paste_text(ctx: MobileUseContext, dry_run: bool = False):
|
|
|
257
257
|
return run_flow(ctx, ["pasteText"], dry_run=dry_run)
|
|
258
258
|
|
|
259
259
|
|
|
260
|
-
def erase_text(ctx: MobileUseContext, nb_chars:
|
|
260
|
+
def erase_text(ctx: MobileUseContext, nb_chars: int | None = None, dry_run: bool = False):
|
|
261
261
|
"""
|
|
262
262
|
Removes characters from the currently selected textfield (if any)
|
|
263
263
|
Removes 50 characters if nb_chars is not specified.
|
|
@@ -275,7 +275,7 @@ def launch_app(ctx: MobileUseContext, package_name: str, dry_run: bool = False):
|
|
|
275
275
|
return run_flow_with_wait_for_animation_to_end(ctx, flow_input, dry_run=dry_run)
|
|
276
276
|
|
|
277
277
|
|
|
278
|
-
def stop_app(ctx: MobileUseContext, package_name:
|
|
278
|
+
def stop_app(ctx: MobileUseContext, package_name: str | None = None, dry_run: bool = False):
|
|
279
279
|
if package_name is None:
|
|
280
280
|
flow_input = ["stopApp"]
|
|
281
281
|
else:
|
|
@@ -317,7 +317,7 @@ class WaitTimeout(Enum):
|
|
|
317
317
|
|
|
318
318
|
|
|
319
319
|
def wait_for_animation_to_end(
|
|
320
|
-
ctx: MobileUseContext, timeout:
|
|
320
|
+
ctx: MobileUseContext, timeout: WaitTimeout | None = None, dry_run: bool = False
|
|
321
321
|
):
|
|
322
322
|
if timeout is None:
|
|
323
323
|
return run_flow(ctx, ["waitForAnimationToEnd"], dry_run=dry_run)
|
|
@@ -362,15 +362,27 @@ if __name__ == "__main__":
|
|
|
362
362
|
agents_thoughts=[],
|
|
363
363
|
)
|
|
364
364
|
|
|
365
|
-
from minitap.mobile_use.tools.mobile.input_text import get_input_text_tool
|
|
366
|
-
|
|
367
|
-
input_resource_id = "com.google.android.apps.nexuslauncher:id/search_container_hotseat"
|
|
368
|
-
command_output: Command = get_input_text_tool(ctx=ctx).invoke(
|
|
365
|
+
# from minitap.mobile_use.tools.mobile.input_text import get_input_text_tool
|
|
366
|
+
|
|
367
|
+
# input_resource_id = "com.google.android.apps.nexuslauncher:id/search_container_hotseat"
|
|
368
|
+
# command_output: Command = get_input_text_tool(ctx=ctx).invoke(
|
|
369
|
+
# {
|
|
370
|
+
# "tool_call_id": uuid.uuid4().hex,
|
|
371
|
+
# "agent_thought": "",
|
|
372
|
+
# "text_input_resource_id": input_resource_id,
|
|
373
|
+
# "text": "Hello World",
|
|
374
|
+
# "state": dummy_state,
|
|
375
|
+
# "executor_metadata": None,
|
|
376
|
+
# }
|
|
377
|
+
# )
|
|
378
|
+
from minitap.mobile_use.tools.mobile.clear_text import get_clear_text_tool
|
|
379
|
+
|
|
380
|
+
input_resource_id = "com.google.android.apps.nexuslauncher:id/input"
|
|
381
|
+
command_output: Command = get_clear_text_tool(ctx=ctx).invoke(
|
|
369
382
|
{
|
|
370
383
|
"tool_call_id": uuid.uuid4().hex,
|
|
371
384
|
"agent_thought": "",
|
|
372
385
|
"text_input_resource_id": input_resource_id,
|
|
373
|
-
"text": "Hello World",
|
|
374
386
|
"state": dummy_state,
|
|
375
387
|
"executor_metadata": None,
|
|
376
388
|
}
|
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
from datetime import date
|
|
2
2
|
import json
|
|
3
|
-
from typing import Optional
|
|
4
3
|
|
|
5
4
|
from adbutils import AdbDevice
|
|
6
5
|
from minitap.mobile_use.utils.logger import MobileUseLogger
|
|
@@ -20,8 +19,8 @@ def get_adb_device(ctx: MobileUseContext) -> AdbDevice:
|
|
|
20
19
|
|
|
21
20
|
|
|
22
21
|
def get_first_device(
|
|
23
|
-
logger:
|
|
24
|
-
) -> tuple[
|
|
22
|
+
logger: MobileUseLogger | None = None,
|
|
23
|
+
) -> tuple[str | None, DevicePlatform | None]:
|
|
25
24
|
"""Gets the first available device."""
|
|
26
25
|
try:
|
|
27
26
|
android_output = run_shell_command_on_host("adb devices")
|
|
@@ -50,7 +49,7 @@ def get_first_device(
|
|
|
50
49
|
return None, None
|
|
51
50
|
|
|
52
51
|
|
|
53
|
-
def get_focused_app_info(ctx: MobileUseContext) ->
|
|
52
|
+
def get_focused_app_info(ctx: MobileUseContext) -> str | None:
|
|
54
53
|
if ctx.device.mobile_platform == DevicePlatform.IOS:
|
|
55
54
|
return None
|
|
56
55
|
device = get_adb_device(ctx)
|
|
@@ -6,6 +6,7 @@ from langchain_core.messages import (
|
|
|
6
6
|
from langgraph.constants import END, START
|
|
7
7
|
from langgraph.graph import StateGraph
|
|
8
8
|
from langgraph.graph.state import CompiledStateGraph
|
|
9
|
+
|
|
9
10
|
from minitap.mobile_use.agents.contextor.contextor import ContextorNode
|
|
10
11
|
from minitap.mobile_use.agents.cortex.cortex import CortexNode
|
|
11
12
|
from minitap.mobile_use.agents.executor.executor import ExecutorNode
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from langchain_core.messages import AIMessage, AnyMessage
|
|
2
2
|
from langgraph.graph import add_messages
|
|
3
3
|
from langgraph.prebuilt.chat_agent_executor import AgentStatePydantic
|
|
4
|
-
from
|
|
4
|
+
from typing import Annotated
|
|
5
5
|
|
|
6
6
|
from minitap.mobile_use.agents.planner.types import Subgoal
|
|
7
7
|
from minitap.mobile_use.config import AgentNode
|
|
@@ -24,16 +24,16 @@ class State(AgentStatePydantic):
|
|
|
24
24
|
subgoal_plan: Annotated[list[Subgoal], "The current plan, made of subgoals"]
|
|
25
25
|
|
|
26
26
|
# contextor related keys
|
|
27
|
-
latest_screenshot_base64: Annotated[
|
|
27
|
+
latest_screenshot_base64: Annotated[str | None, "Latest screenshot of the device", take_last]
|
|
28
28
|
latest_ui_hierarchy: Annotated[
|
|
29
|
-
|
|
29
|
+
list[dict] | None, "Latest UI hierarchy of the device", take_last
|
|
30
30
|
]
|
|
31
|
-
focused_app_info: Annotated[
|
|
32
|
-
device_date: Annotated[
|
|
31
|
+
focused_app_info: Annotated[str | None, "Focused app info", take_last]
|
|
32
|
+
device_date: Annotated[str | None, "Date of the device", take_last]
|
|
33
33
|
|
|
34
34
|
# cortex related keys
|
|
35
35
|
structured_decisions: Annotated[
|
|
36
|
-
|
|
36
|
+
str | None,
|
|
37
37
|
"Structured decisions made by the cortex, for the executor to follow",
|
|
38
38
|
take_last,
|
|
39
39
|
]
|
|
@@ -45,7 +45,7 @@ class State(AgentStatePydantic):
|
|
|
45
45
|
|
|
46
46
|
# executor related keys
|
|
47
47
|
executor_messages: Annotated[list[AnyMessage], "Sequential Executor messages", add_messages]
|
|
48
|
-
cortex_last_thought: Annotated[
|
|
48
|
+
cortex_last_thought: Annotated[str | None, "Last thought of the cortex for the executor"]
|
|
49
49
|
|
|
50
50
|
# common keys
|
|
51
51
|
agents_thoughts: Annotated[
|
|
@@ -58,13 +58,13 @@ class State(AgentStatePydantic):
|
|
|
58
58
|
self,
|
|
59
59
|
ctx: MobileUseContext,
|
|
60
60
|
update: dict,
|
|
61
|
-
agent:
|
|
61
|
+
agent: AgentNode | None = None,
|
|
62
62
|
):
|
|
63
63
|
"""
|
|
64
64
|
Sanitizes the state update to ensure it is valid and apply side effect logic where required.
|
|
65
65
|
The agent is required if the update contains the "agents_thoughts" key.
|
|
66
66
|
"""
|
|
67
|
-
updated_agents_thoughts:
|
|
67
|
+
updated_agents_thoughts: str | list[str] | None = update.get("agents_thoughts", None)
|
|
68
68
|
if updated_agents_thoughts is not None:
|
|
69
69
|
if isinstance(updated_agents_thoughts, str):
|
|
70
70
|
updated_agents_thoughts = [updated_agents_thoughts]
|
minitap/mobile_use/main.py
CHANGED
|
@@ -1,12 +1,11 @@
|
|
|
1
1
|
import asyncio
|
|
2
2
|
import os
|
|
3
|
-
from typing import Optional
|
|
4
3
|
|
|
5
4
|
import typer
|
|
6
5
|
from adbutils import AdbClient
|
|
7
6
|
from langchain.callbacks.base import Callbacks
|
|
8
7
|
from rich.console import Console
|
|
9
|
-
from
|
|
8
|
+
from typing import Annotated
|
|
10
9
|
|
|
11
10
|
from minitap.mobile_use.config import (
|
|
12
11
|
initialize_llm_config,
|
|
@@ -24,9 +23,9 @@ logger = get_logger(__name__)
|
|
|
24
23
|
|
|
25
24
|
async def run_automation(
|
|
26
25
|
goal: str,
|
|
27
|
-
test_name:
|
|
26
|
+
test_name: str | None = None,
|
|
28
27
|
traces_output_path_str: str = "traces",
|
|
29
|
-
output_description:
|
|
28
|
+
output_description: str | None = None,
|
|
30
29
|
graph_config_callbacks: Callbacks = [],
|
|
31
30
|
):
|
|
32
31
|
llm_config = initialize_llm_config()
|
|
@@ -70,7 +69,7 @@ async def run_automation(
|
|
|
70
69
|
def main(
|
|
71
70
|
goal: Annotated[str, typer.Argument(help="The main goal for the agent to achieve.")],
|
|
72
71
|
test_name: Annotated[
|
|
73
|
-
|
|
72
|
+
str | None,
|
|
74
73
|
typer.Option(
|
|
75
74
|
"--test-name",
|
|
76
75
|
"-n",
|
|
@@ -86,7 +85,7 @@ def main(
|
|
|
86
85
|
),
|
|
87
86
|
] = "traces",
|
|
88
87
|
output_description: Annotated[
|
|
89
|
-
|
|
88
|
+
str | None,
|
|
90
89
|
typer.Option(
|
|
91
90
|
"--output-description",
|
|
92
91
|
"-o",
|