minitap-mobile-use 2.0.1__py3-none-any.whl → 2.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of minitap-mobile-use might be problematic.
- minitap/mobile_use/agents/cortex/cortex.md +7 -5
- minitap/mobile_use/agents/cortex/cortex.py +4 -1
- minitap/mobile_use/agents/cortex/types.py +1 -3
- minitap/mobile_use/agents/executor/executor.md +4 -5
- minitap/mobile_use/agents/executor/executor.py +3 -1
- minitap/mobile_use/agents/executor/tool_node.py +6 -6
- minitap/mobile_use/agents/outputter/outputter.py +1 -2
- minitap/mobile_use/agents/planner/planner.md +11 -2
- minitap/mobile_use/agents/planner/planner.py +7 -2
- minitap/mobile_use/agents/planner/types.py +3 -4
- minitap/mobile_use/agents/summarizer/summarizer.py +2 -1
- minitap/mobile_use/config.py +31 -16
- minitap/mobile_use/context.py +3 -4
- minitap/mobile_use/controllers/mobile_command_controller.py +36 -24
- minitap/mobile_use/controllers/platform_specific_commands_controller.py +3 -4
- minitap/mobile_use/graph/graph.py +1 -0
- minitap/mobile_use/graph/state.py +9 -9
- minitap/mobile_use/main.py +7 -8
- minitap/mobile_use/sdk/agent.py +25 -26
- minitap/mobile_use/sdk/builders/agent_config_builder.py +9 -10
- minitap/mobile_use/sdk/builders/task_request_builder.py +9 -9
- minitap/mobile_use/sdk/examples/smart_notification_assistant.py +1 -2
- minitap/mobile_use/sdk/types/agent.py +5 -5
- minitap/mobile_use/sdk/types/task.py +19 -18
- minitap/mobile_use/sdk/utils.py +4 -3
- minitap/mobile_use/servers/config.py +1 -2
- minitap/mobile_use/servers/device_hardware_bridge.py +3 -4
- minitap/mobile_use/servers/start_servers.py +4 -4
- minitap/mobile_use/servers/stop_servers.py +2 -3
- minitap/mobile_use/services/llm.py +24 -6
- minitap/mobile_use/tools/index.py +26 -14
- minitap/mobile_use/tools/mobile/back.py +1 -1
- minitap/mobile_use/tools/mobile/clear_text.py +277 -0
- minitap/mobile_use/tools/mobile/copy_text_from.py +1 -1
- minitap/mobile_use/tools/mobile/erase_one_char.py +56 -0
- minitap/mobile_use/tools/mobile/find_packages.py +1 -1
- minitap/mobile_use/tools/mobile/input_text.py +4 -80
- minitap/mobile_use/tools/mobile/launch_app.py +1 -1
- minitap/mobile_use/tools/mobile/long_press_on.py +2 -4
- minitap/mobile_use/tools/mobile/open_link.py +1 -1
- minitap/mobile_use/tools/mobile/paste_text.py +1 -1
- minitap/mobile_use/tools/mobile/press_key.py +1 -1
- minitap/mobile_use/tools/mobile/stop_app.py +2 -4
- minitap/mobile_use/tools/mobile/swipe.py +107 -9
- minitap/mobile_use/tools/mobile/take_screenshot.py +1 -1
- minitap/mobile_use/tools/mobile/tap.py +2 -4
- minitap/mobile_use/tools/mobile/wait_for_animation_to_end.py +2 -4
- minitap/mobile_use/tools/tool_wrapper.py +6 -1
- minitap/mobile_use/tools/utils.py +86 -0
- minitap/mobile_use/utils/cli_helpers.py +1 -2
- minitap/mobile_use/utils/cli_selection.py +5 -6
- minitap/mobile_use/utils/decorators.py +21 -20
- minitap/mobile_use/utils/logger.py +3 -4
- minitap/mobile_use/utils/media.py +1 -1
- minitap/mobile_use/utils/recorder.py +2 -9
- minitap/mobile_use/utils/ui_hierarchy.py +13 -5
- {minitap_mobile_use-2.0.1.dist-info → minitap_mobile_use-2.2.0.dist-info}/METADATA +35 -5
- minitap_mobile_use-2.2.0.dist-info/RECORD +96 -0
- minitap/mobile_use/tools/mobile/erase_text.py +0 -122
- minitap_mobile_use-2.0.1.dist-info/RECORD +0 -94
- {minitap_mobile_use-2.0.1.dist-info → minitap_mobile_use-2.2.0.dist-info}/WHEEL +0 -0
- {minitap_mobile_use-2.0.1.dist-info → minitap_mobile_use-2.2.0.dist-info}/entry_points.txt +0 -0
minitap/mobile_use/agents/cortex/cortex.md
CHANGED
@@ -35,17 +35,19 @@ Focus on the **current PENDING subgoal and the next subgoals not yet started**.
 - Past agent thoughts
 - Recent tool effects

-2.2. Otherwise, output a **stringified structured set of instructions** that an **Executor agent** can perform on a real mobile device:
+2.2. Otherwise, output a **stringified structured set of instructions** that an **Executor agent** can perform on a real mobile device:

-  - These must be **concrete low-level actions
-  -
-  -
+  - These must be **concrete low-level actions**.
+  - The executor has the following available tools: {{ executor_tools_list }}.
+  - Your goal is to achieve subgoals **fast** - so you must put as much actions as possible in your instructions to complete all achievable subgoals (based on your observations) in one go.
+  - To open URLs/links directly, use the `open_link` tool - it will automatically handle opening in the appropriate browser. It also handles deep links.
+  - When you need to open an app, use the `find_packages` low-level action to try and get its name. Then, simply use the `launch_app` low-level action to launch it.
   - If you refer to a UI element or coordinates, specify it clearly (e.g., `resource-id: com.whatsapp:id/search`, `text: "Alice"`, `x: 100, y: 200`).
   - **The structure is up to you**, but it must be valid **JSON stringified output**. You will accompany this output with a **natural-language summary** of your reasoning and approach in your agent thought.
   - **Never use a sequence of `tap` + `input_text` to type into a field. Always use a single `input_text` action** with the correct `resource_id` (this already ensures the element is focused and the cursor is moved to the end).
   - When you want to launch/stop an app, prefer using its package name.
   - **Only reference UI element IDs or visible texts that are explicitly present in the provided UI hierarchy or screenshot. Do not invent, infer, or guess any IDs or texts that are not directly observed**.
-  - **For text clearing**: When you need to completely clear text from an input field, always
+  - **For text clearing**: When you need to completely clear text from an input field, always call the `clear_text` tool with the correct resource_id. This tool automatically focuses the element, and ensures the field is emptied. If you notice this tool fails to clear the text, try to long press the input, select all, and call `erase_one_char`.

 ### Output

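The prompt above leaves the exact JSON layout of the "stringified structured set of instructions" up to the Cortex; only the tool names (`open_link`, `find_packages`, `launch_app`, `input_text`, ...) come from the package. A minimal sketch of what such output could look like, with a hypothetical key layout that is purely illustrative:

```python
import json

# Hypothetical structured decisions for a subgoal like "Send 'Hi' to Alice on WhatsApp".
# The prompt only requires valid stringified JSON; the keys below are not a fixed schema.
decisions = {
    "subgoal": "Send 'Hi' to Alice on WhatsApp",
    "actions": [
        {"tool": "find_packages", "query": "whatsapp"},
        {"tool": "launch_app", "package_name": "com.whatsapp"},
        {"tool": "tap", "resource_id": "com.whatsapp:id/search"},
        {"tool": "input_text", "resource_id": "com.whatsapp:id/search_input", "text": "Alice"},
    ],
}

# The Cortex would put this string in its `decisions` field and keep the natural-language
# reasoning in `agent_thought`.
stringified_decisions = json.dumps(decisions)
print(stringified_decisions)
```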
minitap/mobile_use/agents/cortex/cortex.py
CHANGED
@@ -10,12 +10,14 @@ from langchain_core.messages import (
     ToolMessage,
 )
 from langgraph.graph.message import REMOVE_ALL_MESSAGES
+
 from minitap.mobile_use.agents.cortex.types import CortexOutput
 from minitap.mobile_use.agents.planner.utils import get_current_subgoal
 from minitap.mobile_use.constants import EXECUTOR_MESSAGES_KEY
 from minitap.mobile_use.context import MobileUseContext
 from minitap.mobile_use.graph.state import State
 from minitap.mobile_use.services.llm import get_llm, with_fallback
+from minitap.mobile_use.tools.index import EXECUTOR_WRAPPERS_TOOLS, format_tools_list
 from minitap.mobile_use.utils.conversations import get_screenshot_message_for_llm
 from minitap.mobile_use.utils.decorators import wrap_with_callbacks
 from minitap.mobile_use.utils.logger import get_logger
@@ -44,6 +46,7 @@ class CortexNode:
             current_subgoal=get_current_subgoal(state.subgoal_plan),
             agents_thoughts=state.agents_thoughts,
             executor_feedback=executor_feedback,
+            executor_tools_list=format_tools_list(ctx=self.ctx, wrappers=EXECUTOR_WRAPPERS_TOOLS),
         )
         messages = [
             SystemMessage(content=system_message),
@@ -83,7 +86,7 @@ class CortexNode:
         is_subgoal_completed = (
             response.complete_subgoals_by_ids is not None
             and len(response.complete_subgoals_by_ids) > 0
-            and len(response.decisions) == 0
+            and (len(response.decisions) == 0 or response.decisions in ["{}", "[]", "null", ""])
         )
         if not is_subgoal_completed:
             response.complete_subgoals_by_ids = []
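The new completion check treats a few "empty-looking" stringified decisions (`"{}"`, `"[]"`, `"null"`, `""`) the same as a zero-length string. A standalone sketch of that predicate (the helper name is mine, not from the package):

```python
EMPTY_DECISION_STRINGS = ["{}", "[]", "null", ""]


def has_no_decisions(decisions: str) -> bool:
    """Return True when the stringified decisions carry no actionable content.

    Mirrors the condition added in CortexNode: an empty string, or one of the
    JSON-ish empty literals, means the subgoal can be marked complete without
    dispatching anything to the Executor.
    """
    return len(decisions) == 0 or decisions in EMPTY_DECISION_STRINGS


assert has_no_decisions("{}")
assert has_no_decisions("")
assert not has_no_decisions('{"actions": [{"tool": "tap"}]}')
```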
minitap/mobile_use/agents/cortex/types.py
CHANGED
@@ -1,11 +1,9 @@
-from typing import Optional
-
 from pydantic import BaseModel, Field


 class CortexOutput(BaseModel):
     decisions: str = Field(..., description="The decisions to be made. A stringified JSON object")
     agent_thought: str = Field(..., description="The agent's thought")
-    complete_subgoals_by_ids:
+    complete_subgoals_by_ids: list[str] | None = Field(
         [], description="List of subgoal IDs to complete"
     )
minitap/mobile_use/agents/executor/executor.md
CHANGED
@@ -64,14 +64,13 @@ When using the `input_text` tool:

 #### 🔄 Text Clearing Best Practice

-When you need to completely clear text from an input field,
+When you need to completely clear text from an input field, always use the clear_text tool with the correct resource_id.

-
-2. **Then use `erase_text`** to clear the selected content
+This tool automatically takes care of focusing the element (if needed), and ensuring the field is fully emptied.

-
+Only and if only the clear_text tool fails to clear the text, try to long press the input, select all, and call erase_one_char.

-
+#### 🔁 Final Notes

 - **You do not need to reason or decide strategy** — that's the Cortex's job.
 - You simply interpret and execute — like hands following the brain.
minitap/mobile_use/agents/executor/executor.py
CHANGED
@@ -3,6 +3,8 @@ from pathlib import Path
 from jinja2 import Template
 from langchain_core.messages import HumanMessage, SystemMessage
 from langchain_google_genai import ChatGoogleGenerativeAI
+from langchain_google_vertexai.chat_models import ChatVertexAI
+
 from minitap.mobile_use.constants import EXECUTOR_MESSAGES_KEY
 from minitap.mobile_use.context import MobileUseContext
 from minitap.mobile_use.graph.state import State
@@ -56,7 +58,7 @@ class ExecutorNode:
         }

         # ChatGoogleGenerativeAI does not support the "parallel_tool_calls" keyword
-        if not isinstance(llm, ChatGoogleGenerativeAI):
+        if not isinstance(llm, ChatGoogleGenerativeAI | ChatVertexAI):
             llm_bind_tools_kwargs["parallel_tool_calls"] = True

         llm = llm.bind_tools(**llm_bind_tools_kwargs)
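The executor change relies on Python 3.10+ accepting a PEP 604 union (`X | Y`) as the second argument of `isinstance`, which behaves like passing a tuple of classes. A small self-contained sketch of that pattern (the classes here are stand-ins, not the real chat-model types):

```python
class ChatA:  # stand-in for ChatGoogleGenerativeAI
    pass


class ChatB:  # stand-in for ChatVertexAI
    pass


class ChatC:  # stand-in for a provider that does support parallel tool calls
    pass


def bind_kwargs(llm: object) -> dict:
    kwargs: dict = {}
    # Same shape as the executor's check: skip the keyword for providers
    # that do not support it.
    if not isinstance(llm, ChatA | ChatB):  # equivalent to isinstance(llm, (ChatA, ChatB))
        kwargs["parallel_tool_calls"] = True
    return kwargs


assert bind_kwargs(ChatA()) == {}
assert bind_kwargs(ChatB()) == {}
assert bind_kwargs(ChatC()) == {"parallel_tool_calls": True}
```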
minitap/mobile_use/agents/executor/tool_node.py
CHANGED
@@ -1,8 +1,8 @@
 import asyncio
-from typing import Any
+from typing import Any
 from langgraph.types import Command
 from pydantic import BaseModel
-from
+from typing import override
 from langchain_core.runnables import RunnableConfig
 from langgraph.store.base import BaseStore
 from langchain_core.messages import AnyMessage, ToolCall, ToolMessage
@@ -21,7 +21,7 @@ class ExecutorToolNode(ToolNode):
         input: list[AnyMessage] | dict[str, Any] | BaseModel,
         config: RunnableConfig,
         *,
-        store:
+        store: BaseStore | None,
     ):
         return await self.__func(is_async=True, input=input, config=config, store=store)

@@ -31,7 +31,7 @@ class ExecutorToolNode(ToolNode):
         input: list[AnyMessage] | dict[str, Any] | BaseModel,
         config: RunnableConfig,
         *,
-        store:
+        store: BaseStore | None,
     ) -> Any:
         loop = asyncio.get_event_loop()
         return loop.run_until_complete(
@@ -44,7 +44,7 @@ class ExecutorToolNode(ToolNode):
         input: list[AnyMessage] | dict[str, Any] | BaseModel,
         config: RunnableConfig,
         *,
-        store:
+        store: BaseStore | None,
     ) -> Any:
         tool_calls, input_type = self._parse_input(input, store)
         outputs: list[Command | ToolMessage] = []
@@ -74,7 +74,7 @@ class ExecutorToolNode(ToolNode):
         self,
         call: ToolCall,
         output: ToolMessage | Command,
-    ) ->
+    ) -> bool | None:
         if isinstance(output, ToolMessage):
             return output.status == "error"
         if isinstance(output, Command):
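tool_node.py now imports `override` from `typing`, which only exists in the standard library from Python 3.12 onwards (older interpreters get the same name from `typing_extensions`). The decorator is a static-checking aid: it marks a method as intentionally overriding a base-class method. A minimal sketch:

```python
# `override` lives in `typing` on Python 3.12+; on older versions the same
# decorator is available from `typing_extensions`.
try:
    from typing import override
except ImportError:  # pragma: no cover - fallback for Python < 3.12
    from typing_extensions import override


class BaseNode:
    def run(self) -> str:
        return "base"


class CustomNode(BaseNode):
    @override
    def run(self) -> str:  # type checkers flag this if `run` disappears from BaseNode
        return "custom"


print(CustomNode().run())  # -> "custom"
```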
minitap/mobile_use/agents/outputter/outputter.py
CHANGED
@@ -1,6 +1,5 @@
 import json
 from pathlib import Path
-from typing import Dict, Type, Union

 from jinja2 import Template
 from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage
@@ -49,7 +48,7 @@ async def outputter(
     structured_llm = llm

     if output_config.structured_output:
-        schema:
+        schema: dict | type[BaseModel] | None = None
         so = output_config.structured_output

         if isinstance(so, dict):
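outputter.py drops the old `Dict`/`Type`/`Union` imports in favour of builtin generics, typing the schema as `dict | type[BaseModel] | None` and branching on `isinstance(so, dict)`. A small sketch of that narrowing pattern under the same assumption (the helper name is mine):

```python
from pydantic import BaseModel


class Answer(BaseModel):
    text: str


def describe_schema(so: dict | type[BaseModel] | None) -> str:
    """Illustrates the dict-vs-model branching used when configuring structured output."""
    if so is None:
        return "no structured output requested"
    if isinstance(so, dict):
        return f"raw JSON schema with keys: {sorted(so)}"
    return f"pydantic model: {so.__name__}"


print(describe_schema(None))
print(describe_schema({"type": "object", "properties": {}}))
print(describe_schema(Answer))
```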
minitap/mobile_use/agents/planner/planner.md
CHANGED
@@ -12,7 +12,9 @@ You work like an agile tech lead: defining the key milestones without locking in
 - Subgoals should reflect real interactions with mobile UIs (e.g. "Open app", "Tap search bar", "Scroll to item", "Send message to Bob", etc).
 - Don't assume the full UI is visible yet. Plan based on how most mobile apps work, and keep flexibility.
 - List of agents thoughts is empty which is expected, since it is the first plan.
--
+- Avoid too granular UI actions based tasks (e.g. "tap", "swipe", "copy", "paste") unless explicitly required.
+- The executor has the following available tools: {{ executor_tools_list }}.
+  When one of these tools offers a direct shortcut (e.g. `openLink` instead of manually launching a browser and typing a URL), prefer it over decomposed manual steps.

 2. **Replanning**
 If you're asked to **revise a previous plan**, you'll also receive:
@@ -47,12 +49,19 @@ If you're replaning and need to keep a previous subgoal, you **must keep the sam
 - Type the message "I’m running late" (ID: None)
 - Send the message (ID: None)

+#### **Initial Goal**: "Go on https://tesla.com, and tell me what is the first car being displayed"
+
+**Plan**:
+
+- Open the link https://tesla.com (ID: None)
+- Find the first car displayed on the home page (ID: None)
+
 #### **Replanning Example**

 **Original Plan**: same as above with IDs set
 **Agent Thoughts**:

-- Couldn
+- Couldn't find Alice in recent chats
 - Search bar was present on top of the chat screen
 - Keyboard appeared after tapping search

minitap/mobile_use/agents/planner/planner.py
CHANGED
@@ -1,13 +1,15 @@
-from pathlib import Path
 import uuid
+from pathlib import Path

 from jinja2 import Template
 from langchain_core.messages import HumanMessage, SystemMessage
+
 from minitap.mobile_use.agents.planner.types import PlannerOutput, Subgoal, SubgoalStatus
 from minitap.mobile_use.agents.planner.utils import one_of_them_is_failure
 from minitap.mobile_use.context import MobileUseContext
 from minitap.mobile_use.graph.state import State
 from minitap.mobile_use.services.llm import get_llm
+from minitap.mobile_use.tools.index import EXECUTOR_WRAPPERS_TOOLS, format_tools_list
 from minitap.mobile_use.utils.decorators import wrap_with_callbacks
 from minitap.mobile_use.utils.logger import get_logger

@@ -28,7 +30,10 @@ class PlannerNode:

         system_message = Template(
             Path(__file__).parent.joinpath("planner.md").read_text(encoding="utf-8")
-        ).render(
+        ).render(
+            platform=self.ctx.device.mobile_platform.value,
+            executor_tools_list=format_tools_list(ctx=self.ctx, wrappers=EXECUTOR_WRAPPERS_TOOLS),
+        )
         human_message = Template(
             Path(__file__).parent.joinpath("human.md").read_text(encoding="utf-8")
         ).render(
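Both the Planner and the Cortex now pass `format_tools_list(...)` into their prompt templates, which is what fills the `{{ executor_tools_list }}` placeholder seen in the markdown files above. A minimal sketch of that Jinja2 step, with a hand-written tools list standing in for whatever the real `format_tools_list` returns:

```python
from jinja2 import Template

# Stand-in for format_tools_list(...) output; the real helper lives in
# minitap.mobile_use.tools.index and formats the EXECUTOR_WRAPPERS_TOOLS wrappers.
executor_tools_list = "- tap\n- swipe\n- input_text\n- clear_text\n- open_link\n- launch_app"

prompt_template = Template(
    "You plan for the {{ platform }} platform.\n"
    "The executor has the following available tools:\n{{ executor_tools_list }}"
)

system_message = prompt_template.render(
    platform="ANDROID",
    executor_tools_list=executor_tools_list,
)
print(system_message)
```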
minitap/mobile_use/agents/planner/types.py
CHANGED
@@ -1,12 +1,11 @@
 from enum import Enum
-from typing import Optional

 from pydantic import BaseModel
-from
+from typing import Annotated


 class PlannerSubgoalOutput(BaseModel):
-    id: Annotated[
+    id: Annotated[str | None, "If not provided, it will be generated"] = None
     description: str


@@ -25,7 +24,7 @@ class Subgoal(BaseModel):
     id: Annotated[str, "Unique identifier of the subgoal"]
     description: Annotated[str, "Description of the subgoal"]
     completion_reason: Annotated[
-
+        str | None, "Reason why the subgoal was completed (failure or success)"
     ] = None
     status: SubgoalStatus

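With `id: Annotated[str | None, ...] = None`, a freshly planned subgoal can be emitted without an identifier and receive one later; planner.py imports `uuid`, which presumably is what fills the gap, though that code is not part of the hunks shown here. A sketch of the model plus a hypothetical id-backfill step, reusing the Tesla example plan from planner.md:

```python
import uuid
from typing import Annotated

from pydantic import BaseModel


class PlannerSubgoalOutput(BaseModel):
    # Mirrors the new definition: the planner may omit the id on the first plan.
    id: Annotated[str | None, "If not provided, it will be generated"] = None
    description: str


plan = [
    PlannerSubgoalOutput(description="Open the link https://tesla.com"),
    PlannerSubgoalOutput(description="Find the first car displayed on the home page"),
]

# Hypothetical backfill step - the package's actual id generation is not shown in this diff.
for subgoal in plan:
    if subgoal.id is None:
        subgoal.id = uuid.uuid4().hex

print([(s.id, s.description) for s in plan])
```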
minitap/mobile_use/agents/summarizer/summarizer.py
CHANGED
@@ -3,6 +3,7 @@ from langchain_core.messages (
     RemoveMessage,
     ToolMessage,
 )
+
 from minitap.mobile_use.constants import MAX_MESSAGES_IN_HISTORY
 from minitap.mobile_use.context import MobileUseContext
 from minitap.mobile_use.graph.state import State
@@ -22,7 +23,7 @@ class SummarizerNode:
         start_removal = False

         for msg in reversed(state.messages[:nb_removal_candidates]):
-            if isinstance(msg,
+            if isinstance(msg, ToolMessage | HumanMessage):
                 start_removal = True
             if start_removal and msg.id:
                 remove_messages.append(RemoveMessage(id=msg.id))
minitap/mobile_use/config.py
CHANGED
@@ -1,9 +1,11 @@
 import json
 import os
 from pathlib import Path
-from typing import Annotated, Any, Literal
+from typing import Annotated, Any, Literal

+import google.auth
 from dotenv import load_dotenv
+from google.auth.exceptions import DefaultCredentialsError
 from pydantic import BaseModel, Field, SecretStr, ValidationError, model_validator
 from pydantic_settings import BaseSettings

@@ -17,17 +19,17 @@ logger = get_logger(__name__)


 class Settings(BaseSettings):
-    OPENAI_API_KEY:
-    GOOGLE_API_KEY:
-    XAI_API_KEY:
-    OPEN_ROUTER_API_KEY:
+    OPENAI_API_KEY: SecretStr | None = None
+    GOOGLE_API_KEY: SecretStr | None = None
+    XAI_API_KEY: SecretStr | None = None
+    OPEN_ROUTER_API_KEY: SecretStr | None = None

-    OPENAI_BASE_URL:
+    OPENAI_BASE_URL: str | None = None

-    DEVICE_SCREEN_API_BASE_URL:
-    DEVICE_HARDWARE_BRIDGE_BASE_URL:
-    ADB_HOST:
-    ADB_PORT:
+    DEVICE_SCREEN_API_BASE_URL: str | None = None
+    DEVICE_HARDWARE_BRIDGE_BASE_URL: str | None = None
+    ADB_HOST: str | None = None
+    ADB_PORT: int | None = None

     model_config = {"env_file": ".env", "extra": "ignore"}

@@ -71,7 +73,7 @@ def prepare_output_files() -> tuple[str | None, str | None]:
     return validated_events_path, validated_results_path


-def record_events(output_path: Path | None, events:
+def record_events(output_path: Path | None, events: list[str] | BaseModel | Any):
     if not output_path:
         return

@@ -88,7 +90,7 @@ def record_events(output_path: Path | None, events: Union[list[str], BaseModel,

 ### LLM Configuration

-LLMProvider = Literal["openai", "google", "openrouter", "xai"]
+LLMProvider = Literal["openai", "google", "openrouter", "xai", "vertexai"]
 LLMUtilsNode = Literal["outputter", "hopper"]
 AgentNode = Literal["planner", "orchestrator", "cortex", "executor"]
 AgentNodeWithFallback = Literal["cortex"]
@@ -98,6 +100,17 @@ DEFAULT_LLM_CONFIG_FILENAME = "llm-config.defaults.jsonc"
 OVERRIDE_LLM_CONFIG_FILENAME = "llm-config.override.jsonc"


+def validate_vertex_ai_credentials():
+    try:
+        _, project = google.auth.default()
+        if not project:
+            raise Exception("VertexAI requires a Google Cloud project to be set.")
+    except DefaultCredentialsError as e:
+        raise Exception(
+            f"VertexAI requires valid Google Application Default Credentials (ADC): {e}"
+        )
+
+
 class LLM(BaseModel):
     provider: LLMProvider
     model: str
@@ -110,6 +123,8 @@ class LLM(BaseModel):
             case "google":
                 if not settings.GOOGLE_API_KEY:
                     raise Exception(f"{name} requires GOOGLE_API_KEY in .env")
+            case "vertexai":
+                validate_vertex_ai_credentials()
             case "openrouter":
                 if not settings.OPEN_ROUTER_API_KEY:
                     raise Exception(f"{name} requires OPEN_ROUTER_API_KEY in .env")
@@ -170,7 +185,7 @@ def get_default_llm_config() -> LLMConfig:
     try:
         if not os.path.exists(ROOT_DIR / DEFAULT_LLM_CONFIG_FILENAME):
             raise Exception("Default llm config not found")
-        with open(ROOT_DIR / DEFAULT_LLM_CONFIG_FILENAME
+        with open(ROOT_DIR / DEFAULT_LLM_CONFIG_FILENAME) as f:
             default_config_dict = load_jsonc(f)
         return LLMConfig.model_validate(default_config_dict["default"])
     except Exception as e:
@@ -211,7 +226,7 @@ def parse_llm_config() -> LLMConfig:
     override_config_dict = {}
     if os.path.exists(ROOT_DIR / OVERRIDE_LLM_CONFIG_FILENAME):
         logger.info("Loading custom llm config...")
-        with open(ROOT_DIR / OVERRIDE_LLM_CONFIG_FILENAME
+        with open(ROOT_DIR / OVERRIDE_LLM_CONFIG_FILENAME) as f:
             override_config_dict = load_jsonc(f)
     else:
         logger.warning("Custom llm config not found, loading default config")
@@ -237,7 +252,7 @@ def initialize_llm_config() -> LLMConfig:

 class OutputConfig(BaseModel):
     structured_output: Annotated[
-
+        type[BaseModel] | dict | None,
         Field(
             default=None,
             description=(
@@ -247,7 +262,7 @@ class OutputConfig(BaseModel):
         ),
     ]
     output_description: Annotated[
-
+        str | None,
         Field(
             default=None,
             description=(
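The new `vertexai` provider is validated through Application Default Credentials instead of an API key in `.env`. A hedged sketch of checking ADC the same way, useful for verifying a machine before pointing `llm-config.override.jsonc` at `vertexai` (credentials are typically set up with `gcloud auth application-default login` or the `GOOGLE_APPLICATION_CREDENTIALS` variable):

```python
import google.auth
from google.auth.exceptions import DefaultCredentialsError


def adc_status() -> str:
    """Report whether Application Default Credentials resolve to a usable project."""
    try:
        _credentials, project = google.auth.default()
    except DefaultCredentialsError as e:
        return f"no ADC found: {e}"
    if not project:
        return "ADC found, but no Google Cloud project is set"
    return f"ADC OK for project {project}"


if __name__ == "__main__":
    print(adc_status())
```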
minitap/mobile_use/context.py
CHANGED
@@ -6,12 +6,11 @@ Uses ContextVar to avoid prop drilling and maintain clean function signatures.

 from enum import Enum
 from pathlib import Path
-from typing import Optional

 from adbutils import AdbClient
 from openai import BaseModel
 from pydantic import ConfigDict
-from
+from typing import Literal

 from minitap.mobile_use.clients.device_hardware_client import DeviceHardwareClient
 from minitap.mobile_use.clients.screen_api_client import ScreenApiClient
@@ -56,8 +55,8 @@ class MobileUseContext(BaseModel):
     hw_bridge_client: DeviceHardwareClient
     screen_api_client: ScreenApiClient
     llm_config: LLMConfig
-    adb_client:
-    execution_setup:
+    adb_client: AdbClient | None = None
+    execution_setup: ExecutionSetup | None = None

     def get_adb_client(self) -> AdbClient:
         if self.adb_client is None:
minitap/mobile_use/controllers/mobile_command_controller.py
CHANGED
@@ -1,6 +1,6 @@
 import uuid
 from enum import Enum
-from typing import Annotated, Literal
+from typing import Annotated, Literal

 import yaml
 from langgraph.types import Command
@@ -43,7 +43,7 @@ class RunFlowRequest(BaseModel):
     dry_run: bool = Field(default=False, alias="dryRun")


-def run_flow(ctx: MobileUseContext, flow_steps: list, dry_run: bool = False) ->
+def run_flow(ctx: MobileUseContext, flow_steps: list, dry_run: bool = False) -> dict | None:
     """
     Run a flow i.e, a sequence of commands.
     Returns None on success, or the response body of the failed command.
@@ -137,20 +137,20 @@ class SelectorRequestWithPercentages(BaseModel):
         return {"point": self.percentages.to_str()}


-SelectorRequest =
-IdSelectorRequest
-SelectorRequestWithCoordinates
-SelectorRequestWithPercentages
-TextSelectorRequest
-IdWithTextSelectorRequest
-
+SelectorRequest = (
+    IdSelectorRequest
+    | SelectorRequestWithCoordinates
+    | SelectorRequestWithPercentages
+    | TextSelectorRequest
+    | IdWithTextSelectorRequest
+)


 def tap(
     ctx: MobileUseContext,
     selector_request: SelectorRequest,
     dry_run: bool = False,
-    index:
+    index: int | None = None,
 ):
     """
     Tap on a selector.
@@ -171,7 +171,7 @@ def long_press_on(
     ctx: MobileUseContext,
     selector_request: SelectorRequest,
     dry_run: bool = False,
-    index:
+    index: int | None = None,
 ):
     long_press_on_body = selector_request.to_dict()
     if not long_press_on_body:
@@ -211,7 +211,7 @@ SwipeDirection = Annotated[
 class SwipeRequest(BaseModel):
     model_config = ConfigDict(extra="forbid")
     swipe_mode: SwipeStartEndCoordinatesRequest | SwipeStartEndPercentagesRequest | SwipeDirection
-    duration:
+    duration: int | None = None  # in ms, default is 400ms

     def to_dict(self):
         res = {}
@@ -257,7 +257,7 @@ def paste_text(ctx: MobileUseContext, dry_run: bool = False):
     return run_flow(ctx, ["pasteText"], dry_run=dry_run)


-def erase_text(ctx: MobileUseContext, nb_chars:
+def erase_text(ctx: MobileUseContext, nb_chars: int | None = None, dry_run: bool = False):
     """
     Removes characters from the currently selected textfield (if any)
     Removes 50 characters if nb_chars is not specified.
@@ -275,7 +275,7 @@ def launch_app(ctx: MobileUseContext, package_name: str, dry_run: bool = False):
     return run_flow_with_wait_for_animation_to_end(ctx, flow_input, dry_run=dry_run)


-def stop_app(ctx: MobileUseContext, package_name:
+def stop_app(ctx: MobileUseContext, package_name: str | None = None, dry_run: bool = False):
     if package_name is None:
         flow_input = ["stopApp"]
     else:
@@ -311,13 +311,13 @@ def press_key(ctx: MobileUseContext, key: Key, dry_run: bool = False):


 class WaitTimeout(Enum):
-    SHORT = 500
-    MEDIUM = 1000
-    LONG = 5000
+    SHORT = "500"
+    MEDIUM = "1000"
+    LONG = "5000"


 def wait_for_animation_to_end(
-    ctx: MobileUseContext, timeout:
+    ctx: MobileUseContext, timeout: WaitTimeout | None = None, dry_run: bool = False
 ):
     if timeout is None:
         return run_flow(ctx, ["waitForAnimationToEnd"], dry_run=dry_run)
@@ -327,7 +327,7 @@ def wait_for_animation_to_end(
 def run_flow_with_wait_for_animation_to_end(
     ctx: MobileUseContext, base_flow: list, dry_run: bool = False
 ):
-    base_flow.append({"waitForAnimationToEnd": {"timeout": WaitTimeout.MEDIUM.value}})
+    base_flow.append({"waitForAnimationToEnd": {"timeout": int(WaitTimeout.MEDIUM.value)}})
     return run_flow(ctx, base_flow, dry_run=dry_run)


@@ -362,15 +362,27 @@ if __name__ == "__main__":
         agents_thoughts=[],
     )

-    from minitap.mobile_use.tools.mobile.input_text import get_input_text_tool
-
-    input_resource_id = "com.google.android.apps.nexuslauncher:id/search_container_hotseat"
-    command_output: Command = get_input_text_tool(ctx=ctx).invoke(
+    # from minitap.mobile_use.tools.mobile.input_text import get_input_text_tool
+
+    # input_resource_id = "com.google.android.apps.nexuslauncher:id/search_container_hotseat"
+    # command_output: Command = get_input_text_tool(ctx=ctx).invoke(
+    #     {
+    #         "tool_call_id": uuid.uuid4().hex,
+    #         "agent_thought": "",
+    #         "text_input_resource_id": input_resource_id,
+    #         "text": "Hello World",
+    #         "state": dummy_state,
+    #         "executor_metadata": None,
+    #     }
+    # )
+    from minitap.mobile_use.tools.mobile.clear_text import get_clear_text_tool
+
+    input_resource_id = "com.google.android.apps.nexuslauncher:id/input"
+    command_output: Command = get_clear_text_tool(ctx=ctx).invoke(
         {
             "tool_call_id": uuid.uuid4().hex,
             "agent_thought": "",
             "text_input_resource_id": input_resource_id,
-            "text": "Hello World",
             "state": dummy_state,
             "executor_metadata": None,
         }
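`WaitTimeout` now stores its durations as strings, so any call site that needs a number has to cast, as `run_flow_with_wait_for_animation_to_end` does with `int(WaitTimeout.MEDIUM.value)`. A tiny self-contained sketch of that pattern:

```python
from enum import Enum


class WaitTimeout(Enum):
    # Values are strings in 2.2.0; call sites cast when a number is required.
    SHORT = "500"
    MEDIUM = "1000"
    LONG = "5000"


def wait_step(timeout: WaitTimeout) -> dict:
    # Same shape as the flow step appended by run_flow_with_wait_for_animation_to_end.
    return {"waitForAnimationToEnd": {"timeout": int(timeout.value)}}


assert wait_step(WaitTimeout.MEDIUM) == {"waitForAnimationToEnd": {"timeout": 1000}}
```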
minitap/mobile_use/controllers/platform_specific_commands_controller.py
CHANGED
@@ -1,6 +1,5 @@
 from datetime import date
 import json
-from typing import Optional

 from adbutils import AdbDevice
 from minitap.mobile_use.utils.logger import MobileUseLogger
@@ -20,8 +19,8 @@ def get_adb_device(ctx: MobileUseContext) -> AdbDevice:


 def get_first_device(
-    logger:
-) -> tuple[
+    logger: MobileUseLogger | None = None,
+) -> tuple[str | None, DevicePlatform | None]:
     """Gets the first available device."""
     try:
         android_output = run_shell_command_on_host("adb devices")
@@ -50,7 +49,7 @@ def get_first_device(
         return None, None


-def get_focused_app_info(ctx: MobileUseContext) ->
+def get_focused_app_info(ctx: MobileUseContext) -> str | None:
     if ctx.device.mobile_platform == DevicePlatform.IOS:
         return None
     device = get_adb_device(ctx)
minitap/mobile_use/graph/graph.py
CHANGED
@@ -6,6 +6,7 @@ from langchain_core.messages (
 from langgraph.constants import END, START
 from langgraph.graph import StateGraph
 from langgraph.graph.state import CompiledStateGraph
+
 from minitap.mobile_use.agents.contextor.contextor import ContextorNode
 from minitap.mobile_use.agents.cortex.cortex import CortexNode
 from minitap.mobile_use.agents.executor.executor import ExecutorNode
minitap/mobile_use/graph/state.py
CHANGED
@@ -1,7 +1,7 @@
 from langchain_core.messages import AIMessage, AnyMessage
 from langgraph.graph import add_messages
 from langgraph.prebuilt.chat_agent_executor import AgentStatePydantic
-from
+from typing import Annotated

 from minitap.mobile_use.agents.planner.types import Subgoal
 from minitap.mobile_use.config import AgentNode
@@ -24,16 +24,16 @@ class State(AgentStatePydantic):
     subgoal_plan: Annotated[list[Subgoal], "The current plan, made of subgoals"]

     # contextor related keys
-    latest_screenshot_base64: Annotated[
+    latest_screenshot_base64: Annotated[str | None, "Latest screenshot of the device", take_last]
     latest_ui_hierarchy: Annotated[
-
+        list[dict] | None, "Latest UI hierarchy of the device", take_last
     ]
-    focused_app_info: Annotated[
-    device_date: Annotated[
+    focused_app_info: Annotated[str | None, "Focused app info", take_last]
+    device_date: Annotated[str | None, "Date of the device", take_last]

     # cortex related keys
     structured_decisions: Annotated[
-
+        str | None,
         "Structured decisions made by the cortex, for the executor to follow",
         take_last,
     ]
@@ -45,7 +45,7 @@ class State(AgentStatePydantic):

     # executor related keys
     executor_messages: Annotated[list[AnyMessage], "Sequential Executor messages", add_messages]
-    cortex_last_thought: Annotated[
+    cortex_last_thought: Annotated[str | None, "Last thought of the cortex for the executor"]

     # common keys
     agents_thoughts: Annotated[
@@ -58,13 +58,13 @@ class State(AgentStatePydantic):
         self,
         ctx: MobileUseContext,
         update: dict,
-        agent:
+        agent: AgentNode | None = None,
     ):
         """
         Sanitizes the state update to ensure it is valid and apply side effect logic where required.
         The agent is required if the update contains the "agents_thoughts" key.
         """
-        updated_agents_thoughts:
+        updated_agents_thoughts: str | list[str] | None = update.get("agents_thoughts", None)
         if updated_agents_thoughts is not None:
             if isinstance(updated_agents_thoughts, str):
                 updated_agents_thoughts = [updated_agents_thoughts]