minitap-mobile-use: minitap_mobile_use-2.3.0-py3-none-any.whl → minitap_mobile_use-2.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of minitap-mobile-use might be problematic. Click here for more details.
- minitap/mobile_use/agents/contextor/contextor.py +2 -2
- minitap/mobile_use/agents/cortex/cortex.md +49 -8
- minitap/mobile_use/agents/cortex/cortex.py +8 -4
- minitap/mobile_use/agents/executor/executor.md +14 -11
- minitap/mobile_use/agents/executor/executor.py +6 -5
- minitap/mobile_use/agents/hopper/hopper.py +6 -3
- minitap/mobile_use/agents/orchestrator/orchestrator.py +26 -11
- minitap/mobile_use/agents/outputter/outputter.py +6 -3
- minitap/mobile_use/agents/planner/planner.md +20 -22
- minitap/mobile_use/agents/planner/planner.py +10 -7
- minitap/mobile_use/agents/planner/types.py +4 -2
- minitap/mobile_use/agents/planner/utils.py +14 -0
- minitap/mobile_use/agents/summarizer/summarizer.py +2 -2
- minitap/mobile_use/config.py +6 -1
- minitap/mobile_use/context.py +13 -3
- minitap/mobile_use/controllers/mobile_command_controller.py +1 -14
- minitap/mobile_use/graph/state.py +7 -3
- minitap/mobile_use/sdk/agent.py +188 -23
- minitap/mobile_use/sdk/examples/README.md +19 -1
- minitap/mobile_use/sdk/examples/platform_manual_task_example.py +65 -0
- minitap/mobile_use/sdk/examples/platform_minimal_example.py +46 -0
- minitap/mobile_use/sdk/services/platform.py +307 -0
- minitap/mobile_use/sdk/types/__init__.py +16 -14
- minitap/mobile_use/sdk/types/exceptions.py +27 -0
- minitap/mobile_use/sdk/types/platform.py +127 -0
- minitap/mobile_use/sdk/types/task.py +78 -17
- minitap/mobile_use/servers/device_hardware_bridge.py +1 -1
- minitap/mobile_use/servers/stop_servers.py +11 -12
- minitap/mobile_use/services/llm.py +89 -5
- minitap/mobile_use/tools/index.py +0 -6
- minitap/mobile_use/tools/mobile/back.py +3 -3
- minitap/mobile_use/tools/mobile/clear_text.py +24 -43
- minitap/mobile_use/tools/mobile/erase_one_char.py +5 -4
- minitap/mobile_use/tools/mobile/glimpse_screen.py +11 -7
- minitap/mobile_use/tools/mobile/input_text.py +21 -51
- minitap/mobile_use/tools/mobile/launch_app.py +54 -22
- minitap/mobile_use/tools/mobile/long_press_on.py +15 -8
- minitap/mobile_use/tools/mobile/open_link.py +15 -8
- minitap/mobile_use/tools/mobile/press_key.py +15 -8
- minitap/mobile_use/tools/mobile/stop_app.py +14 -8
- minitap/mobile_use/tools/mobile/swipe.py +11 -5
- minitap/mobile_use/tools/mobile/tap.py +103 -21
- minitap/mobile_use/tools/mobile/wait_for_animation_to_end.py +3 -3
- minitap/mobile_use/tools/test_utils.py +104 -78
- minitap/mobile_use/tools/types.py +35 -0
- minitap/mobile_use/tools/utils.py +51 -48
- minitap/mobile_use/utils/recorder.py +1 -1
- minitap/mobile_use/utils/ui_hierarchy.py +9 -2
- {minitap_mobile_use-2.3.0.dist-info → minitap_mobile_use-2.5.0.dist-info}/METADATA +3 -1
- minitap_mobile_use-2.5.0.dist-info/RECORD +100 -0
- minitap/mobile_use/tools/mobile/copy_text_from.py +0 -75
- minitap/mobile_use/tools/mobile/find_packages.py +0 -69
- minitap/mobile_use/tools/mobile/paste_text.py +0 -88
- minitap_mobile_use-2.3.0.dist-info/RECORD +0 -98
- {minitap_mobile_use-2.3.0.dist-info → minitap_mobile_use-2.5.0.dist-info}/WHEEL +0 -0
- {minitap_mobile_use-2.3.0.dist-info → minitap_mobile_use-2.5.0.dist-info}/entry_points.txt +0 -0
|
@@ -70,18 +70,17 @@ def stop_process_gracefully(process: psutil.Process, timeout: int = 5) -> bool:
|
|
|
70
70
|
return False
|
|
71
71
|
|
|
72
72
|
|
|
73
|
-
def
|
|
73
|
+
def check_service_running(port: int, service_name: str) -> bool:
|
|
74
74
|
try:
|
|
75
75
|
if port == server_settings.DEVICE_SCREEN_API_PORT:
|
|
76
|
-
|
|
76
|
+
requests.get(f"http://localhost:{port}/health", timeout=2)
|
|
77
77
|
elif port == DEVICE_HARDWARE_BRIDGE_PORT:
|
|
78
|
-
|
|
78
|
+
requests.get(f"http://localhost:{port}/api/banner-message", timeout=2)
|
|
79
79
|
else:
|
|
80
80
|
return False
|
|
81
81
|
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
return True
|
|
82
|
+
logger.debug(f"{service_name} is still responding on port {port}")
|
|
83
|
+
return True
|
|
85
84
|
except requests.exceptions.RequestException:
|
|
86
85
|
pass
|
|
87
86
|
|
|
@@ -92,7 +91,7 @@ def stop_device_screen_api() -> bool:
|
|
|
92
91
|
logger.info("Stopping Device Screen API...")
|
|
93
92
|
api_port = server_settings.DEVICE_SCREEN_API_PORT
|
|
94
93
|
|
|
95
|
-
if not
|
|
94
|
+
if not check_service_running(api_port, "Device Screen API"):
|
|
96
95
|
logger.success("Device Screen API is not running")
|
|
97
96
|
return True
|
|
98
97
|
|
|
@@ -109,7 +108,7 @@ def stop_device_screen_api() -> bool:
|
|
|
109
108
|
logger.warning("No Device Screen API processes found, but service is still responding")
|
|
110
109
|
# Still try to verify if service actually stops
|
|
111
110
|
time.sleep(1)
|
|
112
|
-
if not
|
|
111
|
+
if not check_service_running(api_port, "Device Screen API"):
|
|
113
112
|
logger.success("Device Screen API stopped successfully (was orphaned)")
|
|
114
113
|
return True
|
|
115
114
|
return False
|
|
@@ -120,7 +119,7 @@ def stop_device_screen_api() -> bool:
|
|
|
120
119
|
|
|
121
120
|
# Verify service is stopped
|
|
122
121
|
time.sleep(1)
|
|
123
|
-
if
|
|
122
|
+
if check_service_running(api_port, "Device Screen API"):
|
|
124
123
|
logger.error("Device Screen API is still running after stop attempt")
|
|
125
124
|
return False
|
|
126
125
|
|
|
@@ -131,7 +130,7 @@ def stop_device_screen_api() -> bool:
|
|
|
131
130
|
def stop_device_hardware_bridge() -> bool:
|
|
132
131
|
logger.info("Stopping Device Hardware Bridge...")
|
|
133
132
|
|
|
134
|
-
if not
|
|
133
|
+
if not check_service_running(DEVICE_HARDWARE_BRIDGE_PORT, "Maestro Studio"):
|
|
135
134
|
logger.success("Device Hardware Bridge is not running")
|
|
136
135
|
return True
|
|
137
136
|
|
|
@@ -145,7 +144,7 @@ def stop_device_hardware_bridge() -> bool:
|
|
|
145
144
|
logger.warning("No Device Hardware Bridge processes found, but service is still responding")
|
|
146
145
|
# Still try to verify if service actually stops
|
|
147
146
|
time.sleep(1)
|
|
148
|
-
if not
|
|
147
|
+
if not check_service_running(DEVICE_HARDWARE_BRIDGE_PORT, "Maestro Studio"):
|
|
149
148
|
logger.success("Device Hardware Bridge stopped successfully (was orphaned)")
|
|
150
149
|
return True
|
|
151
150
|
return False
|
|
@@ -154,7 +153,7 @@ def stop_device_hardware_bridge() -> bool:
|
|
|
154
153
|
stop_process_gracefully(proc)
|
|
155
154
|
|
|
156
155
|
time.sleep(1)
|
|
157
|
-
if
|
|
156
|
+
if check_service_running(DEVICE_HARDWARE_BRIDGE_PORT, "Maestro Studio"):
|
|
158
157
|
logger.error("Device Hardware Bridge is still running after stop attempt")
|
|
159
158
|
return False
|
|
160
159
|
|
|
@@ -1,11 +1,13 @@
|
|
|
1
|
+
import asyncio
|
|
1
2
|
import logging
|
|
2
|
-
from collections.abc import Awaitable, Callable
|
|
3
|
-
from typing import Literal, TypeVar, overload
|
|
3
|
+
from collections.abc import Awaitable, Callable, Coroutine
|
|
4
|
+
from typing import Any, Literal, TypeVar, overload
|
|
4
5
|
|
|
5
6
|
from langchain_core.language_models.chat_models import BaseChatModel
|
|
6
7
|
from langchain_google_genai import ChatGoogleGenerativeAI
|
|
7
8
|
from langchain_google_vertexai import ChatVertexAI
|
|
8
9
|
from langchain_openai import ChatOpenAI
|
|
10
|
+
from pydantic import SecretStr
|
|
9
11
|
|
|
10
12
|
from minitap.mobile_use.config import (
|
|
11
13
|
AgentNode,
|
|
@@ -15,8 +17,79 @@ from minitap.mobile_use.config import (
|
|
|
15
17
|
settings,
|
|
16
18
|
)
|
|
17
19
|
from minitap.mobile_use.context import MobileUseContext
|
|
20
|
+
from minitap.mobile_use.utils.logger import get_logger
|
|
18
21
|
|
|
19
|
-
|
|
22
|
+
# Logger for internal messages (ex: fallback)
|
|
23
|
+
llm_logger = logging.getLogger(__name__)
|
|
24
|
+
# Logger for user messages
|
|
25
|
+
user_messages_logger = get_logger(__name__)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
async def invoke_llm_with_timeout_message[T](
|
|
29
|
+
llm_call: Coroutine[Any, Any, T],
|
|
30
|
+
agent_name: str,
|
|
31
|
+
timeout_seconds: int = 10,
|
|
32
|
+
) -> T:
|
|
33
|
+
"""
|
|
34
|
+
Send a LLM call and display a timeout message if it takes too long.
|
|
35
|
+
|
|
36
|
+
Args:
|
|
37
|
+
llm_call: The coroutine of the LLM call to execute.
|
|
38
|
+
agent_name: The name of the agent making the call (for the message).
|
|
39
|
+
timeout_seconds: The delay in seconds before displaying the message.
|
|
40
|
+
|
|
41
|
+
Returns:
|
|
42
|
+
The result of the LLM call.
|
|
43
|
+
"""
|
|
44
|
+
llm_task = asyncio.create_task(llm_call)
|
|
45
|
+
waiter_task = asyncio.create_task(asyncio.sleep(timeout_seconds))
|
|
46
|
+
|
|
47
|
+
done, _ = await asyncio.wait({llm_task, waiter_task}, return_when=asyncio.FIRST_COMPLETED)
|
|
48
|
+
|
|
49
|
+
if llm_task in done:
|
|
50
|
+
# The LLM call has finished before the timeout, cancel the timer
|
|
51
|
+
waiter_task.cancel()
|
|
52
|
+
return llm_task.result()
|
|
53
|
+
else:
|
|
54
|
+
# The timeout has been reached, display the message and wait for the call to finish
|
|
55
|
+
user_messages_logger.info("Waiting for LLM call response...")
|
|
56
|
+
return await llm_task
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def get_minitap_llm(
|
|
60
|
+
trace_id: str,
|
|
61
|
+
remote_tracing: bool = False,
|
|
62
|
+
model: str = "google/gemini-2.5-pro",
|
|
63
|
+
temperature: float | None = None,
|
|
64
|
+
max_retries: int | None = None,
|
|
65
|
+
api_key: str | None = None,
|
|
66
|
+
) -> ChatOpenAI:
|
|
67
|
+
if api_key:
|
|
68
|
+
effective_api_key = SecretStr(api_key)
|
|
69
|
+
elif settings.MINITAP_API_KEY:
|
|
70
|
+
effective_api_key = settings.MINITAP_API_KEY
|
|
71
|
+
else:
|
|
72
|
+
raise ValueError("MINITAP_API_KEY must be provided or set in environment")
|
|
73
|
+
|
|
74
|
+
if settings.MINITAP_API_BASE_URL is None:
|
|
75
|
+
raise ValueError("MINITAP_API_BASE_URL must be set in environment")
|
|
76
|
+
|
|
77
|
+
llm_base_url = f"{settings.MINITAP_API_BASE_URL}/api/v1"
|
|
78
|
+
|
|
79
|
+
if max_retries is None and model.startswith("google/"):
|
|
80
|
+
max_retries = 2
|
|
81
|
+
client = ChatOpenAI(
|
|
82
|
+
model=model,
|
|
83
|
+
temperature=temperature,
|
|
84
|
+
max_retries=max_retries,
|
|
85
|
+
api_key=effective_api_key,
|
|
86
|
+
base_url=llm_base_url,
|
|
87
|
+
default_query={
|
|
88
|
+
"sessionId": trace_id,
|
|
89
|
+
"traceOnlyUsage": remote_tracing,
|
|
90
|
+
},
|
|
91
|
+
)
|
|
92
|
+
return client
|
|
20
93
|
|
|
21
94
|
|
|
22
95
|
def get_google_llm(
|
|
@@ -139,6 +212,17 @@ def get_llm(
|
|
|
139
212
|
return get_openrouter_llm(llm.model, temperature)
|
|
140
213
|
elif llm.provider == "xai":
|
|
141
214
|
return get_grok_llm(llm.model, temperature)
|
|
215
|
+
elif llm.provider == "minitap":
|
|
216
|
+
remote_tracing = False
|
|
217
|
+
if ctx.execution_setup:
|
|
218
|
+
remote_tracing = ctx.execution_setup.enable_remote_tracing
|
|
219
|
+
return get_minitap_llm(
|
|
220
|
+
trace_id=ctx.trace_id,
|
|
221
|
+
remote_tracing=remote_tracing,
|
|
222
|
+
model=llm.model,
|
|
223
|
+
temperature=temperature,
|
|
224
|
+
api_key=ctx.minitap_api_key,
|
|
225
|
+
)
|
|
142
226
|
else:
|
|
143
227
|
raise ValueError(f"Unsupported provider: {llm.provider}")
|
|
144
228
|
|
|
@@ -154,9 +238,9 @@ async def with_fallback(
|
|
|
154
238
|
try:
|
|
155
239
|
result = await main_call()
|
|
156
240
|
if result is None and none_should_fallback:
|
|
157
|
-
|
|
241
|
+
llm_logger.warning("Main LLM inference returned None. Falling back...")
|
|
158
242
|
return await fallback_call()
|
|
159
243
|
return result
|
|
160
244
|
except Exception as e:
|
|
161
|
-
|
|
245
|
+
llm_logger.warning(f"❗ Main LLM inference failed: {e}. Falling back...")
|
|
162
246
|
return await fallback_call()
|
|
@@ -3,15 +3,12 @@ from langchain_core.tools import BaseTool
|
|
|
3
3
|
from minitap.mobile_use.context import MobileUseContext
|
|
4
4
|
from minitap.mobile_use.tools.mobile.back import back_wrapper
|
|
5
5
|
from minitap.mobile_use.tools.mobile.clear_text import clear_text_wrapper
|
|
6
|
-
from minitap.mobile_use.tools.mobile.copy_text_from import copy_text_from_wrapper
|
|
7
6
|
from minitap.mobile_use.tools.mobile.erase_one_char import erase_one_char_wrapper
|
|
8
|
-
from minitap.mobile_use.tools.mobile.find_packages import find_packages_wrapper
|
|
9
7
|
from minitap.mobile_use.tools.mobile.glimpse_screen import glimpse_screen_wrapper
|
|
10
8
|
from minitap.mobile_use.tools.mobile.input_text import input_text_wrapper
|
|
11
9
|
from minitap.mobile_use.tools.mobile.launch_app import launch_app_wrapper
|
|
12
10
|
from minitap.mobile_use.tools.mobile.long_press_on import long_press_on_wrapper
|
|
13
11
|
from minitap.mobile_use.tools.mobile.open_link import open_link_wrapper
|
|
14
|
-
from minitap.mobile_use.tools.mobile.paste_text import paste_text_wrapper
|
|
15
12
|
from minitap.mobile_use.tools.mobile.press_key import press_key_wrapper
|
|
16
13
|
from minitap.mobile_use.tools.mobile.stop_app import stop_app_wrapper
|
|
17
14
|
from minitap.mobile_use.tools.mobile.swipe import swipe_wrapper
|
|
@@ -28,13 +25,10 @@ EXECUTOR_WRAPPERS_TOOLS = [
|
|
|
28
25
|
long_press_on_wrapper,
|
|
29
26
|
swipe_wrapper,
|
|
30
27
|
glimpse_screen_wrapper,
|
|
31
|
-
copy_text_from_wrapper,
|
|
32
28
|
input_text_wrapper,
|
|
33
29
|
erase_one_char_wrapper,
|
|
34
|
-
find_packages_wrapper,
|
|
35
30
|
launch_app_wrapper,
|
|
36
31
|
stop_app_wrapper,
|
|
37
|
-
paste_text_wrapper,
|
|
38
32
|
clear_text_wrapper,
|
|
39
33
|
press_key_wrapper,
|
|
40
34
|
wait_for_animation_to_end_wrapper,
|
|
@@ -13,11 +13,11 @@ from langgraph.prebuilt import InjectedState
|
|
|
13
13
|
|
|
14
14
|
def get_back_tool(ctx: MobileUseContext):
|
|
15
15
|
@tool
|
|
16
|
-
def back(
|
|
16
|
+
async def back(
|
|
17
17
|
tool_call_id: Annotated[str, InjectedToolCallId],
|
|
18
18
|
state: Annotated[State, InjectedState],
|
|
19
19
|
agent_thought: str,
|
|
20
|
-
):
|
|
20
|
+
) -> Command:
|
|
21
21
|
"""Navigates to the previous screen. (Only works on Android for the moment)"""
|
|
22
22
|
output = back_controller(ctx=ctx)
|
|
23
23
|
has_failed = output is not None
|
|
@@ -28,7 +28,7 @@ def get_back_tool(ctx: MobileUseContext):
|
|
|
28
28
|
status="error" if has_failed else "success",
|
|
29
29
|
)
|
|
30
30
|
return Command(
|
|
31
|
-
update=state.
|
|
31
|
+
update=await state.asanitize_update(
|
|
32
32
|
ctx=ctx,
|
|
33
33
|
update={
|
|
34
34
|
"agents_thoughts": [agent_thought],
|
|
@@ -12,18 +12,13 @@ from minitap.mobile_use.context import MobileUseContext
|
|
|
12
12
|
from minitap.mobile_use.controllers.mobile_command_controller import (
|
|
13
13
|
erase_text as erase_text_controller,
|
|
14
14
|
)
|
|
15
|
-
from minitap.mobile_use.controllers.mobile_command_controller import
|
|
16
|
-
get_screen_data,
|
|
17
|
-
)
|
|
15
|
+
from minitap.mobile_use.controllers.mobile_command_controller import get_screen_data
|
|
18
16
|
from minitap.mobile_use.graph.state import State
|
|
19
17
|
from minitap.mobile_use.tools.tool_wrapper import ToolWrapper
|
|
20
|
-
from minitap.mobile_use.tools.
|
|
21
|
-
|
|
22
|
-
move_cursor_to_end_if_bounds,
|
|
23
|
-
)
|
|
18
|
+
from minitap.mobile_use.tools.types import Target
|
|
19
|
+
from minitap.mobile_use.tools.utils import focus_element_if_needed, move_cursor_to_end_if_bounds
|
|
24
20
|
from minitap.mobile_use.utils.logger import get_logger
|
|
25
21
|
from minitap.mobile_use.utils.ui_hierarchy import (
|
|
26
|
-
ElementBounds,
|
|
27
22
|
find_element_by_resource_id,
|
|
28
23
|
get_element_text,
|
|
29
24
|
text_input_is_empty,
|
|
@@ -90,24 +85,18 @@ class TextClearer:
|
|
|
90
85
|
|
|
91
86
|
def _prepare_element_for_clearing(
|
|
92
87
|
self,
|
|
93
|
-
|
|
94
|
-
text_input_coordinates: ElementBounds | None,
|
|
95
|
-
text_input_text: str | None,
|
|
88
|
+
target: Target,
|
|
96
89
|
) -> bool:
|
|
97
90
|
if not focus_element_if_needed(
|
|
98
91
|
ctx=self.ctx,
|
|
99
|
-
|
|
100
|
-
input_coordinates=text_input_coordinates,
|
|
101
|
-
input_text=text_input_text,
|
|
92
|
+
target=target,
|
|
102
93
|
):
|
|
103
94
|
return False
|
|
104
95
|
|
|
105
96
|
move_cursor_to_end_if_bounds(
|
|
106
97
|
ctx=self.ctx,
|
|
107
98
|
state=self.state,
|
|
108
|
-
|
|
109
|
-
text_input_coordinates=text_input_coordinates,
|
|
110
|
-
text_input_text=text_input_text,
|
|
99
|
+
target=target,
|
|
111
100
|
)
|
|
112
101
|
return True
|
|
113
102
|
|
|
@@ -124,9 +113,7 @@ class TextClearer:
|
|
|
124
113
|
|
|
125
114
|
def _clear_with_retries(
|
|
126
115
|
self,
|
|
127
|
-
|
|
128
|
-
text_input_coordinates: ElementBounds | None,
|
|
129
|
-
text_input_text: str | None,
|
|
116
|
+
target: Target,
|
|
130
117
|
initial_text: str,
|
|
131
118
|
hint_text: str | None,
|
|
132
119
|
) -> tuple[bool, str | None, int]:
|
|
@@ -145,10 +132,10 @@ class TextClearer:
|
|
|
145
132
|
|
|
146
133
|
self._refresh_ui_hierarchy()
|
|
147
134
|
elt = None
|
|
148
|
-
if
|
|
135
|
+
if target.resource_id:
|
|
149
136
|
elt = find_element_by_resource_id(
|
|
150
137
|
ui_hierarchy=self.state.latest_ui_hierarchy or [],
|
|
151
|
-
resource_id=
|
|
138
|
+
resource_id=target.resource_id,
|
|
152
139
|
)
|
|
153
140
|
if elt:
|
|
154
141
|
current_text = get_element_text(elt)
|
|
@@ -159,9 +146,7 @@ class TextClearer:
|
|
|
159
146
|
move_cursor_to_end_if_bounds(
|
|
160
147
|
ctx=self.ctx,
|
|
161
148
|
state=self.state,
|
|
162
|
-
|
|
163
|
-
text_input_coordinates=text_input_coordinates,
|
|
164
|
-
text_input_text=text_input_text,
|
|
149
|
+
target=target,
|
|
165
150
|
elt=elt,
|
|
166
151
|
)
|
|
167
152
|
|
|
@@ -213,20 +198,20 @@ class TextClearer:
|
|
|
213
198
|
|
|
214
199
|
def clear_input_text(
|
|
215
200
|
self,
|
|
216
|
-
|
|
217
|
-
text_input_coordinates: ElementBounds | None,
|
|
218
|
-
text_input_text: str | None,
|
|
201
|
+
target: Target,
|
|
219
202
|
) -> ClearTextResult:
|
|
220
|
-
element, current_text, hint_text = self._get_element_info(
|
|
203
|
+
element, current_text, hint_text = self._get_element_info(
|
|
204
|
+
resource_id=target.resource_id,
|
|
205
|
+
)
|
|
221
206
|
|
|
222
207
|
if not element:
|
|
223
|
-
return self._handle_element_not_found(
|
|
208
|
+
return self._handle_element_not_found(target.resource_id, hint_text)
|
|
224
209
|
|
|
225
210
|
if not self._should_clear_text(current_text, hint_text):
|
|
226
211
|
return self._handle_no_clearing_needed(current_text, hint_text)
|
|
227
212
|
|
|
228
213
|
if not self._prepare_element_for_clearing(
|
|
229
|
-
|
|
214
|
+
target=target,
|
|
230
215
|
):
|
|
231
216
|
return self._create_result(
|
|
232
217
|
success=False,
|
|
@@ -237,9 +222,7 @@ class TextClearer:
|
|
|
237
222
|
)
|
|
238
223
|
|
|
239
224
|
success, final_text, chars_erased = self._clear_with_retries(
|
|
240
|
-
|
|
241
|
-
text_input_coordinates=text_input_coordinates,
|
|
242
|
-
text_input_text=text_input_text,
|
|
225
|
+
target=target,
|
|
243
226
|
initial_text=current_text or "",
|
|
244
227
|
hint_text=hint_text,
|
|
245
228
|
)
|
|
@@ -257,23 +240,21 @@ class TextClearer:
|
|
|
257
240
|
|
|
258
241
|
def get_clear_text_tool(ctx: MobileUseContext):
|
|
259
242
|
@tool
|
|
260
|
-
def clear_text(
|
|
243
|
+
async def clear_text(
|
|
261
244
|
tool_call_id: Annotated[str, InjectedToolCallId],
|
|
262
245
|
state: Annotated[State, InjectedState],
|
|
263
246
|
agent_thought: str,
|
|
264
|
-
|
|
265
|
-
text_input_coordinates: ElementBounds | None,
|
|
266
|
-
text_input_text: str | None,
|
|
247
|
+
target: Target,
|
|
267
248
|
):
|
|
268
249
|
"""
|
|
269
250
|
Clears all the text from the text field, by focusing it if needed.
|
|
270
251
|
"""
|
|
271
252
|
clearer = TextClearer(ctx, state)
|
|
272
253
|
result = clearer.clear_input_text(
|
|
273
|
-
|
|
254
|
+
target=target,
|
|
274
255
|
)
|
|
275
256
|
|
|
276
|
-
|
|
257
|
+
agent_outcome = (
|
|
277
258
|
clear_text_wrapper.on_failure_fn(result.error_message)
|
|
278
259
|
if not result.success
|
|
279
260
|
else clear_text_wrapper.on_success_fn(
|
|
@@ -283,16 +264,16 @@ def get_clear_text_tool(ctx: MobileUseContext):
|
|
|
283
264
|
|
|
284
265
|
tool_message = ToolMessage(
|
|
285
266
|
tool_call_id=tool_call_id,
|
|
286
|
-
content=
|
|
267
|
+
content=agent_outcome,
|
|
287
268
|
additional_kwargs={"error": result.error_message} if not result.success else {},
|
|
288
269
|
status="error" if not result.success else "success",
|
|
289
270
|
)
|
|
290
271
|
|
|
291
272
|
return Command(
|
|
292
|
-
update=state.
|
|
273
|
+
update=await state.asanitize_update(
|
|
293
274
|
ctx=ctx,
|
|
294
275
|
update={
|
|
295
|
-
"agents_thoughts": [agent_thought],
|
|
276
|
+
"agents_thoughts": [agent_thought, agent_outcome],
|
|
296
277
|
EXECUTOR_MESSAGES_KEY: [tool_message],
|
|
297
278
|
},
|
|
298
279
|
agent="executor",
|
|
@@ -1,9 +1,10 @@
|
|
|
1
|
+
from typing import Annotated
|
|
2
|
+
|
|
1
3
|
from langchain_core.messages import ToolMessage
|
|
2
4
|
from langchain_core.tools import tool
|
|
3
5
|
from langchain_core.tools.base import InjectedToolCallId
|
|
4
6
|
from langgraph.prebuilt import InjectedState
|
|
5
7
|
from langgraph.types import Command
|
|
6
|
-
from typing import Annotated
|
|
7
8
|
|
|
8
9
|
from minitap.mobile_use.constants import EXECUTOR_MESSAGES_KEY
|
|
9
10
|
from minitap.mobile_use.context import MobileUseContext
|
|
@@ -16,11 +17,11 @@ from minitap.mobile_use.tools.tool_wrapper import ToolWrapper
|
|
|
16
17
|
|
|
17
18
|
def get_erase_one_char_tool(ctx: MobileUseContext):
|
|
18
19
|
@tool
|
|
19
|
-
def erase_one_char(
|
|
20
|
+
async def erase_one_char(
|
|
20
21
|
tool_call_id: Annotated[str, InjectedToolCallId],
|
|
21
22
|
state: Annotated[State, InjectedState],
|
|
22
23
|
agent_thought: str,
|
|
23
|
-
):
|
|
24
|
+
) -> Command:
|
|
24
25
|
"""
|
|
25
26
|
Erase one character from a text area.
|
|
26
27
|
It acts the same as pressing backspace a single time.
|
|
@@ -36,7 +37,7 @@ def get_erase_one_char_tool(ctx: MobileUseContext):
|
|
|
36
37
|
status="error" if has_failed else "success",
|
|
37
38
|
)
|
|
38
39
|
return Command(
|
|
39
|
-
update=state.
|
|
40
|
+
update=await state.asanitize_update(
|
|
40
41
|
ctx=ctx,
|
|
41
42
|
update={
|
|
42
43
|
"agents_thoughts": [agent_thought],
|
|
@@ -18,11 +18,11 @@ from minitap.mobile_use.utils.media import compress_base64_jpeg
|
|
|
18
18
|
|
|
19
19
|
def get_glimpse_screen_tool(ctx: MobileUseContext):
|
|
20
20
|
@tool
|
|
21
|
-
def glimpse_screen(
|
|
21
|
+
async def glimpse_screen(
|
|
22
22
|
tool_call_id: Annotated[str, InjectedToolCallId],
|
|
23
23
|
state: Annotated[State, InjectedState],
|
|
24
24
|
agent_thought: str,
|
|
25
|
-
):
|
|
25
|
+
) -> Command:
|
|
26
26
|
"""
|
|
27
27
|
Captures the current screen as an image.
|
|
28
28
|
The resulting screenshot is added to the context for the next reasoning step.
|
|
@@ -37,22 +37,26 @@ def get_glimpse_screen_tool(ctx: MobileUseContext):
|
|
|
37
37
|
output = str(e)
|
|
38
38
|
has_failed = True
|
|
39
39
|
|
|
40
|
+
agent_outcome = (
|
|
41
|
+
glimpse_screen_wrapper.on_failure_fn()
|
|
42
|
+
if has_failed
|
|
43
|
+
else glimpse_screen_wrapper.on_success_fn()
|
|
44
|
+
)
|
|
45
|
+
|
|
40
46
|
tool_message = ToolMessage(
|
|
41
47
|
tool_call_id=tool_call_id,
|
|
42
|
-
content=
|
|
43
|
-
if has_failed
|
|
44
|
-
else glimpse_screen_wrapper.on_success_fn(),
|
|
48
|
+
content=agent_outcome,
|
|
45
49
|
additional_kwargs={"error": output} if has_failed else {},
|
|
46
50
|
status="error" if has_failed else "success",
|
|
47
51
|
)
|
|
48
52
|
updates = {
|
|
49
|
-
"agents_thoughts": [agent_thought],
|
|
53
|
+
"agents_thoughts": [agent_thought, agent_outcome],
|
|
50
54
|
EXECUTOR_MESSAGES_KEY: [tool_message],
|
|
51
55
|
}
|
|
52
56
|
if compressed_image_base64:
|
|
53
57
|
updates["latest_screenshot_base64"] = compressed_image_base64
|
|
54
58
|
return Command(
|
|
55
|
-
update=state.
|
|
59
|
+
update=await state.asanitize_update(
|
|
56
60
|
ctx=ctx,
|
|
57
61
|
update=updates,
|
|
58
62
|
agent="executor",
|
|
@@ -11,21 +11,16 @@ from pydantic import BaseModel
|
|
|
11
11
|
|
|
12
12
|
from minitap.mobile_use.constants import EXECUTOR_MESSAGES_KEY
|
|
13
13
|
from minitap.mobile_use.context import MobileUseContext
|
|
14
|
-
from minitap.mobile_use.controllers.mobile_command_controller import
|
|
15
|
-
get_screen_data,
|
|
16
|
-
)
|
|
14
|
+
from minitap.mobile_use.controllers.mobile_command_controller import get_screen_data
|
|
17
15
|
from minitap.mobile_use.controllers.mobile_command_controller import (
|
|
18
16
|
input_text as input_text_controller,
|
|
19
17
|
)
|
|
20
18
|
from minitap.mobile_use.graph.state import State
|
|
21
19
|
from minitap.mobile_use.tools.tool_wrapper import ToolWrapper
|
|
20
|
+
from minitap.mobile_use.tools.types import Target
|
|
22
21
|
from minitap.mobile_use.tools.utils import focus_element_if_needed, move_cursor_to_end_if_bounds
|
|
23
22
|
from minitap.mobile_use.utils.logger import get_logger
|
|
24
|
-
from minitap.mobile_use.utils.ui_hierarchy import
|
|
25
|
-
ElementBounds,
|
|
26
|
-
find_element_by_resource_id,
|
|
27
|
-
get_element_text,
|
|
28
|
-
)
|
|
23
|
+
from minitap.mobile_use.utils.ui_hierarchy import find_element_by_resource_id, get_element_text
|
|
29
24
|
|
|
30
25
|
logger = get_logger(__name__)
|
|
31
26
|
|
|
@@ -49,14 +44,12 @@ def _controller_input_text(ctx: MobileUseContext, text: str) -> InputResult:
|
|
|
49
44
|
|
|
50
45
|
def get_input_text_tool(ctx: MobileUseContext):
|
|
51
46
|
@tool
|
|
52
|
-
def input_text(
|
|
47
|
+
async def input_text(
|
|
53
48
|
tool_call_id: Annotated[str, InjectedToolCallId],
|
|
54
49
|
state: Annotated[State, InjectedState],
|
|
55
50
|
agent_thought: str,
|
|
56
51
|
text: str,
|
|
57
|
-
|
|
58
|
-
text_input_coordinates: ElementBounds | None,
|
|
59
|
-
text_input_text: str | None,
|
|
52
|
+
target: Target,
|
|
60
53
|
):
|
|
61
54
|
"""
|
|
62
55
|
Focus a text field and type text into it.
|
|
@@ -70,17 +63,9 @@ def get_input_text_tool(ctx: MobileUseContext):
|
|
|
70
63
|
state: The state of the agent.
|
|
71
64
|
agent_thought: The thought of the agent.
|
|
72
65
|
text: The text to type.
|
|
73
|
-
|
|
74
|
-
text_input_coordinates: The bounds (ElementBounds) of the text input (if available).
|
|
75
|
-
text_input_text: The current text content of the text input (if available).
|
|
66
|
+
target: The target of the text input (if available).
|
|
76
67
|
"""
|
|
77
|
-
|
|
78
|
-
focused = focus_element_if_needed(
|
|
79
|
-
ctx=ctx,
|
|
80
|
-
input_resource_id=text_input_resource_id,
|
|
81
|
-
input_coordinates=text_input_coordinates,
|
|
82
|
-
input_text=text_input_text,
|
|
83
|
-
)
|
|
68
|
+
focused = focus_element_if_needed(ctx=ctx, target=target)
|
|
84
69
|
if not focused:
|
|
85
70
|
error_message = "Failed to focus the text input element before typing."
|
|
86
71
|
tool_message = ToolMessage(
|
|
@@ -90,7 +75,7 @@ def get_input_text_tool(ctx: MobileUseContext):
|
|
|
90
75
|
status="error",
|
|
91
76
|
)
|
|
92
77
|
return Command(
|
|
93
|
-
update=state.
|
|
78
|
+
update=await state.asanitize_update(
|
|
94
79
|
ctx=ctx,
|
|
95
80
|
update={
|
|
96
81
|
"agents_thoughts": [agent_thought, error_message],
|
|
@@ -100,40 +85,25 @@ def get_input_text_tool(ctx: MobileUseContext):
|
|
|
100
85
|
),
|
|
101
86
|
)
|
|
102
87
|
|
|
103
|
-
move_cursor_to_end_if_bounds(
|
|
104
|
-
ctx=ctx,
|
|
105
|
-
state=state,
|
|
106
|
-
text_input_resource_id=text_input_resource_id,
|
|
107
|
-
text_input_coordinates=text_input_coordinates,
|
|
108
|
-
text_input_text=text_input_text,
|
|
109
|
-
)
|
|
88
|
+
move_cursor_to_end_if_bounds(ctx=ctx, state=state, target=target)
|
|
110
89
|
|
|
111
90
|
result = _controller_input_text(ctx=ctx, text=text)
|
|
112
|
-
|
|
113
91
|
status: Literal["success", "error"] = "success" if result.ok else "error"
|
|
114
92
|
|
|
115
93
|
text_input_content = ""
|
|
116
|
-
if status == "success":
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
state.latest_ui_hierarchy
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
if not element:
|
|
127
|
-
result = InputResult(ok=False, error="Element not found")
|
|
128
|
-
|
|
129
|
-
if element:
|
|
130
|
-
text_input_content = get_element_text(element)
|
|
131
|
-
else:
|
|
132
|
-
# For elements without resource_id, skip verification and use direct message
|
|
133
|
-
pass
|
|
94
|
+
if status == "success" and target.resource_id:
|
|
95
|
+
screen_data = get_screen_data(screen_api_client=ctx.screen_api_client)
|
|
96
|
+
state.latest_ui_hierarchy = screen_data.elements
|
|
97
|
+
element = find_element_by_resource_id(
|
|
98
|
+
ui_hierarchy=state.latest_ui_hierarchy,
|
|
99
|
+
resource_id=target.resource_id,
|
|
100
|
+
index=target.resource_id_index,
|
|
101
|
+
)
|
|
102
|
+
if element:
|
|
103
|
+
text_input_content = get_element_text(element)
|
|
134
104
|
|
|
135
105
|
agent_outcome = (
|
|
136
|
-
input_text_wrapper.on_success_fn(text, text_input_content,
|
|
106
|
+
input_text_wrapper.on_success_fn(text, text_input_content, target.resource_id)
|
|
137
107
|
if result.ok
|
|
138
108
|
else input_text_wrapper.on_failure_fn(text, result.error)
|
|
139
109
|
)
|
|
@@ -146,7 +116,7 @@ def get_input_text_tool(ctx: MobileUseContext):
|
|
|
146
116
|
)
|
|
147
117
|
|
|
148
118
|
return Command(
|
|
149
|
-
update=state.
|
|
119
|
+
update=await state.asanitize_update(
|
|
150
120
|
ctx=ctx,
|
|
151
121
|
update={
|
|
152
122
|
"agents_thoughts": [agent_thought, agent_outcome],
|